# To run the experiment without the impermissible tokens, replace 'train.txt' with 'train_1.txt' (and likewise 'dev.txt' and 'test.txt' with 'dev_1.txt' and 'test_1.txt').


# Make sure the code points to your own copy of the impermissible-tokens CSV, e.g. df = pd.read_csv('C:/Users/howar/Downloads/deceptive-attention-master/deceptive-attention-master/src/classification_tasks/pytorch-pretrained-BERT/examples/impermissible.csv')


# In[1]:
from __future__ import absolute_import, division, print_function

import argparse
import csv
import logging
import os
import random
import sys

sys.path.append(os.path.join(os.getcwd(), 'pytorch-pretrained-BERT'))

import numpy as np
import pandas as pd

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from torch.nn import CrossEntropyLoss, MSELoss
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score

from pytorch_pretrained_bert_local.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert_local.modeling import BertForSequenceClassification, BertConfig
from pytorch_pretrained_bert_local.tokenization import BertTokenizer
from pytorch_pretrained_bert_local.optimization import BertAdam, WarmupLinearSchedule

logger = logging.getLogger(__name__)




# In[2]:

class InputExample(object):
    """One raw example for sequence classification, before tokenization."""

    def __init__(self, guid, text_a, text_b=None, label=None, block=None):
        """Store the fields of a single example unchanged.

        Args:
            guid: Unique id for the example.
            text_a: string. Untokenized text of the first sequence; the only
            text needed for single-sequence tasks.
            text_b: (Optional) string. Untokenized text of the second sequence,
            needed only for sequence-pair tasks.
            label: (Optional) string. Label of the example; expected for train
            and dev examples, may be omitted for test examples.
            block: (Optional) extra per-example value, kept exactly as given.
        """
        self.guid, self.text_a, self.text_b = guid, text_a, text_b
        self.label, self.block = label, block


class InputFeatures(object):
    """Container for the feature fields of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        # Values are stored exactly as passed in; nothing is copied or checked.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets.

    Subclasses override the ``get_*`` methods; the ``_read_*`` class methods
    are shared file-reading helpers.
    """

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Read a UTF-8 tab-separated file.

        Args:
            input_file: path of the file to read.
            quotechar: forwarded to ``csv.reader``.

        Returns:
            A list with one list of cell strings per row.
        """
        # The former Python 2 `unicode` branch was unreachable: this file
        # already requires Python 3 (it uses f-strings).
        with open(input_file, "r", encoding="utf-8") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

    @classmethod
    def _read_txt(cls, input_file):
        """Read a UTF-8 text file.

        Returns:
            The list of lines, each keeping its trailing newline.
        """
        # Use a context manager so the handle is closed deterministically
        # instead of being left to the garbage collector.
        with open(input_file, encoding='UTF8') as f:
            return f.readlines()

class SstWikiProcessor(DataProcessor):
    """Processor for the SST-binary + Wikipedia data set (sentence level)."""

    def get_train_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "train.txt", "train", limit)

    def get_dev_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "dev.txt", "dev", limit)

    def get_test_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "test.txt", "test", limit)

    def get_labels(self):
        """See base class."""
        return ["1", "2", "3", "4", "5"]

    def _load_split(self, data_dir, file_name, set_type, limit):
        """Read `file_name` from `data_dir` and build its examples.

        A truthy `limit` keeps only the first `limit` examples; the whole
        file is still read and converted first.
        """
        raw_lines = SstWikiProcessor._read_txt(os.path.join(data_dir, file_name))
        examples = self._create_examples(raw_lines, set_type)
        return examples[0:limit] if limit else examples

    def _create_examples(self, lines, set_type):
        """Build one `InputExample` per line.

        Each line is split at the first tab into a label and a text part;
        the text part is split on '[SEP]', with the piece before it used as
        `text_b` and the piece after it as `text_a` (both stripped).
        """
        examples = []
        for idx, raw in enumerate(lines):
            fields = raw.split('\t', 1)
            pieces = fields[1].split('[SEP]')
            sst_part = pieces[0].strip()
            wiki_part = pieces[1].strip()
            examples.append(InputExample(guid="%s-%s" % (set_type, idx),
                                         text_a=wiki_part,
                                         text_b=sst_part,
                                         label=fields[0]))
        return examples

class PronounProcessor(DataProcessor):
    """Processor for the Pronoun data set (sentence level)."""

    def get_train_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "train.txt", "train.txt.block", "train", limit)

    def get_dev_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "dev.txt", "dev.txt.block", "dev", limit)

    def get_test_examples(self, data_dir, limit=0):
        """See base class."""
        return self._load_split(data_dir, "test.txt", "test.txt.block", "test", limit)

    def get_labels(self):
        """See base class."""
        return ["1", "2", "3", "4", "5"]

    def _load_split(self, data_dir, text_name, block_name, set_type, limit):
        """Read a text file and its companion block file, then build examples.

        A truthy `limit` keeps only the first `limit` examples; both files
        are still read in full first.
        """
        text_lines = PronounProcessor._read_txt(os.path.join(data_dir, text_name))
        block_lines = PronounProcessor._read_txt(os.path.join(data_dir, block_name))
        examples = self._create_examples(text_lines, block_lines, set_type)
        return examples[0:limit] if limit else examples

    def _create_examples(self, lines, block_lines, set_type):
        """Build one `InputExample` per line that contains a tab.

        Lines without a tab are skipped. The example's guid and its entry in
        `block_lines` both use the line's position in `lines`, so skipped
        lines leave gaps in the guid numbering.
        """
        examples = []
        for idx, raw in enumerate(lines):
            fields = raw.split('\t', 1)
            if len(fields) <= 1:
                continue
            examples.append(InputExample(guid="%s-%s" % (set_type, idx),
                                         text_a=fields[1],
                                         text_b=None,
                                         label=fields[0],
                                         block=block_lines[idx]))
        return examples

# Default location of the impermissible-token CSV. The value is the path that
# used to be hard-coded in the function body; pass `impermissible_csv` to
# `convert_examples_to_features` to use a different file.
_DEFAULT_IMPERMISSIBLE_CSV = 'C:/Users/howar/Downloads/deceptive-attention-master/deceptive-attention-master/src/classification_tasks/pytorch-pretrained-BERT/examples/impermissible.csv'


def _load_impermissible_tokens(csv_path):
    """Return the entries of column '0' of the CSV at `csv_path` as a set."""
    return set(pd.read_csv(csv_path)['0'].to_list())


def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer, output_mode,
                                 impermissible_csv=_DEFAULT_IMPERMISSIBLE_CSV):
    """Convert examples into a list of padded `InputFeatures`.

    For every example the first text is tokenized and each of its tokens is
    marked 1 in `segment_ids` when its lower-cased form appears in the
    impermissible-token CSV, 0 otherwise. [CLS] and the first [SEP] get 0;
    tokens of the optional second text (and its [SEP]) get 1. All three
    sequences are zero-padded to `max_seq_length`.

    Args:
        examples: sequence of objects exposing `text_a`, `text_b`, `label`
            and `guid`.
        label_list: labels; a label's position becomes its `label_id`.
        max_seq_length: length of every produced sequence, special tokens
            included.
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        output_mode: must be "classification".
        impermissible_csv: path of a CSV whose column '0' lists the
            impermissible tokens.

    Returns:
        A list with one `InputFeatures` per example.

    Raises:
        KeyError: if `output_mode` is not "classification", or a label is
            missing from `label_list`.
    """
    label_map = {label: i for i, label in enumerate(label_list)}

    # Loaded on the first example and then reused: the CSV does not change
    # between examples, so re-reading it per example was pure overhead. A set
    # gives constant-time membership tests.
    impermissible = None

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d" % (ex_index, len(examples)))

        if impermissible is None:
            impermissible = _load_impermissible_tokens(impermissible_csv)

        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # Built only after truncation so it always has one entry per
        # surviving token of `tokens_a`. Building it earlier left it too long
        # whenever the pair truncation shortened `tokens_a`, which tripped
        # the length assertion below.
        segment_ids = [1 if token.lower() in impermissible else 0 for token in tokens_a]

        # [CLS] and the [SEP] closing the first sequence are never flagged.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] + segment_ids + [0]

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if output_mode == "classification":
            label_id = label_map[example.label]
        else:
            raise KeyError(output_mode)

        # Log the first few examples in full for a quick visual check.
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def simple_accuracy(preds, labels):
    """Return the mean of the element-wise `preds == labels` comparison."""
    matches = (preds == labels)
    return matches.mean()


def acc_and_f1(preds, labels):
    """Return accuracy, F1 and their arithmetic mean in one dict."""
    metrics = {
        "acc": simple_accuracy(preds, labels),
        "f1": f1_score(y_true=labels, y_pred=preds),
    }
    metrics["acc_and_f1"] = (metrics["acc"] + metrics["f1"]) / 2
    return metrics


def pearson_and_spearman(preds, labels):
    """Return the Pearson and Spearman correlations and their mean in one dict."""
    scores = {
        "pearson": pearsonr(preds, labels)[0],
        "spearmanr": spearmanr(preds, labels)[0],
    }
    scores["corr"] = (scores["pearson"] + scores["spearmanr"]) / 2
    return scores


def compute_metrics(input_processor_type, preds, labels):
    """Return `{"acc": ...}` for the supported processor types.

    Raises:
        KeyError: if `input_processor_type` is neither "sst-wiki" nor
            "pronoun".
    """
    assert len(preds) == len(labels)
    if input_processor_type not in ("sst-wiki", "pronoun"):
        raise KeyError(input_processor_type)
    return {"acc": simple_accuracy(preds, labels)}

def attention_regularization_loss(attention_probs_layers, 
                                    pay_attention_mask,
                                    pad_attention_mask,
                                    hammer_coeff=0.0,
                                    optimize_func='mean',
                                    debug=False):
    """Penalize the attention that the first token pays to flagged positions.

    Only the last entry of `attention_probs_layers` is used, and within it
    only the attention row of the first query position (index 0 on the third
    axis). Key positions are split into three disjoint groups: flagged
    (`pay_attention_mask` == 1), padding (`pad_attention_mask` == 0) and the
    rest; the attention mass falling in each group is summed per head.

    Args:
        attention_probs_layers: sequence of 4-D attention tensors indexed as
            [batch, head, query, key].
        pay_attention_mask: [batch, key] tensor, 1 at positions whose
            attention should be penalized.
        pad_attention_mask: [batch, key] tensor, 1 at real tokens and 0 at
            padding (it is inverted internally).
        hammer_coeff: weight of the penalty term.
        optimize_func: 'mean' averages log(1 - flagged mass) over batch and
            heads; any other value takes the minimum instead, i.e. penalizes
            the largest flagged mass.
        debug: when true, print the flagged and non-flagged masks.

    Returns:
        An 8-tuple: (penalty, mean flagged mass, mean non-flagged mass,
        per-head flagged mass, mean padding mass, max flagged mass,
        max non-flagged mass, argmax over dim 1 of the non-flagged mass).
    """
    # We are only interested in last layer, and CLS token (first token)
    attention_probs_layer = attention_probs_layers[-1][:, :, 0, :].unsqueeze(2)
    target_device = attention_probs_layer.device

    def _as_key_mask(mask):
        # [batch, key] -> [batch, 1, 1, key] float32 on the attention
        # tensor's device. Following the tensor's device (rather than
        # "is CUDA installed") keeps CPU runs on CUDA machines from mixing
        # CPU and GPU tensors.
        return mask.unsqueeze(1).unsqueeze(2).to(device=target_device, dtype=torch.float32)

    reg_attention_mask = _as_key_mask(pay_attention_mask)
    pad_attention_mask = _as_key_mask(1 - pad_attention_mask)
    # Everything that is neither flagged nor padding.
    non_reg_attention_mask = 1 - (reg_attention_mask + pad_attention_mask)

    reg_attention_maps     = attention_probs_layer * reg_attention_mask
    pad_attention_maps     = attention_probs_layer * pad_attention_mask
    non_reg_attention_maps = attention_probs_layer * non_reg_attention_mask
    if debug:
        print(f"Regularized attention mask:{reg_attention_mask}")
        print(f"Non-Regular attention mask:{non_reg_attention_mask}")

    # Sum over key positions: [batch, head, 1, key] -> [batch, head, 1]
    reg_attention_sum = torch.sum(reg_attention_maps, -1)
    pad_attention_sum = torch.sum(pad_attention_maps, -1)
    non_reg_attention_sum = torch.sum(non_reg_attention_maps, -1)

    if optimize_func == 'mean':
        hammer_reg = torch.mean( torch.log(1 - reg_attention_sum) )
    else:
        # minimize max attention_sum
        # minimize min log(1 - attention_sum)
        hammer_reg = torch.min( torch.log(1 - reg_attention_sum) )
    return - hammer_coeff * hammer_reg, torch.mean(reg_attention_sum), torch.mean(non_reg_attention_sum),reg_attention_sum, torch.mean(pad_attention_sum), torch.max(reg_attention_sum), torch.max(non_reg_attention_sum), torch.argmax(non_reg_attention_sum, dim=1)

def main(params):
    """Fine-tune a BERT sequence classifier with an attention penalty.

    The argument parser is always evaluated on an empty argument list, so
    every setting takes its default; the run is configured through `params`.
    The dev and test sets are evaluated once before training (epoch 0) and
    again after every training epoch, and the predictions are written to the
    output directory. After training the model, its config and the vocabulary
    are saved there as well.

    Args:
        params: mapping read for the keys 'output_dir' (base output path),
            'hammer_coeff' (attention-penalty weight), 'optfunc' (attention
            optimization function, also part of the output directory name)
            and 'seed' (torch random seed).

    Raises:
        ValueError: for an invalid `gradient_accumulation_steps`, when neither
            training nor evaluation is enabled, when the output directory
            already exists and is not empty, or for an unknown processor type.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='data/twitter_bert',
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--input_processor_type",
                        default="pronoun",
                        type=str,
                        required=False,
                        help="The type of processor to use for reading data.")
    parser.add_argument("--output_dir",
                        default=params['output_dir']+"/"+str(params['hammer_coeff'])+params['optfunc'],
                        type=str,
                        required=False,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=500,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=True,
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=1,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=4.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    # The default is taken from `params` so that the coefficient used to name
    # the output directory is the one actually applied to the loss. With a
    # fixed 0.0 default (and the empty argument list parsed below) the
    # attention penalty was silently disabled for every run.
    parser.add_argument('--hammer_coeff',
                        type=float,
                        default=params['hammer_coeff'],
                        help="Hammer loss coefficient")
    parser.add_argument('--att_opt_func',
                        type=str,
                        default=params['optfunc'],
                        help="Attention optimization function")
    parser.add_argument("--debug",
                        action='store_true')
    parser.add_argument("--first_run",
                        default=True,
                        action='store_true')
    parser.add_argument("--name",
                        type=str)
    # An explicit empty list: the process command line is never consulted,
    # so every option above keeps its default.
    args = parser.parse_args(args=[])

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
    
    base_labels = {}
    print(f"FIRST RUN: {args.first_run}")
    if not args.first_run:
        for typ in ["dev", "test"]:
            # Context manager so the label file is closed once it is read.
            with open("{}_base_labels_{}.txt".format(args.name, typ), 'r') as base_labels_file:
                base_labels_content = base_labels_file.readlines()
            base_labels[typ] = [int(label.strip()) for label in base_labels_content]
            
    debug = args.debug
    if debug:
        # Tiny settings for a quick smoke run.
        args.train_batch_size = 2
        args.eval_batch_size = 2
        args.num_train_epochs = 1

    processors = {
        "sst-wiki": SstWikiProcessor,
        "pronoun": PronounProcessor
    }

    output_modes = {
        "sst-wiki": "classification",
        "pronoun": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    # Per-step batch size; accumulated steps add up to the requested size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps


    # Note: the seed comes from `params`, not from `args.seed`.
    torch.manual_seed(params['seed'])
    if n_gpu > 0:
        torch.cuda.manual_seed_all(params['seed'])

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    input_processor_type = args.input_processor_type.lower()

    if input_processor_type not in processors:
        raise ValueError("Task not found: %s" % (input_processor_type))

    processor = processors[input_processor_type]()
    output_mode = output_modes[input_processor_type]

    label_list = processor.get_labels()
    print(label_list)
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        limit = 2 if debug else 0
        train_examples = processor.get_train_examples(args.data_dir, limit)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
              cache_dir=cache_dir,
              num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                             t_total=num_train_optimization_steps)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        print("typ\tepoch\tacc\tavg_mean_mass\tavg_max_mass\tloss\thammer_loss\tlabel_match_score\tavg_mean_vn\tavg_max_vn\tavg_min_vn")
        model.train()

        # One extra iteration: epoch 0 skips training and only evaluates.
        for epoch in trange(int(args.num_train_epochs) + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            
            if epoch > 0:
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch

                    # define a new function to compute loss values for both output_modes
                    logits, attention_probs_layers, category_mask, _ = model(input_ids, 
                                                                            token_type_ids=segment_ids,
                                                                            pad_attention_mask=input_mask,
                                                                            manipulate_attention=True,
                                                                            category_mask=None,
                                                                            labels=None)
                    loss_fct = CrossEntropyLoss() # averages the loss over B
                    loss =loss_fct(logits, label_ids)
                    # Add the attention penalty (first element of the tuple).
                    loss += attention_regularization_loss(attention_probs_layers, 
                                                            category_mask,
                                                            input_mask,
                                                            args.hammer_coeff, 
                                                            optimize_func=args.att_opt_func,
                                                            debug=debug)[0]
                    
                    if n_gpu > 1:
                        loss = loss.mean() # mean() to average on multi-gpu.
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # modify learning rate with special warm up BERT uses
                            # if args.fp16 is False, BertAdam is used that handles this automatically
                            lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step/num_train_optimization_steps,
                                                                                    args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1
                    if debug:
                        break

            # EVALUATION AFTER EVERY EPOCH
            eval_preds = {}
            for typ in ["dev", "test"]:
                eval_preds[typ] = run_evaluation(args, processor, label_list, tokenizer, output_mode, epoch, 
                                                    model, num_labels, tr_loss, global_step, device, input_processor_type, 
                                                    base_labels, debug, typ)

            #dump labels after the last epoch, or when first_run
            if args.first_run or epoch == args.num_train_epochs:
                for typ in ["dev", "test"]:
                    preds = eval_preds[typ]
                    filename = "{}_labels_{}_.txt".format(typ, epoch)
                    labels_file = os.path.join(args.output_dir, filename)
                    with open(labels_file, "w") as writer:
                        logger.info("Dumping labels in the file: {}".format(labels_file))
                        writer.write('\n'.join([str(pred) for pred in preds]))

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
    # The reloaded model is only bound locally; nothing is returned.
    model.to(device)

def run_evaluation(args, processor, label_list, tokenizer, output_mode, epoch,
                    model, num_labels, tr_loss, global_step, device, input_processor_type,
                    base_labels, debug, typ="dev"):
    """Evaluate ``model`` on the dev or test split and dump the metrics to disk.

    The function runs only when ``args.do_eval`` is set and this process is
    either non-distributed (``args.local_rank == -1``) or distributed rank 0;
    otherwise it returns ``None`` without doing anything.

    Args:
        args: Run configuration. Attributes read here: ``do_eval``,
            ``local_rank``, ``data_dir``, ``max_seq_length``,
            ``eval_batch_size``, ``hammer_coeff``, ``att_opt_func``,
            ``do_train``, ``first_run`` and ``output_dir``.
        processor: Object exposing ``get_dev_examples(data_dir, limit)`` and
            ``get_test_examples(data_dir, limit)``.
        label_list: Forwarded to ``convert_examples_to_features``.
        tokenizer: Forwarded to ``convert_examples_to_features``.
        output_mode: Forwarded to ``convert_examples_to_features``.
        epoch: Current epoch; used in the results file name and to decide
            whether a training loss is reported (only when ``epoch > 0``).
        model: Callable returning ``(logits, attention_probs_layers,
            category_mask, vnfs)`` for a batch.
        num_labels: Number of classes, used to reshape the logits for the
            cross-entropy loss.
        tr_loss: Accumulated training loss; reported as
            ``tr_loss / global_step``.
        global_step: Number of optimisation steps taken so far.
        device: Device the input tensors are moved to.
        input_processor_type: Forwarded to ``compute_metrics``.
        base_labels: Mapping from ``typ`` to reference labels; only read
            when ``args.first_run`` is false, to compute
            ``label_match_score``.
        debug: When true, ``limit=2`` is passed to the processor and only
            the first batch is evaluated.
        typ: ``"dev"`` selects the dev examples; any other value selects the
            test examples.

    Returns:
        A NumPy array with the arg-max class index per evaluated example, or
        ``None`` when evaluation is skipped.

    Side effects:
        Writes ``<typ>_<seed>_<hammer_coeff>_<epoch>_results.txt`` into
        ``args.output_dir``, logs every metric, and prints per-batch
        attention masses plus a tab-separated summary line.

    NOTE(review): the results file name reads ``params['seed']`` from the
    module-level ``params`` dict rather than from ``args``; the function
    fails with ``NameError``/``KeyError`` if that global is not populated.
    """
    # Guard clause: nothing to do unless evaluation is requested on the main process.
    if not (args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0)):
        return None

    # NOTE(review): 0 presumably means "no limit" for the processor -- confirm.
    limit = 2 if debug else 0
    if typ == "dev":
        eval_examples = processor.get_dev_examples(args.data_dir, limit)
    else:
        eval_examples = processor.get_test_examples(args.data_dir, limit)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
    logger.info("***** Running evaluation on " + typ + " data*****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Sequential order keeps predictions aligned with ``all_label_ids``.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    model.eval()
    # Built once instead of once per batch; averages the loss over the batch.
    loss_fct = CrossEntropyLoss()

    eval_loss = 0
    ce_eval_loss = 0
    ar_eval_loss = 0
    nb_eval_steps = 0
    # Per-batch logits, concatenated once after the loop (avoids the
    # quadratic re-copying of repeated ``np.append`` calls).
    logit_batches = []
    # Despite its name, this collects the per-batch ``non_reg_mass`` value.
    attmaxidx = []
    tmp_avg_attention_mass = 0.0
    tmp_max_attention_mass = 0.0
    tmp_max_attention_mass_non_reg = 0.0
    tmp_non_reg_mass = 0.0
    tmp_pad_mass = 0.0

    # Running sums of the three ``vnfs`` entries (reported below as the
    # mean / max / min value norms).
    tmp_vnfs = [0., 0., 0.]

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            logits, attention_probs_layers, category_mask, vnfs = model(input_ids,
                                                                    token_type_ids=segment_ids,
                                                                    pad_attention_mask=input_mask,
                                                                    manipulate_attention=True,
                                                                    category_mask=None,
                                                                    labels=None)

        # Classification loss for this batch.
        tmp_ce_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

        # Attention-regularisation loss and the accompanying attention-mass statistics.
        tmp_ar_eval_loss, avg_attention_mass, non_reg_mass, total_mass, pad_mass, \
            max_attention_mass, max_non_reg_mass, idxmaxloss = \
            attention_regularization_loss(attention_probs_layers,
                                            category_mask,
                                            input_mask,
                                            args.hammer_coeff,
                                            optimize_func=args.att_opt_func)
        tmp_max_attention_mass_non_reg += max_non_reg_mass.item()
        tmp_avg_attention_mass += avg_attention_mass.item()
        tmp_max_attention_mass += max_attention_mass.item()
        tmp_non_reg_mass += non_reg_mass.item()
        tmp_pad_mass += pad_mass.item()

        tmp_vnfs = [tmp_vnfs[i] + vnfs[i] for i in range(3)]

        tmp_eval_loss = tmp_ce_eval_loss + tmp_ar_eval_loss

        eval_loss += tmp_eval_loss.mean().item()
        ce_eval_loss += tmp_ce_eval_loss.mean().item()
        ar_eval_loss += tmp_ar_eval_loss.mean().item()
        attmaxidx.append(non_reg_mass.item())
        print('reg attention sum per layer')
        print(total_mass)
        # ``total_mass`` is only printed; it is deliberately not stored, so
        # no per-batch device tensor is kept alive for the whole evaluation.

        nb_eval_steps += 1
        logit_batches.append(logits.detach().cpu().numpy())

        if debug:
            # Debug runs evaluate a single batch only.
            break

    eval_loss = eval_loss / nb_eval_steps
    ce_eval_loss = ce_eval_loss / nb_eval_steps
    ar_eval_loss = ar_eval_loss / nb_eval_steps
    print('attetnion')
    print(pd.Series(attmaxidx).describe())

    preds = np.argmax(np.concatenate(logit_batches, axis=0), axis=1)
    result = compute_metrics(input_processor_type, preds, all_label_ids.numpy())
    loss = tr_loss/global_step if (args.do_train and epoch > 0) else None

    result['eval_loss'] = eval_loss
    result['ce_eval_loss'] = ce_eval_loss
    result['ar_eval_loss'] = ar_eval_loss

    result['global_step'] = global_step
    result['loss'] = loss

    result['avg_mean_attention_mass'] = tmp_avg_attention_mass / nb_eval_steps
    result['avg_max_attention_mass'] = tmp_max_attention_mass / nb_eval_steps
    result['avg_non_reg_attention_mass'] = tmp_non_reg_mass / nb_eval_steps
    result['avg_pad_attention_mass'] = tmp_pad_mass / nb_eval_steps
    result['avg_max_attention_mass_non_reg'] = tmp_max_attention_mass_non_reg / nb_eval_steps
    result['avg_mean_value_norm'] = tmp_vnfs[0]*1. / nb_eval_steps
    result['avg_max_value_norm'] = tmp_vnfs[1]*1. / nb_eval_steps
    result['avg_min_value_norm'] = tmp_vnfs[2]*1. / nb_eval_steps
    result['attmaxidx'] = attmaxidx

    result['label_match_score'] = 0.0
    if not args.first_run:
        # Agreement with the reference labels, truncated to the number of
        # predictions actually made (fewer in debug runs).
        num_preds = len(preds)
        result['label_match_score'] = simple_accuracy(preds, base_labels[typ][0:num_preds])

    # ``params`` is the module-level sweep configuration, not a local.
    output_eval_file = os.path.join(
        args.output_dir,
        typ + "_" + str(params['seed']) + "_" + str(args.hammer_coeff) + "_" + str(epoch) + "_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** {} results *****".format(typ))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    # One tab-separated summary row per evaluation.
    summary_fields = [
        typ,
        epoch,
        result['acc'],
        result['avg_mean_attention_mass'],
        result['avg_max_attention_mass'],
        result['avg_max_attention_mass_non_reg'],
        result['eval_loss'],
        result['ar_eval_loss'],
        result['label_match_score'],
        result['avg_mean_value_norm'],
        result['avg_max_value_norm'],
        result['avg_min_value_norm'],
        result['attmaxidx'],
    ]
    print('\t'.join([str(elem) for elem in summary_fields]))
    return preds




In [None]:
# Experiment sweep: every combination of seed, attention-penalty coefficient
# and attention-optimisation function is passed to ``main`` in turn.
# ``params`` stays a single module-level dict that is refreshed in place
# before each call.
params = {}
odir = 'twitnotebook'
for seed in range(1, 5):
    for hammer_coeff in (0, 0.1, 1.0):
        for opt_func in ('mean', 'max'):
            params.update(
                output_dir=odir,
                hammer_coeff=hammer_coeff,
                seed=seed,
                optfunc=opt_func,
            )
            main(params)

01/28/2021 14:23:55 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False


FIRST RUN: True
['1', '2', '3', '4', '5']


01/28/2021 14:23:56 - INFO - pytorch_pretrained_bert_local.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\howar\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
01/28/2021 14:23:57 - INFO - pytorch_pretrained_bert_local.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at C:\Users\howar\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
01/28/2021 14:23:57 - INFO - pytorch_pretrained_bert_local.modeling -   extracting archive file C:\Users\howar\.pytorch_pretrained_bert\distributed_-1\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af6397

Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}



01/28/2021 14:24:02 - INFO - pytorch_pretrained_bert_local.modeling -   Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
01/28/2021 14:24:02 - INFO - pytorch_pretrained_bert_local.modeling -   Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
01/28/2021 14:24:03 - INFO - __main__ -   Writing example 0 of 7000
01/28/2021 14:24:03 - INFO - __main__ -   *** Example ***
01/28/2021 14:24:03 - INFO - __main__ -   guid: train-0
01/28/2021 14:24:03 - INFO - __main__ -   tokens: [CLS] my wife took me here on my birthday for breakfast and it was excellent . the weather was perfect which made sitting outside overlooki

01/28/2021 14:24:03 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

01/28/2021 14:24:03 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

01/28/2021 14:26:00 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

01/28/2021 14:26:00 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

01/28/2021 14:26:00 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

typ	epoch	acc	avg_mean_mass	avg_max_mass	loss	hammer_loss	label_match_score	avg_mean_vn	avg_max_vn	avg_min_vn


01/28/2021 14:26:16 - INFO - __main__ -   ***** Running evaluation on dev data*****
01/28/2021 14:26:16 - INFO - __main__ -     Num examples = 1000
01/28/2021 14:26:16 - INFO - __main__ -     Batch size = 1

Evaluating:   0%|                                                                                                                                                                                             | 0/1000 [00:00<?, ?it/s][A
Evaluating:   0%|▏                                                                                                                                                                                    | 1/1000 [00:00<05:57,  2.79it/s][A
Evaluating:   0%|▌                                                                                                                                                                                    | 3/1000 [00:00<04:25,  3.76it/s][A

reg attention sum per layer
tensor([[[0.0628],
         [0.0854],
         [0.0590],
         [0.0230],
         [0.0083],
         [0.0088],
         [0.0191],
         [0.0065],
         [0.0018],
         [0.0061],
         [0.0611],
         [0.0262]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0473],
         [0.0826],
         [0.0777],
         [0.0206],
         [0.0218],
         [0.0107],
         [0.0099],
         [0.0216],
         [0.0138],
         [0.0039],
         [0.0273],
         [0.0235]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0331],
         [0.0372],
         [0.0267],
         [0.0118],
         [0.0301],
         [0.0099],
         [0.0158],
         [0.0049],
         [0.0083],
         [0.0114],
         [0.0456],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0260],
         [0.0241],
         [0.0654],
         [0.0162],
         [0.0526],
         [0.0080],
         [0.0121],
         [0.00


Evaluating:   1%|█                                                                                                                                                                                    | 6/1000 [00:00<03:19,  4.99it/s][A

tensor([[[0.0706],
         [0.0533],
         [0.0407],
         [0.0175],
         [0.0373],
         [0.0180],
         [0.0139],
         [0.0284],
         [0.0154],
         [0.0136],
         [0.0246],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0143],
         [0.0176],
         [0.0129],
         [0.0154],
         [0.0063],
         [0.0166],
         [0.0094],
         [0.0135],
         [0.0054],
         [0.0045],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1109],
         [0.1251],
         [0.0358],
         [0.0204],
         [0.0218],
         [0.0087],
         [0.0401],
         [0.0217],
         [0.0103],
         [0.0134],
         [0.0039],
         [0.0235]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0036],
         [0.0103],
         [0.0508],
         [0.0106],
         [0.0095],
         [0.0018],
         [0.0029],
         [0.0041],
         [0.0020],
    


Evaluating:   1%|█▋                                                                                                                                                                                   | 9/1000 [00:00<02:32,  6.51it/s][A
Evaluating:   1%|██▏                                                                                                                                                                                 | 12/1000 [00:00<01:59,  8.24it/s][A


reg attention sum per layer
tensor([[[0.0056],
         [0.0039],
         [0.0056],
         [0.0028],
         [0.0065],
         [0.0032],
         [0.0031],
         [0.0018],
         [0.0006],
         [0.0014],
         [0.0073],
         [0.0043]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0160],
         [0.0145],
         [0.0709],
         [0.0097],
         [0.0128],
         [0.0134],
         [0.0047],
         [0.0073],
         [0.0015],
         [0.0070],
         [0.0242],
         [0.0760]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0096],
         [0.0736],
         [0.0148],
         [0.0051],
         [0.0042],
         [0.0033],
         [0.0122],
         [0.0019],
         [0.0008],
         [0.0031],
         [0.0423],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0313],
         [0.1329],
         [0.0307],
         [0.1001],
         [0.0109],
         [0.0093],
         [0.0


Evaluating:   2%|██▋                                                                                                                                                                                 | 15/1000 [00:01<01:37, 10.13it/s][A

reg attention sum per layer
tensor([[[0.0844],
         [0.0412],
         [0.0686],
         [0.0131],
         [0.0684],
         [0.0437],
         [0.0343],
         [0.0311],
         [0.0042],
         [0.0247],
         [0.0281],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0454],
         [0.0446],
         [0.0357],
         [0.0061],
         [0.0211],
         [0.0091],
         [0.0115],
         [0.0124],
         [0.0046],
         [0.0039],
         [0.0138],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0202],
         [0.0139],
         [0.0474],
         [0.0110],
         [0.0396],
         [0.0112],
         [0.0100],
         [0.0245],
         [0.0094],
         [0.0056],
         [0.0158],
         [0.0397]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.1135],
         [0.0603],
         [0.0228],
         [0.0164],
         [0.0079],
         [0.0723],
         [0.01


Evaluating:   2%|███▏                                                                                                                                                                                | 18/1000 [00:01<01:21, 12.04it/s][A
Evaluating:   2%|███▊                                                                                                                                                                                | 21/1000 [00:01<01:10, 13.91it/s][A

tensor([[[0.0189],
         [0.0177],
         [0.0333],
         [0.0296],
         [0.0080],
         [0.0248],
         [0.0115],
         [0.0012],
         [0.0008],
         [0.0021],
         [0.0555],
         [0.0273]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0286],
         [0.0323],
         [0.0054],
         [0.0030],
         [0.0057],
         [0.0080],
         [0.0052],
         [0.0009],
         [0.0014],
         [0.0087],
         [0.0129]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0575],
         [0.0362],
         [0.1455],
         [0.0320],
         [0.0221],
         [0.0081],
         [0.0103],
         [0.0095],
         [0.0102],
         [0.0064],
         [0.0233],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0068],
         [0.0132],
         [0.0187],
         [0.0048],
         [0.0088],
         [0.0047],
         [0.0074],
         [0.0019],
         [0.0042],
    


Evaluating:   2%|████▎                                                                                                                                                                               | 24/1000 [00:01<01:02, 15.60it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0788],
         [0.0394],
         [0.0203],
         [0.0138],
         [0.0115],
         [0.0152],
         [0.0045],
         [0.0042],
         [0.0027],
         [0.0803],
         [0.0608]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0345],
         [0.0471],
         [0.0110],
         [0.0090],
         [0.0117],
         [0.0102],
         [0.0113],
         [0.0041],
         [0.0052],
         [0.0119],
         [0.0201]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0073],
         [0.0239],
         [0.0149],
         [0.0026],
         [0.0022],
         [0.0058],
         [0.0083],
         [0.0025],
         [0.0014],
         [0.0019],
         [0.0146],
         [0.00


Evaluating:   3%|████▊                                                                                                                                                                               | 27/1000 [00:01<00:57, 17.00it/s][A

tensor([[[0.0241],
         [0.0165],
         [0.0485],
         [0.0072],
         [0.0488],
         [0.0160],
         [0.0238],
         [0.0072],
         [0.0280],
         [0.0105],
         [0.0131],
         [0.0292]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0570],
         [0.0184],
         [0.0304],
         [0.0149],
         [0.0142],
         [0.0078],
         [0.0325],
         [0.0190],
         [0.0061],
         [0.0081],
         [0.0170],
         [0.0080]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0192],
         [0.0188],
         [0.0809],
         [0.0267],
         [0.0109],
         [0.0050],
         [0.0148],
         [0.0151],
         [0.0037],
         [0.0123],
         [0.1323],
         [0.0544]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0114],
         [0.0570],
         [0.0068],
         [0.0085],
         [0.0150],
         [0.0188],
         [0.0050],
         [0.0163],
    


Evaluating:   3%|█████▍                                                                                                                                                                              | 30/1000 [00:01<00:53, 18.20it/s][A
Evaluating:   3%|█████▉                                                                                                                                                                              | 33/1000 [00:01<00:50, 19.04it/s][A


reg attention sum per layer
tensor([[[0.0486],
         [0.0588],
         [0.0731],
         [0.0166],
         [0.0377],
         [0.0171],
         [0.0279],
         [0.0269],
         [0.0095],
         [0.0148],
         [0.0200],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0591],
         [0.0654],
         [0.0268],
         [0.0244],
         [0.0147],
         [0.0195],
         [0.0078],
         [0.0101],
         [0.0150],
         [0.0668],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0609],
         [0.0310],
         [0.0374],
         [0.0261],
         [0.0417],
         [0.0066],
         [0.0224],
         [0.0094],
         [0.0067],
         [0.0096],
         [0.0131],
         [0.0460]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0114],
         [0.0361],
         [0.0742],
         [0.0257],
         [0.0120],
         [0.0164],
         [0.0054],
         [0.0


Evaluating:   4%|██████▍                                                                                                                                                                             | 36/1000 [00:01<00:48, 19.75it/s][A


reg attention sum per layer
tensor([[[0.0857],
         [0.1216],
         [0.0590],
         [0.0156],
         [0.0134],
         [0.0116],
         [0.0170],
         [0.0096],
         [0.0024],
         [0.0103],
         [0.0341],
         [0.0210]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0119],
         [0.0391],
         [0.0113],
         [0.0328],
         [0.0153],
         [0.0093],
         [0.0157],
         [0.0100],
         [0.0140],
         [0.0064],
         [0.0593]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0292],
         [0.0806],
         [0.0559],
         [0.0210],
         [0.0380],
         [0.0220],
         [0.0238],
         [0.0242],
         [0.0270],
         [0.0177],
         [0.0206],
         [0.0302]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0197],
         [0.0330],
         [0.0587],
         [0.0274],
         [0.0309],
         [0.0118],
         [0.0685],
         [0.0


Evaluating:   4%|███████                                                                                                                                                                             | 39/1000 [00:02<00:47, 20.28it/s][A
Evaluating:   4%|███████▌                                                                                                                                                                            | 42/1000 [00:02<00:46, 20.79it/s][A

reg attention sum per layer
tensor([[[0.0248],
         [0.0512],
         [0.0416],
         [0.0170],
         [0.0119],
         [0.0074],
         [0.0293],
         [0.0057],
         [0.0055],
         [0.0111],
         [0.0274],
         [0.0089]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0386],
         [0.0793],
         [0.1406],
         [0.0194],
         [0.0325],
         [0.0197],
         [0.0183],
         [0.0274],
         [0.0217],
         [0.0131],
         [0.0553],
         [0.0395]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0255],
         [0.0707],
         [0.0118],
         [0.0177],
         [0.0212],
         [0.0344],
         [0.0336],
         [0.0095],
         [0.0078],
         [0.0117],
         [0.0342]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0358],
         [0.0622],
         [0.0534],
         [0.0200],
         [0.0253],
         [0.0126],
         [0.0960],
         [0.01


Evaluating:   4%|████████                                                                                                                                                                            | 45/1000 [00:02<00:45, 21.04it/s][A

tensor([[[0.0186],
         [0.0463],
         [0.0146],
         [0.0044],
         [0.0174],
         [0.0700],
         [0.0339],
         [0.0054],
         [0.0031],
         [0.0074],
         [0.0366],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0183],
         [0.0417],
         [0.0089],
         [0.0083],
         [0.0162],
         [0.0112],
         [0.0393],
         [0.0047],
         [0.0029],
         [0.0061],
         [0.0273],
         [0.0082]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1197],
         [0.1265],
         [0.0558],
         [0.0284],
         [0.0143],
         [0.0107],
         [0.0414],
         [0.0121],
         [0.0048],
         [0.0141],
         [0.0232],
         [0.0496]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0324],
         [0.0261],
         [0.0994],
         [0.0209],
         [0.0597],
         [0.0353],
         [0.0315],
         [0.0113],
         [0.0100],
    


Evaluating:   5%|████████▋                                                                                                                                                                           | 48/1000 [00:02<00:45, 21.13it/s][A

tensor([[[0.0121],
         [0.0629],
         [0.0510],
         [0.0083],
         [0.0258],
         [0.0203],
         [0.0051],
         [0.0037],
         [0.0014],
         [0.0024],
         [0.0804],
         [0.0393]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0427],
         [0.0422],
         [0.0230],
         [0.0151],
         [0.0601],
         [0.0166],
         [0.0362],
         [0.0055],
         [0.0083],
         [0.0112],
         [0.0290],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0071],
         [0.0308],
         [0.0468],
         [0.0088],
         [0.0129],
         [0.0195],
         [0.0201],
         [0.0567],
         [0.0135],
         [0.0114],
         [0.0241],
         [0.0167]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1508],
         [0.0305],
         [0.0122],
         [0.0137],
         [0.0136],
         [0.0272],
         [0.0242],
         [0.0262],
         [0.0032],
    


Evaluating:   5%|█████████▏                                                                                                                                                                          | 51/1000 [00:02<00:44, 21.32it/s][A
Evaluating:   5%|█████████▋                                                                                                                                                                          | 54/1000 [00:02<00:43, 21.51it/s][A


reg attention sum per layer
tensor([[[0.0356],
         [0.0423],
         [0.2966],
         [0.0766],
         [0.0520],
         [0.0126],
         [0.0092],
         [0.0104],
         [0.0046],
         [0.0132],
         [0.3547],
         [0.2036]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0289],
         [0.0542],
         [0.0616],
         [0.0232],
         [0.0300],
         [0.0182],
         [0.0278],
         [0.0083],
         [0.0067],
         [0.0090],
         [0.0322],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0136],
         [0.0450],
         [0.0090],
         [0.0166],
         [0.0039],
         [0.0093],
         [0.0106],
         [0.0015],
         [0.0079],
         [0.0076],
         [0.0095]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0229],
         [0.0171],
         [0.0249],
         [0.0082],
         [0.0216],
         [0.0125],
         [0.0215],
         [0.0


Evaluating:   6%|██████████▎                                                                                                                                                                         | 57/1000 [00:02<00:43, 21.55it/s][A

reg attention sum per layer
tensor([[[0.0153],
         [0.0220],
         [0.0216],
         [0.0099],
         [0.0071],
         [0.0056],
         [0.0245],
         [0.0276],
         [0.0073],
         [0.0101],
         [0.0052],
         [0.0280]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0477],
         [0.0290],
         [0.0692],
         [0.0141],
         [0.0373],
         [0.0128],
         [0.0342],
         [0.0095],
         [0.0181],
         [0.0125],
         [0.0076],
         [0.0460]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0109],
         [0.0171],
         [0.0562],
         [0.0228],
         [0.0487],
         [0.0101],
         [0.0126],
         [0.0022],
         [0.0082],
         [0.0065],
         [0.0165],
         [0.0328]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0393],
         [0.0261],
         [0.3148],
         [0.0443],
         [0.0844],
         [0.0306],
         [0.1532],
         [0.00


Evaluating:   6%|██████████▊                                                                                                                                                                         | 60/1000 [00:03<00:43, 21.58it/s][A
Evaluating:   6%|███████████▎                                                                                                                                                                        | 63/1000 [00:03<00:43, 21.69it/s][A

tensor([[[0.0217],
         [0.0625],
         [0.0556],
         [0.0271],
         [0.0506],
         [0.0085],
         [0.0218],
         [0.0074],
         [0.0060],
         [0.0059],
         [0.0265],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0624],
         [0.0616],
         [0.0121],
         [0.0146],
         [0.0098],
         [0.0077],
         [0.0069],
         [0.0055],
         [0.0004],
         [0.0045],
         [0.0317],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0131],
         [0.0060],
         [0.0056],
         [0.0076],
         [0.0069],
         [0.0016],
         [0.0072],
         [0.0014],
         [0.0027],
         [0.0037],
         [0.0023],
         [0.0034]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0285],
         [0.0121],
         [0.0274],
         [0.0118],
         [0.0135],
         [0.0031],
         [0.0096],
         [0.0042],
         [0.0399],
    


Evaluating:   7%|███████████▉                                                                                                                                                                        | 66/1000 [00:03<00:43, 21.58it/s][A

tensor([[[0.0189],
         [0.0327],
         [0.0440],
         [0.0092],
         [0.0193],
         [0.0083],
         [0.0184],
         [0.0257],
         [0.0031],
         [0.0073],
         [0.0082],
         [0.0141]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0157],
         [0.0096],
         [0.0121],
         [0.0077],
         [0.0175],
         [0.0044],
         [0.0138],
         [0.0101],
         [0.0115],
         [0.0052],
         [0.0027],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0094],
         [0.0079],
         [0.0097],
         [0.0026],
         [0.0074],
         [0.0079],
         [0.0138],
         [0.0011],
         [0.0032],
         [0.0044],
         [0.0076],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0093],
         [0.0167],
         [0.0366],
         [0.0334],
         [0.0157],
         [0.0142],
         [0.0013],
         [0.0028],
         [0.0005],
    


Evaluating:   7%|████████████▍                                                                                                                                                                       | 69/1000 [00:03<00:43, 21.60it/s][A

tensor([[[0.0206],
         [0.0098],
         [0.0495],
         [0.0117],
         [0.0203],
         [0.0166],
         [0.0433],
         [0.0027],
         [0.0048],
         [0.0072],
         [0.0125],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1206],
         [0.1197],
         [0.0624],
         [0.0678],
         [0.0287],
         [0.0365],
         [0.1796],
         [0.0230],
         [0.0276],
         [0.0443],
         [0.2293],
         [0.1137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0377],
         [0.0308],
         [0.0858],
         [0.0250],
         [0.0170],
         [0.0050],
         [0.0181],
         [0.0078],
         [0.0426],
         [0.0107],
         [0.0237],
         [0.0470]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0230],
         [0.0240],
         [0.0850],
         [0.0240],
         [0.0158],
         [0.0045],
         [0.0066],
         [0.0035],
         [0.0010],
    


Evaluating:   7%|████████████▉                                                                                                                                                                       | 72/1000 [00:03<00:42, 21.65it/s][A
Evaluating:   8%|█████████████▌                                                                                                                                                                      | 75/1000 [00:03<00:42, 21.75it/s][A


reg attention sum per layer
tensor([[[0.0155],
         [0.0171],
         [0.0524],
         [0.0226],
         [0.0320],
         [0.0082],
         [0.0169],
         [0.0063],
         [0.0063],
         [0.0050],
         [0.0193],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0454],
         [0.0192],
         [0.0193],
         [0.0076],
         [0.0091],
         [0.0116],
         [0.0656],
         [0.0037],
         [0.0072],
         [0.0064],
         [0.0020],
         [0.0115]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0192],
         [0.0177],
         [0.0314],
         [0.0123],
         [0.0474],
         [0.0085],
         [0.0080],
         [0.0060],
         [0.0038],
         [0.0074],
         [0.0110],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0189],
         [0.0188],
         [0.0367],
         [0.0079],
         [0.0083],
         [0.0208],
         [0.0181],
         [0.0


Evaluating:   8%|██████████████                                                                                                                                                                      | 78/1000 [00:03<00:42, 21.71it/s][A

reg attention sum per layer
tensor([[[0.0263],
         [0.0103],
         [0.0253],
         [0.0101],
         [0.0187],
         [0.0061],
         [0.0184],
         [0.0067],
         [0.0084],
         [0.0067],
         [0.0077],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0215],
         [0.0095],
         [0.0149],
         [0.0074],
         [0.0223],
         [0.0209],
         [0.0274],
         [0.0051],
         [0.0135],
         [0.0138],
         [0.0096],
         [0.0077]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0491],
         [0.0316],
         [0.0253],
         [0.0104],
         [0.0151],
         [0.0252],
         [0.0461],
         [0.0053],
         [0.0028],
         [0.0111],
         [0.0301],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0127],
         [0.0117],
         [0.0065],
         [0.0086],
         [0.0040],
         [0.0027],
         [0.00


Evaluating:   8%|██████████████▌                                                                                                                                                                     | 81/1000 [00:04<00:42, 21.69it/s][A
Evaluating:   8%|███████████████                                                                                                                                                                     | 84/1000 [00:04<00:42, 21.77it/s][A

tensor([[[0.0534],
         [0.0345],
         [0.0347],
         [0.0073],
         [0.0111],
         [0.0032],
         [0.0320],
         [0.0186],
         [0.0124],
         [0.0114],
         [0.0052],
         [0.0398]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0876],
         [0.0303],
         [0.0375],
         [0.0101],
         [0.0428],
         [0.0370],
         [0.0107],
         [0.0319],
         [0.0057],
         [0.0126],
         [0.0200],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0460],
         [0.0393],
         [0.0359],
         [0.0213],
         [0.0482],
         [0.0077],
         [0.0386],
         [0.0119],
         [0.0121],
         [0.0218],
         [0.0095],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0121],
         [0.0129],
         [0.0169],
         [0.0042],
         [0.0119],
         [0.0092],
         [0.0162],
         [0.0136],
         [0.0048],
    


Evaluating:   9%|███████████████▋                                                                                                                                                                    | 87/1000 [00:04<00:41, 21.75it/s][A

tensor([[[0.1398],
         [0.0650],
         [0.0128],
         [0.0094],
         [0.0226],
         [0.0060],
         [0.0576],
         [0.0250],
         [0.0263],
         [0.0601],
         [0.0025],
         [0.0204]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0411],
         [0.0162],
         [0.0239],
         [0.0196],
         [0.0683],
         [0.0405],
         [0.0141],
         [0.0077],
         [0.0531],
         [0.0106],
         [0.0302],
         [0.0589]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0386],
         [0.0514],
         [0.0406],
         [0.0200],
         [0.0238],
         [0.0054],
         [0.0286],
         [0.0105],
         [0.0032],
         [0.0089],
         [0.0320],
         [0.0142]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0485],
         [0.0668],
         [0.1141],
         [0.0337],
         [0.0241],
         [0.0450],
         [0.0205],
         [0.0092],
         [0.0054],
    


Evaluating:   9%|████████████████▏                                                                                                                                                                   | 90/1000 [00:04<00:41, 21.72it/s][A
Evaluating:   9%|████████████████▋                                                                                                                                                                   | 93/1000 [00:04<00:41, 21.74it/s][A


reg attention sum per layer
tensor([[[0.0458],
         [0.0459],
         [0.0115],
         [0.0103],
         [0.0067],
         [0.0099],
         [0.0213],
         [0.0139],
         [0.0049],
         [0.0084],
         [0.0079],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.0170],
         [0.0195],
         [0.0095],
         [0.0125],
         [0.0047],
         [0.0154],
         [0.0097],
         [0.0026],
         [0.0053],
         [0.0077],
         [0.0075]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0532],
         [0.0287],
         [0.0377],
         [0.0152],
         [0.0301],
         [0.0134],
         [0.0081],
         [0.0140],
         [0.0041],
         [0.0084],
         [0.0342],
         [0.0163]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0409],
         [0.0166],
         [0.0440],
         [0.0076],
         [0.0128],
         [0.0072],
         [0.0192],
         [0.0


Evaluating:  10%|█████████████████▎                                                                                                                                                                  | 96/1000 [00:04<00:41, 21.71it/s][A

reg attention sum per layer
tensor([[[0.0255],
         [0.0181],
         [0.0091],
         [0.0090],
         [0.0047],
         [0.0094],
         [0.0035],
         [0.0217],
         [0.0019],
         [0.0054],
         [0.0352],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0462],
         [0.0579],
         [0.0794],
         [0.0301],
         [0.0464],
         [0.0382],
         [0.0161],
         [0.0035],
         [0.0028],
         [0.0116],
         [0.1477],
         [0.0904]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0224],
         [0.0360],
         [0.0728],
         [0.0140],
         [0.0796],
         [0.0228],
         [0.0336],
         [0.0138],
         [0.0213],
         [0.0522],
         [0.0413],
         [0.0114]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.0425],
         [0.0291],
         [0.0141],
         [0.0104],
         [0.0048],
         [0.0210],
         [0.01


Evaluating:  10%|█████████████████▊                                                                                                                                                                  | 99/1000 [00:04<00:41, 21.68it/s][A

tensor([[[0.0164],
         [0.0062],
         [0.0074],
         [0.0127],
         [0.0359],
         [0.0103],
         [0.0081],
         [0.0019],
         [0.0055],
         [0.0074],
         [0.0136],
         [0.0047]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0379],
         [0.0185],
         [0.0185],
         [0.0178],
         [0.0063],
         [0.0095],
         [0.0427],
         [0.0023],
         [0.0016],
         [0.0090],
         [0.0124],
         [0.0098]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0463],
         [0.0221],
         [0.0477],
         [0.0209],
         [0.0386],
         [0.0063],
         [0.0079],
         [0.0201],
         [0.0104],
         [0.0029],
         [0.0145],
         [0.0620]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0149],
         [0.0556],
         [0.0553],
         [0.0157],
         [0.0473],
         [0.0114],
         [0.0238],
         [0.0113],
         [0.0097],
    


Evaluating:  10%|██████████████████▎                                                                                                                                                                | 102/1000 [00:05<00:41, 21.58it/s][A
Evaluating:  10%|██████████████████▊                                                                                                                                                                | 105/1000 [00:05<00:41, 21.59it/s][A

tensor([[[0.0236],
         [0.0496],
         [0.0484],
         [0.0083],
         [0.0931],
         [0.1201],
         [0.0237],
         [0.0072],
         [0.0098],
         [0.0177],
         [0.1228],
         [0.0440]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0197],
         [0.0302],
         [0.0217],
         [0.0138],
         [0.0043],
         [0.0106],
         [0.0108],
         [0.0021],
         [0.0006],
         [0.0023],
         [0.0206],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0279],
         [0.0568],
         [0.0315],
         [0.0039],
         [0.0085],
         [0.0103],
         [0.0045],
         [0.0017],
         [0.0021],
         [0.0231],
         [0.0745]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0342],
         [0.0333],
         [0.0686],
         [0.0095],
         [0.0304],
         [0.0162],
         [0.0228],
         [0.0081],
         [0.0212],
    


Evaluating:  11%|███████████████████▎                                                                                                                                                               | 108/1000 [00:05<00:41, 21.61it/s][A

tensor([[[0.0423],
         [0.0094],
         [0.0110],
         [0.0348],
         [0.0189],
         [0.0260],
         [0.0257],
         [0.0156],
         [0.0041],
         [0.0079],
         [0.0074],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0795],
         [0.0097],
         [0.0147],
         [0.0080],
         [0.0315],
         [0.0051],
         [0.0418],
         [0.0066],
         [0.0065],
         [0.0075],
         [0.0053],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0053],
         [0.0113],
         [0.0811],
         [0.0080],
         [0.0103],
         [0.0011],
         [0.0073],
         [0.0448],
         [0.0038],
         [0.0014],
         [0.0155],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0072],
         [0.0435],
         [0.0054],
         [0.0296],
         [0.0217],
         [0.0364],
         [0.0059],
         [0.0097],
    


Evaluating:  11%|███████████████████▊                                                                                                                                                               | 111/1000 [00:05<00:41, 21.62it/s][A

tensor([[[0.0378],
         [0.0818],
         [0.0590],
         [0.0133],
         [0.0093],
         [0.0112],
         [0.0363],
         [0.0074],
         [0.0058],
         [0.0099],
         [0.0537],
         [0.0455]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0502],
         [0.0488],
         [0.1021],
         [0.0179],
         [0.0402],
         [0.0453],
         [0.0172],
         [0.0251],
         [0.0051],
         [0.0068],
         [0.0588],
         [0.0366]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0033],
         [0.0044],
         [0.0150],
         [0.0048],
         [0.0074],
         [0.0054],
         [0.0076],
         [0.0074],
         [0.0131],
         [0.0026],
         [0.0017],
         [0.01


Evaluating:  11%|████████████████████▍                                                                                                                                                              | 114/1000 [00:05<00:40, 21.62it/s][A
Evaluating:  12%|████████████████████▉                                                                                                                                                              | 117/1000 [00:05<00:41, 21.49it/s][A


reg attention sum per layer
tensor([[[0.0080],
         [0.0401],
         [0.0194],
         [0.0053],
         [0.0044],
         [0.0197],
         [0.0105],
         [0.0057],
         [0.0018],
         [0.0023],
         [0.0428],
         [0.0213]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0556],
         [0.0490],
         [0.0234],
         [0.0071],
         [0.0081],
         [0.0098],
         [0.0234],
         [0.0074],
         [0.0061],
         [0.0045],
         [0.0051],
         [0.0165]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0291],
         [0.0639],
         [0.0341],
         [0.0090],
         [0.0139],
         [0.0186],
         [0.0171],
         [0.0285],
         [0.0074],
         [0.0091],
         [0.0294],
         [0.0232]]], device='cuda:0')
reg attention sum per layer



Evaluating:  12%|█████████████████████▍                                                                                                                                                             | 120/1000 [00:05<00:41, 21.40it/s][A

tensor([[[0.0241],
         [0.0446],
         [0.1400],
         [0.0095],
         [0.0612],
         [0.0140],
         [0.0294],
         [0.0147],
         [0.0297],
         [0.0091],
         [0.0365],
         [0.0975]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0582],
         [0.0562],
         [0.0611],
         [0.0341],
         [0.0106],
         [0.0245],
         [0.0010],
         [0.0048],
         [0.0049],
         [0.4437],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1008],
         [0.0550],
         [0.0405],
         [0.0109],
         [0.0117],
         [0.0356],
         [0.0682],
         [0.0107],
         [0.0054],
         [0.0065],
         [0.0145],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0083],
         [0.0112],
         [0.0273],
         [0.0030],
         [0.0090],
         [0.0031],
         [0.0114],
         [0.0020],
         [0.0014],
    


Evaluating:  12%|██████████████████████                                                                                                                                                             | 123/1000 [00:06<00:41, 21.38it/s][A


tensor([[[0.0828],
         [0.0788],
         [0.0264],
         [0.0281],
         [0.0100],
         [0.0207],
         [0.0262],
         [0.0140],
         [0.0166],
         [0.0571],
         [0.0170],
         [0.0479]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0538],
         [0.0350],
         [0.0439],
         [0.0187],
         [0.0338],
         [0.1213],
         [0.0386],
         [0.0105],
         [0.0152],
         [0.0301],
         [0.0362],
         [0.0339]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0258],
         [0.0179],
         [0.0293],
         [0.0187],
         [0.0134],
         [0.0101],
         [0.0147],
         [0.0180],
         [0.0063],
         [0.0103],
         [0.0090],
         [0.0446]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0115],
         [0.0320],
         [0.0086],
         [0.0201],
         [0.0162],
         [0.0085],
         [0.0057],
         [0.0052],
    

Evaluating:  13%|██████████████████████▌                                                                                                                                                            | 126/1000 [00:06<00:40, 21.50it/s][A
Evaluating:  13%|███████████████████████                                                                                                                                                            | 129/1000 [00:06<00:40, 21.51it/s][A

reg attention sum per layer
tensor([[[0.1896],
         [0.1196],
         [0.0404],
         [0.0159],
         [0.0238],
         [0.0057],
         [0.0507],
         [0.0075],
         [0.0084],
         [0.0124],
         [0.0111],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0057],
         [0.0049],
         [0.0183],
         [0.0063],
         [0.0097],
         [0.0074],
         [0.0049],
         [0.0004],
         [0.0018],
         [0.0070],
         [0.0836],
         [0.0056]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0080],
         [0.0350],
         [0.0113],
         [0.0217],
         [0.0278],
         [0.0647],
         [0.0119],
         [0.0209],
         [0.0063],
         [0.0074],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0418],
         [0.0394],
         [0.0505],
         [0.0156],
         [0.0290],
         [0.0147],
         [0.0101],
         [0.01


Evaluating:  13%|███████████████████████▋                                                                                                                                                           | 132/1000 [00:06<00:40, 21.51it/s][A
Evaluating:  14%|████████████████████████▏                                                                                                                                                          | 135/1000 [00:06<00:40, 21.55it/s][A

reg attention sum per layer
tensor([[[0.0320],
         [0.0393],
         [0.0777],
         [0.0150],
         [0.0588],
         [0.0086],
         [0.0170],
         [0.0162],
         [0.0286],
         [0.0074],
         [0.0218],
         [0.0607]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0425],
         [0.0322],
         [0.0052],
         [0.0150],
         [0.0060],
         [0.0080],
         [0.0152],
         [0.0077],
         [0.0187],
         [0.0566],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0515],
         [0.0398],
         [0.0282],
         [0.0097],
         [0.0088],
         [0.0111],
         [0.0288],
         [0.0035],
         [0.0023],
         [0.0059],
         [0.0225],
         [0.0301]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0240],
         [0.0404],
         [0.0740],
         [0.0180],
         [0.0300],
         [0.0198],
         [0.0189],
         [0.00


Evaluating:  14%|████████████████████████▋                                                                                                                                                          | 138/1000 [00:06<00:39, 21.57it/s][A

tensor([[[0.0439],
         [0.1218],
         [0.0565],
         [0.0319],
         [0.0605],
         [0.1066],
         [0.0586],
         [0.1028],
         [0.0448],
         [0.0167],
         [0.0876],
         [0.0978]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0322],
         [0.2931],
         [0.0398],
         [0.1131],
         [0.0226],
         [0.0068],
         [0.0410],
         [0.0072],
         [0.0064],
         [0.1063],
         [0.0685]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0231],
         [0.0200],
         [0.0126],
         [0.0209],
         [0.0099],
         [0.0417],
         [0.0282],
         [0.0323],
         [0.0072],
         [0.0035],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0231],
         [0.0389],
         [0.0053],
         [0.0146],
         [0.0112],
         [0.0104],
         [0.0113],
         [0.0218],
    


Evaluating:  14%|█████████████████████████▏                                                                                                                                                         | 141/1000 [00:06<00:39, 21.50it/s][A

tensor([[[0.0392],
         [0.0328],
         [0.0115],
         [0.0073],
         [0.0120],
         [0.0298],
         [0.0337],
         [0.0014],
         [0.0011],
         [0.0058],
         [0.0522],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0299],
         [0.0313],
         [0.0126],
         [0.0439],
         [0.0118],
         [0.0222],
         [0.0078],
         [0.0153],
         [0.0111],
         [0.0358],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0627],
         [0.0534],
         [0.0663],
         [0.0081],
         [0.0112],
         [0.0103],
         [0.0118],
         [0.0100],
         [0.0036],
         [0.0047],
         [0.0169],
         [0.0285]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0221],
         [0.0487],
         [0.0151],
         [0.0351],
         [0.0139],
         [0.0286],
         [0.0079],
         [0.0039],
    


Evaluating:  14%|█████████████████████████▊                                                                                                                                                         | 144/1000 [00:06<00:39, 21.54it/s][A
Evaluating:  15%|██████████████████████████▎                                                                                                                                                        | 147/1000 [00:07<00:39, 21.57it/s][A

tensor([[[0.0562],
         [0.0533],
         [0.0689],
         [0.0247],
         [0.0183],
         [0.0210],
         [0.0257],
         [0.0174],
         [0.0081],
         [0.0041],
         [0.0348],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0323],
         [0.0251],
         [0.0395],
         [0.0164],
         [0.0371],
         [0.0084],
         [0.0225],
         [0.0068],
         [0.0020],
         [0.0180],
         [0.0158],
         [0.0353]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0569],
         [0.0476],
         [0.0049],
         [0.0106],
         [0.0216],
         [0.0268],
         [0.0167],
         [0.0176],
         [0.0107],
         [0.0357],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0121],
         [0.0091],
         [0.0090],
         [0.0079],
         [0.0132],
         [0.0025],
         [0.0115],
         [0.0170],
         [0.0043],
    


Evaluating:  15%|██████████████████████████▊                                                                                                                                                        | 150/1000 [00:07<00:39, 21.45it/s][A

tensor([[[0.0364],
         [0.0460],
         [0.0335],
         [0.0209],
         [0.0161],
         [0.0086],
         [0.0291],
         [0.0181],
         [0.0060],
         [0.0098],
         [0.0114],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0411],
         [0.0338],
         [0.0299],
         [0.0217],
         [0.0240],
         [0.0222],
         [0.0152],
         [0.1016],
         [0.0191],
         [0.0258],
         [0.0214],
         [0.0459]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0177],
         [0.0545],
         [0.1352],
         [0.0352],
         [0.1057],
         [0.0128],
         [0.0238],
         [0.0046],
         [0.0102],
         [0.0126],
         [0.0347],
         [0.1192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0356],
         [0.0195],
         [0.0083],
         [0.0082],
         [0.0042],
         [0.0134],
         [0.0052],
         [0.0028],
    


Evaluating:  15%|███████████████████████████▍                                                                                                                                                       | 153/1000 [00:07<00:39, 21.42it/s][A

tensor([[[0.0119],
         [0.0098],
         [0.0462],
         [0.0086],
         [0.0190],
         [0.0047],
         [0.0076],
         [0.0070],
         [0.0301],
         [0.0062],
         [0.0124],
         [0.0314]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0455],
         [0.0222],
         [0.0414],
         [0.0143],
         [0.0198],
         [0.0154],
         [0.0156],
         [0.0209],
         [0.0041],
         [0.0040],
         [0.0151],
         [0.0311]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0272],
         [0.0519],
         [0.0157],
         [0.0234],
         [0.0225],
         [0.0079],
         [0.0028],
         [0.0019],
         [0.0074],
         [0.0425],
         [0.0290]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1070],
         [0.0916],
         [0.0884],
         [0.0711],
         [0.0168],
         [0.0065],
         [0.0971],
         [0.0142],
         [0.0180],
    


Evaluating:  16%|███████████████████████████▉                                                                                                                                                       | 156/1000 [00:07<00:39, 21.58it/s][A
Evaluating:  16%|████████████████████████████▍                                                                                                                                                      | 159/1000 [00:07<00:38, 21.63it/s][A


reg attention sum per layer
tensor([[[0.1113],
         [0.0344],
         [0.0168],
         [0.0073],
         [0.0122],
         [0.0290],
         [0.0423],
         [0.0047],
         [0.0031],
         [0.0194],
         [0.0072],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0492],
         [0.0092],
         [0.1959],
         [0.0091],
         [0.0155],
         [0.0157],
         [0.0356],
         [0.0009],
         [0.0047],
         [0.0356],
         [0.0085],
         [0.0509]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0198],
         [0.0065],
         [0.0267],
         [0.0132],
         [0.0122],
         [0.0193],
         [0.0086],
         [0.0014],
         [0.0011],
         [0.0010],
         [0.0481],
         [0.0384]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0403],
         [0.0441],
         [0.0222],
         [0.0181],
         [0.0204],
         [0.0575],
         [0.0343],
         [0.0


Evaluating:  16%|████████████████████████████▉                                                                                                                                                      | 162/1000 [00:07<00:38, 21.68it/s][A


reg attention sum per layer
tensor([[[0.0169],
         [0.0520],
         [0.0338],
         [0.0150],
         [0.0124],
         [0.0123],
         [0.0071],
         [0.0048],
         [0.0018],
         [0.0021],
         [0.0200],
         [0.0591]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0276],
         [0.0826],
         [0.0879],
         [0.0233],
         [0.0247],
         [0.0151],
         [0.0103],
         [0.0063],
         [0.0046],
         [0.0065],
         [0.0516],
         [0.0419]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0351],
         [0.0201],
         [0.0838],
         [0.0565],
         [0.0775],
         [0.0229],
         [0.0093],
         [0.0096],
         [0.0234],
         [0.0121],
         [0.0508],
         [0.0519]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1322],
         [0.0608],
         [0.0652],
         [0.0447],
         [0.0635],
         [0.0152],
         [0.0505],
         [0.0


Evaluating:  16%|█████████████████████████████▌                                                                                                                                                     | 165/1000 [00:07<00:38, 21.58it/s][A
Evaluating:  17%|██████████████████████████████                                                                                                                                                     | 168/1000 [00:08<00:38, 21.71it/s][A

reg attention sum per layer
tensor([[[0.0931],
         [0.1290],
         [0.0340],
         [0.0277],
         [0.0201],
         [0.0155],
         [0.0241],
         [0.0104],
         [0.0054],
         [0.0186],
         [0.0198],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0465],
         [0.0310],
         [0.0110],
         [0.0131],
         [0.0254],
         [0.0087],
         [0.0160],
         [0.0032],
         [0.0074],
         [0.0279],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0256],
         [0.0276],
         [0.0948],
         [0.0148],
         [0.0157],
         [0.0105],
         [0.0090],
         [0.0055],
         [0.0051],
         [0.0031],
     


Evaluating:  17%|██████████████████████████████▌                                                                                                                                                    | 171/1000 [00:08<00:38, 21.55it/s][A

tensor([[[0.0811],
         [0.0536],
         [0.0989],
         [0.0280],
         [0.0920],
         [0.0582],
         [0.0362],
         [0.0445],
         [0.0506],
         [0.0198],
         [0.0579],
         [0.0767]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0242],
         [0.1018],
         [0.0169],
         [0.0260],
         [0.0069],
         [0.0249],
         [0.0313],
         [0.0194],
         [0.0103],
         [0.0180],
         [0.0621]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0724],
         [0.0464],
         [0.0192],
         [0.0075],
         [0.0055],
         [0.0154],
         [0.0350],
         [0.0412],
         [0.0090],
         [0.0249],
         [0.0044],
         [0.0126]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0085],
         [0.0098],
         [0.0380],
         [0.0074],
         [0.0031],
         [0.0080],
         [0.0070],
         [0.0014],
         [0.0020],
    


Evaluating:  17%|███████████████████████████████▏                                                                                                                                                   | 174/1000 [00:08<00:38, 21.57it/s][A

tensor([[[0.0057],
         [0.0109],
         [0.0280],
         [0.0019],
         [0.0063],
         [0.0053],
         [0.0089],
         [0.0012],
         [0.0006],
         [0.0024],
         [0.0251],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0551],
         [0.0088],
         [0.0116],
         [0.0059],
         [0.0058],
         [0.0018],
         [0.0164],
         [0.0007],
         [0.0013],
         [0.0046],
         [0.0046],
         [0.0104]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0432],
         [0.0204],
         [0.0381],
         [0.0125],
         [0.0331],
         [0.0030],
         [0.0067],
         [0.0035],
         [0.0250],
         [0.0056],
         [0.0250],
         [0.0360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0773],
         [0.0656],
         [0.0194],
         [0.0257],
         [0.0134],
         [0.0410],
         [0.0021],
         [0.0027],
    


Evaluating:  18%|███████████████████████████████▋                                                                                                                                                   | 177/1000 [00:08<00:38, 21.60it/s][A
Evaluating:  18%|████████████████████████████████▏                                                                                                                                                  | 180/1000 [00:08<00:37, 21.65it/s][A

tensor([[[0.0156],
         [0.0836],
         [0.1359],
         [0.0207],
         [0.0451],
         [0.0130],
         [0.0092],
         [0.0222],
         [0.0037],
         [0.0040],
         [0.0610],
         [0.0601]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0193],
         [0.0133],
         [0.0427],
         [0.0130],
         [0.0196],
         [0.0268],
         [0.0201],
         [0.0078],
         [0.0076],
         [0.0080],
         [0.0158],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0418],
         [0.0381],
         [0.0698],
         [0.0119],
         [0.0266],
         [0.0518],
         [0.0194],
         [0.0102],
         [0.0040],
         [0.0053],
         [0.0114],
         [0.0233]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0034],
         [0.0132],
         [0.0795],
         [0.0096],
         [0.0149],
         [0.0036],
         [0.0053],
         [0.0033],
         [0.0013],
    


Evaluating:  18%|████████████████████████████████▊                                                                                                                                                  | 183/1000 [00:08<00:37, 21.51it/s][A

tensor([[[0.0255],
         [0.0144],
         [0.0178],
         [0.0063],
         [0.0174],
         [0.0112],
         [0.0106],
         [0.0106],
         [0.0039],
         [0.0076],
         [0.0061],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0384],
         [0.0539],
         [0.0312],
         [0.0064],
         [0.0118],
         [0.0135],
         [0.0250],
         [0.0267],
         [0.0054],
         [0.0188],
         [0.0194],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0614],
         [0.0405],
         [0.0515],
         [0.0100],
         [0.0227],
         [0.0118],
         [0.0349],
         [0.0099],
         [0.0487],
         [0.0176],
         [0.0049],
         [0.0749]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0351],
         [0.0463],
         [0.0892],
         [0.0211],
         [0.0127],
         [0.0245],
         [0.0315],
         [0.0149],
         [0.0094],
    


Evaluating:  19%|█████████████████████████████████▎                                                                                                                                                 | 186/1000 [00:08<00:38, 21.37it/s][A

tensor([[[0.0777],
         [0.0310],
         [0.0594],
         [0.0172],
         [0.0451],
         [0.0190],
         [0.0705],
         [0.0345],
         [0.0265],
         [0.0147],
         [0.0208],
         [0.0257]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0961],
         [0.2471],
         [0.0444],
         [0.0108],
         [0.0248],
         [0.0238],
         [0.0516],
         [0.0032],
         [0.0120],
         [0.0111],
         [0.0426],
         [0.0366]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0159],
         [0.0377],
         [0.0062],
         [0.0114],
         [0.0153],
         [0.0098],
         [0.0027],
         [0.0029],
         [0.0036],
         [0.0081],
         [0.0392]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0470],
         [0.0198],
         [0.0175],
         [0.0065],
         [0.0198],
         [0.0059],
         [0.0889],
         [0.0038],
         [0.0163],
    


Evaluating:  19%|█████████████████████████████████▊                                                                                                                                                 | 189/1000 [00:09<00:37, 21.40it/s][A
Evaluating:  19%|██████████████████████████████████▎                                                                                                                                                | 192/1000 [00:09<00:37, 21.41it/s][A

tensor([[[0.0110],
         [0.0184],
         [0.0542],
         [0.0122],
         [0.0166],
         [0.0076],
         [0.0131],
         [0.0056],
         [0.0021],
         [0.0056],
         [0.0143],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0288],
         [0.0374],
         [0.0379],
         [0.0082],
         [0.0179],
         [0.0120],
         [0.0377],
         [0.0045],
         [0.0176],
         [0.0110],
         [0.0098],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0478],
         [0.0333],
         [0.0163],
         [0.0252],
         [0.0185],
         [0.0055],
         [0.0029],
         [0.0009],
         [0.0045],
         [0.0459],
         [0.0191]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0045],
         [0.0086],
         [0.0170],
         [0.0092],
         [0.0147],
         [0.0104],
         [0.0103],
         [0.0096],
         [0.0058],
    


Evaluating:  20%|██████████████████████████████████▉                                                                                                                                                | 195/1000 [00:09<00:37, 21.36it/s][A

tensor([[[0.0208],
         [0.0476],
         [0.0743],
         [0.0208],
         [0.0137],
         [0.0144],
         [0.0059],
         [0.0077],
         [0.0019],
         [0.0024],
         [0.0409],
         [0.0329]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0643],
         [0.1029],
         [0.0342],
         [0.0093],
         [0.0126],
         [0.0073],
         [0.0637],
         [0.0451],
         [0.0057],
         [0.0301],
         [0.0163],
         [0.0240]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0487],
         [0.0433],
         [0.0347],
         [0.0313],
         [0.0327],
         [0.0071],
         [0.0108],
         [0.0031],
         [0.0016],
         [0.0076],
         [0.0987],
         [0.0432]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0220],
         [0.0279],
         [0.0123],
         [0.0138],
         [0.0162],
         [0.0278],
         [0.0223],
         [0.0167],
    


Evaluating:  20%|███████████████████████████████████▍                                                                                                                                               | 198/1000 [00:09<00:37, 21.45it/s][A
Evaluating:  20%|███████████████████████████████████▉                                                                                                                                               | 201/1000 [00:09<00:37, 21.55it/s][A


reg attention sum per layer
tensor([[[0.0313],
         [0.0090],
         [0.0160],
         [0.0029],
         [0.0075],
         [0.0036],
         [0.0278],
         [0.0062],
         [0.0084],
         [0.0061],
         [0.0015],
         [0.0160]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0255],
         [0.0124],
         [0.0598],
         [0.0149],
         [0.0285],
         [0.0192],
         [0.0147],
         [0.0500],
         [0.0165],
         [0.0075],
         [0.0068],
         [0.0511]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1521],
         [0.0991],
         [0.0676],
         [0.0651],
         [0.0487],
         [0.0363],
         [0.0200],
         [0.0611],
         [0.0102],
         [0.0166],
         [0.0985],
         [0.0366]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0587],
         [0.1433],
         [0.1842],
         [0.0629],
         [0.1239],
         [0.0201],
         [0.0259],
         [0.0


Evaluating:  20%|████████████████████████████████████▌                                                                                                                                              | 204/1000 [00:09<00:36, 21.53it/s][A

reg attention sum per layer
tensor([[[0.0544],
         [0.0275],
         [0.0442],
         [0.0064],
         [0.0212],
         [0.0096],
         [0.0273],
         [0.0032],
         [0.0038],
         [0.0068],
         [0.0074],
         [0.0264]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0172],
         [0.0268],
         [0.0582],
         [0.0395],
         [0.0279],
         [0.0032],
         [0.0069],
         [0.0043],
         [0.0013],
         [0.0031],
         [0.0568],
         [0.0587]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0277],
         [0.1013],
         [0.0411],
         [0.0218],
         [0.0167],
         [0.0099],
         [0.0070],
         [0.0053],
         [0.0032],
         [0.0069],
         [0.0112],
         [0.0391]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0083],
         [0.0234],
         [0.0413],
         [0.0132],
         [0.0130],
         [0.0091],
         [0.0115],
         [0.00


Evaluating:  21%|█████████████████████████████████████                                                                                                                                              | 207/1000 [00:09<00:36, 21.52it/s][A

tensor([[[0.0455],
         [0.1274],
         [0.0383],
         [0.0045],
         [0.0105],
         [0.0124],
         [0.0186],
         [0.0043],
         [0.0108],
         [0.0138],
         [0.0045],
         [0.0232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0333],
         [0.0947],
         [0.0883],
         [0.0214],
         [0.0096],
         [0.0315],
         [0.0088],
         [0.0036],
         [0.0042],
         [0.0133],
         [0.0653],
         [0.0631]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0418],
         [0.0787],
         [0.0218],
         [0.0190],
         [0.0169],
         [0.0224],
         [0.0178],
         [0.0135],
         [0.0028],
         [0.0081],
         [0.0336],
         [0.0360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2192],
         [0.0222],
         [0.1042],
         [0.0184],
         [0.0203],
         [0.0206],
         [0.0228],
         [0.0044],
         [0.0057],
    


Evaluating:  21%|█████████████████████████████████████▌                                                                                                                                             | 210/1000 [00:10<00:36, 21.50it/s][A
Evaluating:  21%|██████████████████████████████████████▏                                                                                                                                            | 213/1000 [00:10<00:36, 21.49it/s][A

tensor([[[0.0125],
         [0.0178],
         [0.0061],
         [0.0023],
         [0.0029],
         [0.0026],
         [0.0146],
         [0.0066],
         [0.0021],
         [0.0026],
         [0.0045],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0225],
         [0.0311],
         [0.0054],
         [0.0029],
         [0.0117],
         [0.0092],
         [0.0070],
         [0.0014],
         [0.0012],
         [0.0504],
         [0.0639]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0134],
         [0.0191],
         [0.0400],
         [0.0140],
         [0.0260],
         [0.0022],
         [0.0145],
         [0.0073],
         [0.0132],
         [0.0047],
         [0.0077],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1272],
         [0.0903],
         [0.1430],
         [0.0370],
         [0.0415],
         [0.0283],
         [0.0714],
         [0.0543],
         [0.0346],
    


Evaluating:  22%|██████████████████████████████████████▋                                                                                                                                            | 216/1000 [00:10<00:36, 21.54it/s][A

tensor([[[0.0106],
         [0.0036],
         [0.0059],
         [0.0019],
         [0.0031],
         [0.0015],
         [0.0046],
         [0.0034],
         [0.0089],
         [0.0039],
         [0.0013],
         [0.0038]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0399],
         [0.0253],
         [0.0120],
         [0.0164],
         [0.0264],
         [0.0469],
         [0.0079],
         [0.0061],
         [0.0089],
         [0.0384],
         [0.0130]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0082],
         [0.0091],
         [0.0158],
         [0.0110],
         [0.0041],
         [0.0123],
         [0.0419],
         [0.0056],
         [0.0014],
         [0.0107],
         [0.1410],
         [0.0246]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0162],
         [0.0717],
         [0.0237],
         [0.0635],
         [0.0074],
         [0.0556],
         [0.0137],
         [0.0155],
    


Evaluating:  22%|███████████████████████████████████████▏                                                                                                                                           | 219/1000 [00:10<00:36, 21.43it/s][A

tensor([[[0.0117],
         [0.0250],
         [0.0159],
         [0.0029],
         [0.0069],
         [0.0180],
         [0.0023],
         [0.0097],
         [0.0005],
         [0.0170],
         [0.1108],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0293],
         [0.0149],
         [0.0041],
         [0.0063],
         [0.0046],
         [0.0043],
         [0.0026],
         [0.0013],
         [0.0029],
         [0.0063],
         [0.0359]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0215],
         [0.0410],
         [0.0591],
         [0.0223],
         [0.0537],
         [0.0284],
         [0.0270],
         [0.0212],
         [0.0145],
         [0.0208],
         [0.0138],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0191],
         [0.0872],
         [0.0161],
         [0.0228],
         [0.0056],
         [0.0035],
         [0.0071],
         [0.0013],
    


Evaluating:  22%|███████████████████████████████████████▋                                                                                                                                           | 222/1000 [00:10<00:36, 21.49it/s][A
Evaluating:  22%|████████████████████████████████████████▎                                                                                                                                          | 225/1000 [00:10<00:36, 21.49it/s][A

tensor([[[0.0557],
         [0.0887],
         [0.0802],
         [0.0109],
         [0.0177],
         [0.0299],
         [0.0376],
         [0.0527],
         [0.0083],
         [0.0119],
         [0.0447],
         [0.0917]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0232],
         [0.0459],
         [0.0682],
         [0.0254],
         [0.0298],
         [0.0228],
         [0.0153],
         [0.0015],
         [0.0014],
         [0.0078],
         [0.0581],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0579],
         [0.0467],
         [0.0437],
         [0.0217],
         [0.0476],
         [0.0375],
         [0.0776],
         [0.0422],
         [0.0237],
         [0.0187],
         [0.0142],
         [0.0367]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0279],
         [0.0245],
         [0.0207],
         [0.0074],
         [0.0261],
         [0.0206],
         [0.0185],
         [0.0209],
         [0.0104],
    


Evaluating:  23%|████████████████████████████████████████▊                                                                                                                                          | 228/1000 [00:10<00:36, 21.40it/s][A

tensor([[[0.0247],
         [0.0073],
         [0.0318],
         [0.0136],
         [0.0414],
         [0.0070],
         [0.0102],
         [0.0117],
         [0.0042],
         [0.0061],
         [0.0082],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0417],
         [0.0274],
         [0.0120],
         [0.0092],
         [0.0139],
         [0.0233],
         [0.0447],
         [0.0058],
         [0.0126],
         [0.0073],
         [0.0057],
         [0.0086]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.0526],
         [0.0577],
         [0.0095],
         [0.0173],
         [0.0209],
         [0.0280],
         [0.0115],
         [0.0237],
         [0.0164],
         [0.0123],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0318],
         [0.0526],
         [0.0593],
         [0.0193],
         [0.0400],
         [0.0207],
         [0.0210],
         [0.0093],
         [0.0124],
    


Evaluating:  23%|█████████████████████████████████████████▎                                                                                                                                         | 231/1000 [00:11<00:36, 21.33it/s][A

tensor([[[0.0188],
         [0.0436],
         [0.0536],
         [0.0139],
         [0.0221],
         [0.0059],
         [0.0306],
         [0.0058],
         [0.0369],
         [0.0102],
         [0.0209],
         [0.0802]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0178],
         [0.2123],
         [0.0116],
         [0.0751],
         [0.0063],
         [0.0130],
         [0.0028],
         [0.0031],
         [0.0018],
         [0.0232],
         [0.1453]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0184],
         [0.1133],
         [0.0282],
         [0.0205],
         [0.0036],
         [0.0041],
         [0.0197],
         [0.0018],
         [0.0030],
         [0.0328],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0131],
         [0.0408],
         [0.0133],
         [0.0253],
         [0.0271],
         [0.0192],
         [0.0060],
         [0.0161],
    


Evaluating:  23%|█████████████████████████████████████████▉                                                                                                                                         | 234/1000 [00:11<00:35, 21.47it/s][A
Evaluating:  24%|██████████████████████████████████████████▍                                                                                                                                        | 237/1000 [00:11<00:35, 21.57it/s][A

tensor([[[0.0432],
         [0.0288],
         [0.0128],
         [0.0056],
         [0.0096],
         [0.0137],
         [0.0095],
         [0.0158],
         [0.0038],
         [0.0230],
         [0.0059],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1234],
         [0.2306],
         [0.0863],
         [0.0792],
         [0.0400],
         [0.0203],
         [0.0171],
         [0.0128],
         [0.0148],
         [0.0222],
         [0.1211],
         [0.0463]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0472],
         [0.0365],
         [0.0475],
         [0.0161],
         [0.0200],
         [0.0124],
         [0.0301],
         [0.0198],
         [0.0080],
         [0.0139],
         [0.0175],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0325],
         [0.0264],
         [0.0340],
         [0.0071],
         [0.0265],
         [0.0258],
         [0.0187],
         [0.0327],
         [0.0043],
    


Evaluating:  24%|██████████████████████████████████████████▉                                                                                                                                        | 240/1000 [00:11<00:35, 21.59it/s][A

tensor([[[0.0157],
         [0.0350],
         [0.0570],
         [0.0163],
         [0.0448],
         [0.0224],
         [0.0120],
         [0.0038],
         [0.0029],
         [0.0084],
         [0.0258],
         [0.0501]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0379],
         [0.0446],
         [0.1308],
         [0.0161],
         [0.0352],
         [0.0131],
         [0.0124],
         [0.0060],
         [0.0037],
         [0.0065],
         [0.1162],
         [0.0546]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0159],
         [0.0142],
         [0.0121],
         [0.0064],
         [0.0049],
         [0.0048],
         [0.0036],
         [0.0014],
         [0.0038],
         [0.0020],
         [0.0033],
         [0.0278]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0244],
         [0.0180],
         [0.0241],
         [0.0083],
         [0.0124],
         [0.0075],
         [0.0076],
         [0.0067],
         [0.0033],
    


Evaluating:  24%|███████████████████████████████████████████▍                                                                                                                                       | 243/1000 [00:11<00:35, 21.51it/s][A
Evaluating:  25%|████████████████████████████████████████████                                                                                                                                       | 246/1000 [00:11<00:34, 21.63it/s][A


reg attention sum per layer
tensor([[[0.0467],
         [0.0331],
         [0.0317],
         [0.0112],
         [0.0623],
         [0.0308],
         [0.0570],
         [0.0111],
         [0.0117],
         [0.0236],
         [0.0189],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2677],
         [0.1820],
         [0.0540],
         [0.0245],
         [0.0209],
         [0.0184],
         [0.0375],
         [0.0391],
         [0.0037],
         [0.0341],
         [0.0650],
         [0.0466]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0386],
         [0.0831],
         [0.0297],
         [0.0181],
         [0.0071],
         [0.0132],
         [0.0029],
         [0.0068],
         [0.0041],
         [0.0914],
         [0.0541]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0092],
         [0.0240],
         [0.0202],
         [0.0048],
         [0.0053],
         [0.0058],
         [0.0050],
         [0.0


Evaluating:  25%|████████████████████████████████████████████▌                                                                                                                                      | 249/1000 [00:11<00:34, 21.64it/s][A

reg attention sum per layer
tensor([[[0.0164],
         [0.0219],
         [0.0236],
         [0.0114],
         [0.0759],
         [0.0079],
         [0.0168],
         [0.0364],
         [0.0069],
         [0.0331],
         [0.0134],
         [0.0567]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0325],
         [0.0182],
         [0.0084],
         [0.0186],
         [0.0129],
         [0.0089],
         [0.0052],
         [0.0030],
         [0.0063],
         [0.0343],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0499],
         [0.1050],
         [0.0602],
         [0.0203],
         [0.0455],
         [0.0072],
         [0.0236],
         [0.0152],
         [0.0096],
         [0.0061],
         [0.0281],
         [0.0705]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0128],
         [0.1054],
         [0.0099],
         [0.0105],
         [0.0068],
         [0.0036],
         [0.00


Evaluating:  25%|█████████████████████████████████████████████                                                                                                                                      | 252/1000 [00:11<00:34, 21.45it/s][A

tensor([[[0.0474],
         [0.0672],
         [0.0774],
         [0.0267],
         [0.0218],
         [0.0146],
         [0.0436],
         [0.0155],
         [0.0366],
         [0.0172],
         [0.0528],
         [0.0380]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1133],
         [0.0841],
         [0.0578],
         [0.0127],
         [0.0321],
         [0.0255],
         [0.0238],
         [0.1216],
         [0.0021],
         [0.0158],
         [0.0119],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0604],
         [0.0395],
         [0.0207],
         [0.0079],
         [0.0103],
         [0.0022],
         [0.0157],
         [0.0191],
         [0.0116],
         [0.0157],
         [0.0125],
         [0.0034]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0345],
         [0.0516],
         [0.0159],
         [0.0642],
         [0.0123],
         [0.0312],
         [0.0046],
         [0.0063],
    


Evaluating:  26%|█████████████████████████████████████████████▋                                                                                                                                     | 255/1000 [00:12<00:34, 21.51it/s][A
Evaluating:  26%|██████████████████████████████████████████████▏                                                                                                                                    | 258/1000 [00:12<00:34, 21.55it/s][A

tensor([[[0.0150],
         [0.0188],
         [0.0271],
         [0.0070],
         [0.0342],
         [0.0086],
         [0.0028],
         [0.0132],
         [0.0007],
         [0.0041],
         [0.0296],
         [0.0061]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0416],
         [0.0202],
         [0.0208],
         [0.0157],
         [0.0061],
         [0.0039],
         [0.0233],
         [0.0026],
         [0.0068],
         [0.0106],
         [0.0075],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0177],
         [0.0653],
         [0.1104],
         [0.0090],
         [0.0040],
         [0.0017],
         [0.0169],
         [0.0158],
         [0.0051],
         [0.0071],
         [0.0193],
         [0.0418]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0648],
         [0.0390],
         [0.0274],
         [0.0125],
         [0.0165],
         [0.0117],
         [0.0301],
         [0.1000],
         [0.0515],
    


Evaluating:  26%|██████████████████████████████████████████████▋                                                                                                                                    | 261/1000 [00:12<00:34, 21.53it/s][A

tensor([[[0.0050],
         [0.0146],
         [0.0213],
         [0.0118],
         [0.0023],
         [0.0077],
         [0.0096],
         [0.0035],
         [0.0035],
         [0.0011],
         [0.0408],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0438],
         [0.0323],
         [0.0329],
         [0.0230],
         [0.0253],
         [0.0217],
         [0.0228],
         [0.0107],
         [0.0172],
         [0.0113],
         [0.0147],
         [0.0252]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0247],
         [0.0314],
         [0.0733],
         [0.0174],
         [0.0308],
         [0.0400],
         [0.0322],
         [0.0117],
         [0.0093],
         [0.0066],
         [0.0366],
         [0.0532]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0231],
         [0.0114],
         [0.0471],
         [0.0101],
         [0.0117],
         [0.0113],
         [0.0057],
         [0.0008],
         [0.0013],
    


Evaluating:  26%|███████████████████████████████████████████████▎                                                                                                                                   | 264/1000 [00:12<00:34, 21.38it/s][A

tensor([[[0.0157],
         [0.0273],
         [0.0764],
         [0.0334],
         [0.0277],
         [0.0072],
         [0.0071],
         [0.0067],
         [0.0020],
         [0.0046],
         [0.0498],
         [0.1257]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0056],
         [0.0417],
         [0.0426],
         [0.0061],
         [0.0124],
         [0.0280],
         [0.0201],
         [0.0192],
         [0.0079],
         [0.0118],
         [0.0331],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0195],
         [0.0630],
         [0.0600],
         [0.0051],
         [0.0237],
         [0.0069],
         [0.0163],
         [0.0122],
         [0.0073],
         [0.0232],
         [0.0256],
         [0.0500]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0522],
         [0.0147],
         [0.0194],
         [0.0336],
         [0.0105],
         [0.0089],
         [0.0809],
         [0.0141],
         [0.0042],
    


Evaluating:  27%|███████████████████████████████████████████████▊                                                                                                                                   | 267/1000 [00:12<00:34, 21.46it/s][A
Evaluating:  27%|████████████████████████████████████████████████▎                                                                                                                                  | 270/1000 [00:12<00:33, 21.56it/s][A

tensor([[[0.0039],
         [0.0120],
         [0.0274],
         [0.0025],
         [0.0045],
         [0.0061],
         [0.0080],
         [0.0071],
         [0.0018],
         [0.0034],
         [0.0856],
         [0.0098]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0146],
         [0.0496],
         [0.0128],
         [0.0079],
         [0.0015],
         [0.0053],
         [0.0035],
         [0.0067],
         [0.0033],
         [0.0153],
         [0.0197]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0357],
         [0.0404],
         [0.0582],
         [0.0149],
         [0.0126],
         [0.0141],
         [0.0216],
         [0.0054],
         [0.0028],
         [0.0029],
         [0.0309],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0303],
         [0.0493],
         [0.0048],
         [0.0107],
         [0.0075],
         [0.0126],
         [0.0147],
         [0.0148],
    


Evaluating:  27%|████████████████████████████████████████████████▊                                                                                                                                  | 273/1000 [00:12<00:33, 21.54it/s][A

tensor([[[0.0269],
         [0.0370],
         [0.0134],
         [0.0095],
         [0.0096],
         [0.0447],
         [0.0455],
         [0.0051],
         [0.0031],
         [0.0077],
         [0.0229],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0231],
         [0.0536],
         [0.0338],
         [0.0080],
         [0.0083],
         [0.0053],
         [0.0111],
         [0.0048],
         [0.0013],
         [0.0016],
         [0.0148],
         [0.0345]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0368],
         [0.0334],
         [0.0504],
         [0.0157],
         [0.0157],
         [0.0200],
         [0.0111],
         [0.0337],
         [0.0046],
         [0.0082],
         [0.0533],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0181],
         [0.0068],
         [0.0405],
         [0.0113],
         [0.0072],
         [0.0045],
         [0.0084],
         [0.0085],
         [0.0134],
    


Evaluating:  28%|█████████████████████████████████████████████████▍                                                                                                                                 | 276/1000 [00:13<00:33, 21.43it/s][A

tensor([[[0.0271],
         [0.0215],
         [0.0239],
         [0.0305],
         [0.0077],
         [0.0135],
         [0.0174],
         [0.0070],
         [0.0094],
         [0.0031],
         [0.0377],
         [0.0362]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0840],
         [0.0296],
         [0.0143],
         [0.0249],
         [0.0161],
         [0.0386],
         [0.0183],
         [0.0114],
         [0.0029],
         [0.0088],
         [0.0046],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0230],
         [0.0144],
         [0.0126],
         [0.0181],
         [0.0155],
         [0.0121],
         [0.0295],
         [0.0267],
         [0.0262],
         [0.0189],
         [0.0214],
         [0.0112]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0508],
         [0.0823],
         [0.0754],
         [0.0306],
         [0.0552],
         [0.0751],
         [0.0715],
         [0.0266],
         [0.0051],
    


Evaluating:  28%|█████████████████████████████████████████████████▉                                                                                                                                 | 279/1000 [00:13<00:33, 21.49it/s][A
Evaluating:  28%|██████████████████████████████████████████████████▍                                                                                                                                | 282/1000 [00:13<00:33, 21.58it/s][A


reg attention sum per layer
tensor([[[0.0540],
         [0.0249],
         [0.0265],
         [0.0260],
         [0.0150],
         [0.0078],
         [0.0082],
         [0.0021],
         [0.0013],
         [0.0032],
         [0.0320],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0063],
         [0.0169],
         [0.1559],
         [0.0387],
         [0.0130],
         [0.0042],
         [0.0046],
         [0.0007],
         [0.0013],
         [0.0011],
         [0.1477],
         [0.0821]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0576],
         [0.0810],
         [0.0675],
         [0.0278],
         [0.0997],
         [0.0174],
         [0.0298],
         [0.0152],
         [0.0068],
         [0.0127],
         [0.0634],
         [0.0355]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0088],
         [0.0325],
         [0.0178],
         [0.0146],
         [0.0042],
         [0.0072],
         [0.0


Evaluating:  28%|███████████████████████████████████████████████████                                                                                                                                | 285/1000 [00:13<00:33, 21.55it/s][A


reg attention sum per layer
tensor([[[0.0681],
         [0.0364],
         [0.0540],
         [0.0192],
         [0.0424],
         [0.0181],
         [0.0128],
         [0.0228],
         [0.0051],
         [0.0141],
         [0.0203],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0183],
         [0.0096],
         [0.0241],
         [0.0707],
         [0.0200],
         [0.0222],
         [0.0013],
         [0.0022],
         [0.0039],
         [0.1032],
         [0.0505]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0408],
         [0.0973],
         [0.0551],
         [0.0091],
         [0.0431],
         [0.0318],
         [0.0293],
         [0.0112],
         [0.0588],
         [0.0186],
         [0.0176],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0528],
         [0.0296],
         [0.0544],
         [0.0351],
         [0.0715],
         [0.0575],
         [0.0298],
         [0.0


Evaluating:  29%|███████████████████████████████████████████████████▌                                                                                                                               | 288/1000 [00:13<00:32, 21.58it/s][A
Evaluating:  29%|████████████████████████████████████████████████████                                                                                                                               | 291/1000 [00:13<00:32, 21.64it/s][A

reg attention sum per layer
tensor([[[0.0350],
         [0.0305],
         [0.0738],
         [0.0295],
         [0.0108],
         [0.0124],
         [0.0064],
         [0.0211],
         [0.0062],
         [0.0050],
         [0.0082],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0272],
         [0.0385],
         [0.0042],
         [0.0091],
         [0.0019],
         [0.0054],
         [0.0057],
         [0.0030],
         [0.0040],
         [0.0098],
         [0.0388]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0267],
         [0.0061],
         [0.0229],
         [0.0100],
         [0.0095],
         [0.0072],
         [0.0227],
         [0.0132],
         [0.0056],
         [0.0049],
         [0.0049],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0207],
         [0.0486],
         [0.0112],
         [0.0074],
         [0.0018],
         [0.0083],
         [0.00


Evaluating:  29%|████████████████████████████████████████████████████▋                                                                                                                              | 294/1000 [00:13<00:32, 21.60it/s][A

tensor([[[0.0308],
         [0.0332],
         [0.0266],
         [0.0156],
         [0.0082],
         [0.0121],
         [0.0163],
         [0.0093],
         [0.0053],
         [0.0047],
         [0.0103],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0612],
         [0.0491],
         [0.1076],
         [0.0188],
         [0.0380],
         [0.0268],
         [0.0423],
         [0.0205],
         [0.0167],
         [0.0155],
         [0.0473],
         [0.0534]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0814],
         [0.0416],
         [0.0206],
         [0.0119],
         [0.0233],
         [0.0279],
         [0.0253],
         [0.0083],
         [0.0177],
         [0.0100],
         [0.0086],
         [0.0179]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0146],
         [0.0071],
         [0.0334],
         [0.0114],
         [0.0033],
         [0.0065],
         [0.0045],
         [0.0032],
         [0.0012],
    


Evaluating:  30%|█████████████████████████████████████████████████████▏                                                                                                                             | 297/1000 [00:14<00:32, 21.43it/s][A

tensor([[[0.1406],
         [0.1695],
         [0.0507],
         [0.0321],
         [0.0441],
         [0.0194],
         [0.0229],
         [0.0148],
         [0.0054],
         [0.0156],
         [0.0328],
         [0.0201]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1136],
         [0.0557],
         [0.0444],
         [0.0247],
         [0.0905],
         [0.0097],
         [0.0740],
         [0.0117],
         [0.0263],
         [0.0567],
         [0.0149],
         [0.0801]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0277],
         [0.0284],
         [0.0194],
         [0.0171],
         [0.0056],
         [0.0130],
         [0.0214],
         [0.0074],
         [0.0061],
         [0.0146],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0364],
         [0.0289],
         [0.1235],
         [0.0154],
         [0.0276],
         [0.0298],
         [0.0438],
         [0.0197],
         [0.0036],
    


Evaluating:  30%|█████████████████████████████████████████████████████▋                                                                                                                             | 300/1000 [00:14<00:32, 21.53it/s][A
Evaluating:  30%|██████████████████████████████████████████████████████▏                                                                                                                            | 303/1000 [00:14<00:32, 21.56it/s][A

tensor([[[0.0481],
         [0.0453],
         [0.0220],
         [0.0237],
         [0.0253],
         [0.0121],
         [0.0384],
         [0.0214],
         [0.0103],
         [0.0082],
         [0.0407],
         [0.0445]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0256],
         [0.0207],
         [0.0220],
         [0.0082],
         [0.0060],
         [0.0032],
         [0.0119],
         [0.0009],
         [0.0037],
         [0.0040],
         [0.0283],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0477],
         [0.0636],
         [0.0960],
         [0.0343],
         [0.0874],
         [0.0304],
         [0.0264],
         [0.0220],
         [0.0132],
         [0.0108],
         [0.0770],
         [0.0580]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0575],
         [0.0805],
         [0.0253],
         [0.0275],
         [0.0049],
         [0.0164],
         [0.0070],
         [0.0028],
    


Evaluating:  31%|██████████████████████████████████████████████████████▊                                                                                                                            | 306/1000 [00:14<00:32, 21.54it/s][A

tensor([[[0.0426],
         [0.0133],
         [0.0161],
         [0.0087],
         [0.0127],
         [0.0099],
         [0.0212],
         [0.0059],
         [0.0018],
         [0.0080],
         [0.0399],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1946],
         [0.0322],
         [0.0243],
         [0.0353],
         [0.0249],
         [0.0204],
         [0.0645],
         [0.0323],
         [0.0096],
         [0.0491],
         [0.0079],
         [0.0242]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0443],
         [0.0517],
         [0.0225],
         [0.0230],
         [0.0287],
         [0.0253],
         [0.0222],
         [0.0047],
         [0.0049],
         [0.0037],
         [0.1329],
         [0.0293]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0581],
         [0.0206],
         [0.0317],
         [0.0130],
         [0.0319],
         [0.0081],
         [0.0356],
         [0.0089],
         [0.0071],
    


Evaluating:  31%|███████████████████████████████████████████████████████▎                                                                                                                           | 309/1000 [00:14<00:32, 21.48it/s][A

tensor([[[0.0178],
         [0.0204],
         [0.0447],
         [0.0086],
         [0.0104],
         [0.0044],
         [0.0094],
         [0.0039],
         [0.0053],
         [0.0105],
         [0.0271],
         [0.0277]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0161],
         [0.0589],
         [0.0151],
         [0.0237],
         [0.0094],
         [0.0137],
         [0.0089],
         [0.0080],
         [0.0032],
         [0.0701],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0444],
         [0.0295],
         [0.0642],
         [0.0135],
         [0.0154],
         [0.0125],
         [0.0224],
         [0.0087],
         [0.0276],
         [0.0089],
         [0.0180],
         [0.0609]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0445],
         [0.0404],
         [0.0414],
         [0.0156],
         [0.1018],
         [0.0375],
         [0.0335],
         [0.0106],
         [0.0340],
    


Evaluating:  31%|███████████████████████████████████████████████████████▊                                                                                                                           | 312/1000 [00:14<00:31, 21.52it/s][A
Evaluating:  32%|████████████████████████████████████████████████████████▍                                                                                                                          | 315/1000 [00:14<00:31, 21.47it/s][A


reg attention sum per layer
tensor([[[0.0559],
         [0.0526],
         [0.0131],
         [0.0078],
         [0.0092],
         [0.0282],
         [0.0776],
         [0.0062],
         [0.0088],
         [0.0083],
         [0.0121],
         [0.0232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0027],
         [0.0167],
         [0.0093],
         [0.0020],
         [0.0015],
         [0.0016],
         [0.0008],
         [0.0004],
         [0.0002],
         [0.0002],
         [0.0160],
         [0.0051]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0234],
         [0.0080],
         [0.0382],
         [0.0077],
         [0.0164],
         [0.0069],
         [0.0196],
         [0.0156],
         [0.0073],
         [0.0063],
         [0.0062],
         [0.0126]]], device='cuda:0')
reg attention sum per layer



Evaluating:  32%|████████████████████████████████████████████████████████▉                                                                                                                          | 318/1000 [00:15<00:31, 21.38it/s][A

tensor([[[0.0039],
         [0.0351],
         [0.1653],
         [0.0271],
         [0.0066],
         [0.0038],
         [0.0138],
         [0.0125],
         [0.0054],
         [0.0042],
         [0.1262],
         [0.0664]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0752],
         [0.0649],
         [0.0409],
         [0.0224],
         [0.0116],
         [0.0108],
         [0.0249],
         [0.0564],
         [0.0099],
         [0.0087],
         [0.0135],
         [0.0375]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0334],
         [0.0164],
         [0.0477],
         [0.0257],
         [0.0159],
         [0.0109],
         [0.0242],
         [0.0061],
         [0.0065],
         [0.0078],
         [0.0201],
         [0.0167]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0134],
         [0.0119],
         [0.0305],
         [0.0057],
         [0.0406],
         [0.0185],
         [0.0102],
         [0.0132],
         [0.0037],
    


Evaluating:  32%|█████████████████████████████████████████████████████████▍                                                                                                                         | 321/1000 [00:15<00:31, 21.32it/s][A

tensor([[[0.0522],
         [0.0515],
         [0.0184],
         [0.0128],
         [0.0177],
         [0.0048],
         [0.0410],
         [0.0093],
         [0.0037],
         [0.0092],
         [0.0124],
         [0.0314]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0452],
         [0.0484],
         [0.0558],
         [0.0147],
         [0.0133],
         [0.0115],
         [0.0079],
         [0.0029],
         [0.0012],
         [0.0033],
         [0.0955],
         [0.0689]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2001],
         [0.0159],
         [0.0572],
         [0.0096],
         [0.0064],
         [0.0169],
         [0.0217],
         [0.0174],
         [0.0009],
         [0.0038],
         [0.0049],
         [0.0272]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0726],
         [0.0678],
         [0.0623],
         [0.0175],
         [0.0322],
         [0.0063],
         [0.0207],
         [0.0097],
         [0.0102],
    


Evaluating:  32%|█████████████████████████████████████████████████████████▉                                                                                                                         | 324/1000 [00:15<00:31, 21.37it/s][A
Evaluating:  33%|██████████████████████████████████████████████████████████▌                                                                                                                        | 327/1000 [00:15<00:31, 21.54it/s][A

tensor([[[0.0617],
         [0.0292],
         [0.0160],
         [0.0125],
         [0.0054],
         [0.0328],
         [0.0276],
         [0.0062],
         [0.0033],
         [0.0306],
         [0.0233],
         [0.0086]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0503],
         [0.0778],
         [0.0152],
         [0.0152],
         [0.0119],
         [0.0095],
         [0.0218],
         [0.0254],
         [0.0059],
         [0.0093],
         [0.0157],
         [0.0417]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0287],
         [0.1165],
         [0.0370],
         [0.1127],
         [0.0100],
         [0.0335],
         [0.0156],
         [0.0284],
         [0.0277],
         [0.1042],
         [0.0605]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0195],
         [0.0380],
         [0.0195],
         [0.0308],
         [0.0195],
         [0.0092],
         [0.0225],
         [0.0028],
    


Evaluating:  33%|███████████████████████████████████████████████████████████                                                                                                                        | 330/1000 [00:15<00:31, 21.53it/s][A


reg attention sum per layer
tensor([[[0.0495],
         [0.0242],
         [0.0111],
         [0.0090],
         [0.0069],
         [0.0165],
         [0.0137],
         [0.0136],
         [0.0269],
         [0.0117],
         [0.0026],
         [0.0084]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0516],
         [0.0113],
         [0.0218],
         [0.0221],
         [0.0088],
         [0.0092],
         [0.0103],
         [0.0063],
         [0.0016],
         [0.0046],
         [0.0135],
         [0.0434]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0310],
         [0.0828],
         [0.0119],
         [0.0145],
         [0.0032],
         [0.0105],
         [0.0144],
         [0.0065],
         [0.0050],
         [0.0155],
         [0.0278]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0470],
         [0.0733],
         [0.0104],
         [0.0073],
         [0.0226],
         [0.0035],
         [0.0600],
         [0.0


Evaluating:  33%|███████████████████████████████████████████████████████████▌                                                                                                                       | 333/1000 [00:15<00:31, 21.47it/s][A
Evaluating:  34%|████████████████████████████████████████████████████████████▏                                                                                                                      | 336/1000 [00:15<00:30, 21.47it/s][A

reg attention sum per layer
tensor([[[0.0567],
         [0.0580],
         [0.0271],
         [0.0198],
         [0.0323],
         [0.0137],
         [0.0372],
         [0.0138],
         [0.0449],
         [0.0127],
         [0.0133],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0430],
         [0.0289],
         [0.0108],
         [0.0135],
         [0.0204],
         [0.0125],
         [0.0283],
         [0.0029],
         [0.0114],
         [0.0097],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0395],
         [0.0097],
         [0.0100],
         [0.0068],
         [0.0336],
         [0.0296],
         [0.0166],
         [0.0145],
         [0.0137],
         [0.0123],
         [0.0062],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0446],
         [0.0603],
         [0.0511],
         [0.0132],
         [0.0174],
         [0.0127],
         [0.0283],
         [0.02


Evaluating:  34%|████████████████████████████████████████████████████████████▋                                                                                                                      | 339/1000 [00:16<00:30, 21.34it/s][A

tensor([[[0.0244],
         [0.0255],
         [0.0497],
         [0.0187],
         [0.0282],
         [0.0160],
         [0.0152],
         [0.0129],
         [0.0036],
         [0.0054],
         [0.0369],
         [0.0618]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0346],
         [0.0177],
         [0.0262],
         [0.0169],
         [0.0225],
         [0.0171],
         [0.0180],
         [0.0038],
         [0.0045],
         [0.0133],
         [0.0316],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0612],
         [0.0701],
         [0.1075],
         [0.0376],
         [0.0825],
         [0.0452],
         [0.0188],
         [0.0626],
         [0.0106],
         [0.0107],
         [0.0707],
         [0.0553]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0340],
         [0.0479],
         [0.0084],
         [0.0085],
         [0.0034],
         [0.0100],
         [0.0071],
         [0.0038],
    


Evaluating:  34%|█████████████████████████████████████████████████████████████▏                                                                                                                     | 342/1000 [00:16<00:30, 21.25it/s][A
Evaluating:  34%|█████████████████████████████████████████████████████████████▊                                                                                                                     | 345/1000 [00:16<00:30, 21.45it/s][A

reg attention sum per layer
tensor([[[0.0260],
         [0.0332],
         [0.0357],
         [0.0123],
         [0.0146],
         [0.0098],
         [0.0168],
         [0.0298],
         [0.0070],
         [0.0119],
         [0.0085],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0986],
         [0.1041],
         [0.1818],
         [0.0208],
         [0.0729],
         [0.1143],
         [0.0984],
         [0.0278],
         [0.0366],
         [0.0461],
         [0.0799],
         [0.0440]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0349],
         [0.0124],
         [0.0361],
         [0.0160],
         [0.0257],
         [0.0194],
         [0.0443],
         [0.0101],
         [0.0201],
         [0.0089],
         [0.0224],
         [0.0389]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0313],
         [0.0403],
         [0.0406],
         [0.0126],
         [0.0273],
         [0.0094],
         [0.0219],
         [0.03


Evaluating:  35%|██████████████████████████████████████████████████████████████▎                                                                                                                    | 348/1000 [00:16<00:30, 21.42it/s][A

reg attention sum per layer
tensor([[[0.0345],
         [0.0244],
         [0.0474],
         [0.0078],
         [0.0184],
         [0.0071],
         [0.0134],
         [0.0127],
         [0.0072],
         [0.0049],
         [0.0062],
         [0.0421]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0109],
         [0.0210],
         [0.0238],
         [0.0047],
         [0.0121],
         [0.0056],
         [0.0170],
         [0.0093],
         [0.0075],
         [0.0072],
         [0.0123],
         [0.0083]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0218],
         [0.0168],
         [0.0147],
         [0.0088],
         [0.0340],
         [0.0282],
         [0.0025],
         [0.0023],
         [0.0056],
         [0.1125],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0578],
         [0.0320],
         [0.0490],
         [0.0198],
         [0.0225],
         [0.0034],
         [0.0320],
         [0.01


Evaluating:  35%|██████████████████████████████████████████████████████████████▊                                                                                                                    | 351/1000 [00:16<00:30, 21.35it/s][A

tensor([[[0.0431],
         [0.0197],
         [0.0390],
         [0.0180],
         [0.0103],
         [0.0294],
         [0.0082],
         [0.0079],
         [0.0014],
         [0.0034],
         [0.0379],
         [0.0286]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0410],
         [0.0207],
         [0.0079],
         [0.0039],
         [0.0055],
         [0.0036],
         [0.0105],
         [0.0186],
         [0.0039],
         [0.0031],
         [0.0039],
         [0.0032]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0820],
         [0.0989],
         [0.1191],
         [0.0438],
         [0.0512],
         [0.0186],
         [0.0206],
         [0.0103],
         [0.0051],
         [0.0140],
         [0.1035],
         [0.0931]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0162],
         [0.0273],
         [0.0261],
         [0.0097],
         [0.0078],
         [0.0202],
         [0.0051],
         [0.0023],
         [0.0009],
    


Evaluating:  35%|███████████████████████████████████████████████████████████████▎                                                                                                                   | 354/1000 [00:16<00:30, 21.44it/s][A
Evaluating:  36%|███████████████████████████████████████████████████████████████▉                                                                                                                   | 357/1000 [00:16<00:29, 21.54it/s][A

tensor([[[0.0528],
         [0.0294],
         [0.0297],
         [0.0182],
         [0.0229],
         [0.0101],
         [0.0720],
         [0.0903],
         [0.0169],
         [0.0183],
         [0.0085],
         [0.0227]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0520],
         [0.0254],
         [0.0109],
         [0.0046],
         [0.0190],
         [0.0110],
         [0.0097],
         [0.0117],
         [0.0179],
         [0.0159],
         [0.0036],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0456],
         [0.0450],
         [0.0199],
         [0.0096],
         [0.0148],
         [0.0082],
         [0.0103],
         [0.0163],
         [0.0052],
         [0.0036],
         [0.0227],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0148],
         [0.0378],
         [0.1329],
         [0.0089],
         [0.0331],
         [0.0064],
         [0.0050],
         [0.0100],
         [0.0037],
    


Evaluating:  36%|████████████████████████████████████████████████████████████████▍                                                                                                                  | 360/1000 [00:17<00:29, 21.48it/s][A


tensor([[[0.2614],
         [0.0247],
         [0.0133],
         [0.0215],
         [0.0786],
         [0.0221],
         [0.1767],
         [0.0072],
         [0.0949],
         [0.0968],
         [0.0033],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0281],
         [0.0240],
         [0.0290],
         [0.0166],
         [0.0078],
         [0.0100],
         [0.0354],
         [0.0034],
         [0.0063],
         [0.0090],
         [0.0307],
         [0.0214]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0643],
         [0.0538],
         [0.0796],
         [0.0157],
         [0.0202],
         [0.0251],
         [0.0082],
         [0.0197],
         [0.0061],
         [0.0035],
         [0.0456],
         [0.0768]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0291],
         [0.0505],
         [0.0694],
         [0.0201],
         [0.0052],
         [0.0103],
         [0.0131],
         [0.0135],
         [0.0031],
   


Evaluating:  36%|████████████████████████████████████████████████████████████████▉                                                                                                                  | 363/1000 [00:17<00:29, 21.39it/s][A

tensor([[[0.0097],
         [0.0106],
         [0.0350],
         [0.0082],
         [0.0225],
         [0.0106],
         [0.0338],
         [0.0090],
         [0.0346],
         [0.0061],
         [0.0253],
         [0.0764]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0173],
         [0.0386],
         [0.0113],
         [0.0131],
         [0.0059],
         [0.0064],
         [0.0078],
         [0.0119],
         [0.0032],
         [0.0155],
         [0.0111]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0627],
         [0.0388],
         [0.0694],
         [0.0195],
         [0.0126],
         [0.0113],
         [0.0280],
         [0.0247],
         [0.0056],
         [0.0083],
         [0.0496],
         [0.0264]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0082],
         [0.0258],
         [0.0114],
         [0.0286],
         [0.0179],
         [0.0075],
         [0.0118],
         [0.0065],
    


Evaluating:  37%|█████████████████████████████████████████████████████████████████▌                                                                                                                 | 366/1000 [00:17<00:29, 21.47it/s][A
Evaluating:  37%|██████████████████████████████████████████████████████████████████                                                                                                                 | 369/1000 [00:17<00:29, 21.42it/s][A

tensor([[[0.0179],
         [0.0491],
         [0.0632],
         [0.0133],
         [0.0127],
         [0.0070],
         [0.0075],
         [0.0087],
         [0.0021],
         [0.0048],
         [0.0209],
         [0.0307]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0118],
         [0.2189],
         [0.0296],
         [0.0848],
         [0.0085],
         [0.0159],
         [0.0104],
         [0.0112],
         [0.0110],
         [0.0337],
         [0.3536]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0709],
         [0.1126],
         [0.0368],
         [0.0218],
         [0.0151],
         [0.0087],
         [0.0174],
         [0.0116],
         [0.0034],
         [0.0159],
         [0.0331],
         [0.0290]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0890],
         [0.2067],
         [0.0261],
         [0.0485],
         [0.0050],
         [0.0223],
         [0.0075],
         [0.0191],
    


Evaluating:  37%|██████████████████████████████████████████████████████████████████▌                                                                                                                | 372/1000 [00:17<00:29, 21.44it/s][A

tensor([[[0.0217],
         [0.0470],
         [0.0765],
         [0.0078],
         [0.0458],
         [0.0083],
         [0.0128],
         [0.0141],
         [0.0033],
         [0.0077],
         [0.0762],
         [0.0418]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0436],
         [0.0225],
         [0.0239],
         [0.0096],
         [0.0292],
         [0.0141],
         [0.0190],
         [0.0060],
         [0.0072],
         [0.0194],
         [0.0295],
         [0.0135]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1108],
         [0.1550],
         [0.0470],
         [0.0180],
         [0.0153],
         [0.0146],
         [0.0693],
         [0.0264],
         [0.0616],
         [0.0258],
         [0.0050],
         [0.0570]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0707],
         [0.0259],
         [0.0721],
         [0.0220],
         [0.0330],
         [0.0224],
         [0.0158],
         [0.0233],
         [0.0349],
    


Evaluating:  38%|███████████████████████████████████████████████████████████████████▏                                                                                                               | 375/1000 [00:17<00:29, 21.32it/s][A

tensor([[[0.1739],
         [0.1424],
         [0.1441],
         [0.0499],
         [0.0533],
         [0.0138],
         [0.0525],
         [0.0770],
         [0.0283],
         [0.0356],
         [0.0235],
         [0.0661]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0128],
         [0.0071],
         [0.0045],
         [0.0037],
         [0.0035],
         [0.0040],
         [0.0009],
         [0.0082],
         [0.0013],
         [0.0037],
         [0.0047],
         [0.0124]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0197],
         [0.0267],
         [0.0623],
         [0.0123],
         [0.0142],
         [0.0127],
         [0.0106],
         [0.0070],
         [0.0019],
         [0.0026],
         [0.0199],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0072],
         [0.0116],
         [0.0311],
         [0.0057],
         [0.0357],
         [0.0272],
         [0.0043],
         [0.0042],
         [0.0018],
    


Evaluating:  38%|███████████████████████████████████████████████████████████████████▋                                                                                                               | 378/1000 [00:17<00:28, 21.46it/s][A
Evaluating:  38%|████████████████████████████████████████████████████████████████████▏                                                                                                              | 381/1000 [00:18<00:28, 21.56it/s][A

tensor([[[0.0104],
         [0.0180],
         [0.1040],
         [0.0146],
         [0.0384],
         [0.0128],
         [0.0240],
         [0.0064],
         [0.0036],
         [0.0091],
         [0.0473],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0251],
         [0.0274],
         [0.0453],
         [0.0114],
         [0.0129],
         [0.0092],
         [0.0142],
         [0.0075],
         [0.0018],
         [0.0106],
         [0.1046],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0081],
         [0.0070],
         [0.0370],
         [0.0029],
         [0.0171],
         [0.0189],
         [0.0030],
         [0.0122],
         [0.0040],
         [0.0037],
         [0.0140],
         [0.0184]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0170],
         [0.0150],
         [0.0043],
         [0.0081],
         [0.0120],
         [0.0033],
         [0.0107],
         [0.0042],
         [0.0020],
    


Evaluating:  38%|████████████████████████████████████████████████████████████████████▋                                                                                                              | 384/1000 [00:18<00:28, 21.45it/s][A

tensor([[[0.0247],
         [0.0299],
         [0.0439],
         [0.0245],
         [0.0249],
         [0.0192],
         [0.0340],
         [0.0145],
         [0.0301],
         [0.0096],
         [0.0275],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1504],
         [0.1277],
         [0.1143],
         [0.0205],
         [0.0096],
         [0.0074],
         [0.0152],
         [0.0137],
         [0.0034],
         [0.0102],
         [0.0136],
         [0.0623]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1030],
         [0.0156],
         [0.0794],
         [0.0195],
         [0.0089],
         [0.0116],
         [0.0323],
         [0.0145],
         [0.0078],
         [0.0099],
         [0.0054],
         [0.0540]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0153],
         [0.0202],
         [0.0147],
         [0.0167],
         [0.0021],
         [0.0225],
         [0.0048],
         [0.0076],
    


Evaluating:  39%|█████████████████████████████████████████████████████████████████████▎                                                                                                             | 387/1000 [00:18<00:28, 21.46it/s][A


reg attention sum per layer
tensor([[[0.0194],
         [0.0219],
         [0.0189],
         [0.0077],
         [0.0075],
         [0.0198],
         [0.0216],
         [0.0061],
         [0.0019],
         [0.0057],
         [0.0106],
         [0.0064]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0078],
         [0.0079],
         [0.0122],
         [0.0041],
         [0.0028],
         [0.0043],
         [0.0120],
         [0.0047],
         [0.0021],
         [0.0029],
         [0.0029],
         [0.0106]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0058],
         [0.0049],
         [0.0049],
         [0.0035],
         [0.0030],
         [0.0039],
         [0.0030],
         [0.0008],
         [0.0005],
         [0.0018],
         [0.0018],
         [0.0036]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0027],
         [0.0408],
         [0.0104],
         [0.0022],
         [0.0016],
         [0.0020],
         [0.0026],
         [0.0


Evaluating:  39%|█████████████████████████████████████████████████████████████████████▊                                                                                                             | 390/1000 [00:18<00:28, 21.51it/s][A
Evaluating:  39%|██████████████████████████████████████████████████████████████████████▎                                                                                                            | 393/1000 [00:18<00:28, 21.41it/s][A


reg attention sum per layer
tensor([[[0.0103],
         [0.0295],
         [0.0227],
         [0.0039],
         [0.0149],
         [0.0113],
         [0.0034],
         [0.0044],
         [0.0013],
         [0.0050],
         [0.0839],
         [0.0216]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0309],
         [0.2220],
         [0.0190],
         [0.0183],
         [0.0135],
         [0.0071],
         [0.0308],
         [0.0040],
         [0.0099],
         [0.1342],
         [0.1010]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0420],
         [0.3717],
         [0.0653],
         [0.0162],
         [0.0290],
         [0.0296],
         [0.0037],
         [0.0095],
         [0.0172],
         [0.2061],
         [0.2420]]], device='cuda:0')
reg attention sum per layer



Evaluating:  40%|██████████████████████████████████████████████████████████████████████▉                                                                                                            | 396/1000 [00:18<00:28, 21.39it/s][A

tensor([[[0.0711],
         [0.0313],
         [0.0296],
         [0.0192],
         [0.0291],
         [0.0088],
         [0.0601],
         [0.0397],
         [0.0133],
         [0.0208],
         [0.0154],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0953],
         [0.0298],
         [0.0368],
         [0.0267],
         [0.0217],
         [0.0254],
         [0.0276],
         [0.0503],
         [0.0235],
         [0.0251],
         [0.0074],
         [0.0697]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0823],
         [0.1720],
         [0.1156],
         [0.0298],
         [0.0472],
         [0.0491],
         [0.0477],
         [0.0197],
         [0.0342],
         [0.0510],
         [0.0303],
         [0.0714]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0413],
         [0.0287],
         [0.0349],
         [0.0238],
         [0.0169],
         [0.0277],
         [0.0233],
         [0.0025],
         [0.0047],
    


Evaluating:  40%|███████████████████████████████████████████████████████████████████████▍                                                                                                           | 399/1000 [00:18<00:28, 21.46it/s][A
Evaluating:  40%|███████████████████████████████████████████████████████████████████████▉                                                                                                           | 402/1000 [00:18<00:27, 21.52it/s][A

reg attention sum per layer
tensor([[[0.0305],
         [0.0304],
         [0.0964],
         [0.0245],
         [0.0532],
         [0.0100],
         [0.0334],
         [0.0115],
         [0.0478],
         [0.0118],
         [0.0173],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0834],
         [0.0564],
         [0.0207],
         [0.0133],
         [0.0397],
         [0.0158],
         [0.0060],
         [0.0026],
         [0.0040],
         [0.0714],
         [0.0492]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0050],
         [0.0100],
         [0.0298],
         [0.0033],
         [0.0145],
         [0.0138],
         [0.0038],
         [0.0032],
         [0.0047],
         [0.0028],
         [0.0040],
         [0.0106]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0273],
         [0.0203],
         [0.0338],
         [0.0115],
         [0.0033],
         [0.0078],
         [0.0330],
         [0.01


Evaluating:  40%|████████████████████████████████████████████████████████████████████████▍                                                                                                          | 405/1000 [00:19<00:27, 21.55it/s][A

reg attention sum per layer
tensor([[[0.0468],
         [0.0485],
         [0.0262],
         [0.0269],
         [0.0160],
         [0.0107],
         [0.0265],
         [0.0121],
         [0.0038],
         [0.0045],
         [0.0144],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0578],
         [0.0720],
         [0.0571],
         [0.0288],
         [0.0467],
         [0.0099],
         [0.0378],
         [0.0150],
         [0.0169],
         [0.0112],
         [0.0105],
         [0.0478]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0065],
         [0.0057],
         [0.0192],
         [0.0033],
         [0.0151],
         [0.0026],
         [0.0389],
         [0.0095],
         [0.0196],
         [0.0074],
         [0.0023],
         [0.0076]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0336],
         [0.0430],
         [0.0482],
         [0.0086],
         [0.0326],
         [0.0200],
         [0.0175],
         [0.03


Evaluating:  41%|█████████████████████████████████████████████████████████████████████████                                                                                                          | 408/1000 [00:19<00:27, 21.44it/s][A

tensor([[[0.0353],
         [0.0429],
         [0.1053],
         [0.0185],
         [0.0109],
         [0.0077],
         [0.0129],
         [0.0204],
         [0.0099],
         [0.0069],
         [0.0069],
         [0.0745]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0150],
         [0.0172],
         [0.0055],
         [0.0344],
         [0.0084],
         [0.0054],
         [0.0107],
         [0.0062],
         [0.0025],
         [0.0101],
         [0.0246]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0133],
         [0.0064],
         [0.0316],
         [0.0052],
         [0.0174],
         [0.0044],
         [0.0180],
         [0.0057],
         [0.0019],
         [0.0019],
         [0.0065],
         [0.0059]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0054],
         [0.0070],
         [0.0132],
         [0.0031],
         [0.0062],
         [0.0110],
         [0.0056],
         [0.0048],
         [0.0021],
    


Evaluating:  41%|█████████████████████████████████████████████████████████████████████████▌                                                                                                         | 411/1000 [00:19<00:27, 21.41it/s][A
Evaluating:  41%|██████████████████████████████████████████████████████████████████████████                                                                                                         | 414/1000 [00:19<00:27, 21.38it/s][A

tensor([[[0.1154],
         [0.1652],
         [0.0959],
         [0.0482],
         [0.1398],
         [0.1201],
         [0.0698],
         [0.0425],
         [0.0233],
         [0.0301],
         [0.3002],
         [0.0885]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0604],
         [0.0695],
         [0.0428],
         [0.0088],
         [0.0280],
         [0.0291],
         [0.0707],
         [0.0145],
         [0.0068],
         [0.0176],
         [0.0084],
         [0.0224]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0236],
         [0.0332],
         [0.0294],
         [0.0083],
         [0.0106],
         [0.0085],
         [0.0473],
         [0.0022],
         [0.0036],
         [0.0092],
         [0.0340],
         [0.0251]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0125],
         [0.0101],
         [0.0020],
         [0.0081],
         [0.0049],
         [0.0147],
         [0.0121],
         [0.0032],
    


Evaluating:  42%|██████████████████████████████████████████████████████████████████████████▋                                                                                                        | 417/1000 [00:19<00:27, 21.37it/s][A

tensor([[[0.0177],
         [0.0186],
         [0.0404],
         [0.0122],
         [0.0286],
         [0.0195],
         [0.0029],
         [0.0045],
         [0.0013],
         [0.0030],
         [0.0104],
         [0.0200]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0401],
         [0.0221],
         [0.0447],
         [0.0126],
         [0.0309],
         [0.0140],
         [0.0132],
         [0.0079],
         [0.0160],
         [0.0104],
         [0.0131],
         [0.0116]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0123],
         [0.0314],
         [0.0078],
         [0.0048],
         [0.0030],
         [0.0114],
         [0.0061],
         [0.0009],
         [0.0005],
         [0.0010],
         [0.0163],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0690],
         [0.0546],
         [0.0678],
         [0.0111],
         [0.0093],
         [0.0357],
         [0.0111],
         [0.0187],
         [0.0051],
    


Evaluating:  42%|███████████████████████████████████████████████████████████████████████████▏                                                                                                       | 420/1000 [00:19<00:27, 21.22it/s][A

tensor([[[0.0233],
         [0.0510],
         [0.0611],
         [0.0307],
         [0.0167],
         [0.0083],
         [0.0112],
         [0.0125],
         [0.0071],
         [0.0031],
         [0.0764],
         [0.0597]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0186],
         [0.0099],
         [0.0103],
         [0.0063],
         [0.0022],
         [0.0069],
         [0.0046],
         [0.0028],
         [0.0015],
         [0.0044],
         [0.0046],
         [0.0065]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0211],
         [0.0056],
         [0.0099],
         [0.0040],
         [0.0142],
         [0.0145],
         [0.0067],
         [0.0025],
         [0.0078],
         [0.0683],
         [0.0105]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0286],
         [0.0505],
         [0.0779],
         [0.0168],
         [0.0221],
         [0.0099],
         [0.0220],
         [0.0100],
         [0.0110],
    


Evaluating:  42%|███████████████████████████████████████████████████████████████████████████▋                                                                                                       | 423/1000 [00:19<00:27, 21.34it/s][A
Evaluating:  43%|████████████████████████████████████████████████████████████████████████████▎                                                                                                      | 426/1000 [00:20<00:26, 21.43it/s][A

tensor([[[0.0163],
         [0.0118],
         [0.0426],
         [0.0229],
         [0.0354],
         [0.0207],
         [0.0052],
         [0.0069],
         [0.0026],
         [0.0044],
         [0.0222],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0148],
         [0.0206],
         [0.0113],
         [0.0142],
         [0.0066],
         [0.0150],
         [0.0025],
         [0.0036],
         [0.0038],
         [0.0291],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0100],
         [0.0245],
         [0.0243],
         [0.0143],
         [0.0066],
         [0.0123],
         [0.0169],
         [0.0125],
         [0.0103],
         [0.0020],
         [0.0246],
         [0.0372]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0278],
         [0.0258],
         [0.1219],
         [0.0295],
         [0.1407],
         [0.0127],
         [0.0086],
         [0.0301],
         [0.0048],
    


Evaluating:  43%|████████████████████████████████████████████████████████████████████████████▊                                                                                                      | 429/1000 [00:20<00:26, 21.35it/s][A

tensor([[[0.0086],
         [0.0071],
         [0.0129],
         [0.0062],
         [0.0202],
         [0.0047],
         [0.0024],
         [0.0054],
         [0.0006],
         [0.0036],
         [0.0081],
         [0.0051]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0373],
         [0.0620],
         [0.0724],
         [0.0109],
         [0.0404],
         [0.0046],
         [0.0589],
         [0.0129],
         [0.0592],
         [0.0136],
         [0.0068],
         [0.0682]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1100],
         [0.0682],
         [0.0626],
         [0.0238],
         [0.0244],
         [0.0089],
         [0.0501],
         [0.0650],
         [0.0516],
         [0.0315],
         [0.0065],
         [0.0593]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1028],
         [0.0261],
         [0.1321],
         [0.0389],
         [0.0856],
         [0.0384],
         [0.0676],
         [0.0300],
         [0.0568],
    


Evaluating:  43%|█████████████████████████████████████████████████████████████████████████████▎                                                                                                     | 432/1000 [00:20<00:26, 21.44it/s][A
Evaluating:  44%|█████████████████████████████████████████████████████████████████████████████▊                                                                                                     | 435/1000 [00:20<00:26, 21.50it/s][A

reg attention sum per layer
tensor([[[0.0249],
         [0.0224],
         [0.1299],
         [0.0152],
         [0.0443],
         [0.0081],
         [0.0135],
         [0.0049],
         [0.0058],
         [0.0050],
         [0.0237],
         [0.1115]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.0121],
         [0.0388],
         [0.0109],
         [0.0081],
         [0.0134],
         [0.0355],
         [0.0028],
         [0.0025],
         [0.0067],
         [0.0068],
         [0.0071]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0038],
         [0.0228],
         [0.0224],
         [0.0089],
         [0.0062],
         [0.0068],
         [0.0035],
         [0.0033],
         [0.0015],
         [0.0040],
         [0.0066],
         [0.0078]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0258],
         [0.0312],
         [0.0211],
         [0.0161],
         [0.0163],
         [0.0083],
         [0.0048],
         [0.00


Evaluating:  44%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 438/1000 [00:20<00:26, 21.45it/s][A

tensor([[[0.0600],
         [0.0747],
         [0.0695],
         [0.0140],
         [0.0285],
         [0.0232],
         [0.0298],
         [0.0247],
         [0.0119],
         [0.0385],
         [0.0317],
         [0.0416]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0264],
         [0.0489],
         [0.0459],
         [0.0272],
         [0.0434],
         [0.0235],
         [0.0191],
         [0.0168],
         [0.0033],
         [0.0066],
         [0.0356],
         [0.0386]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1908],
         [0.1552],
         [0.0372],
         [0.0113],
         [0.0140],
         [0.0266],
         [0.0457],
         [0.0931],
         [0.0095],
         [0.0234],
         [0.0028],
         [0.0480]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0045],
         [0.0080],
         [0.0565],
         [0.0051],
         [0.0078],
         [0.0066],
         [0.0013],
         [0.0029],
         [0.0005],
    


Evaluating:  44%|██████████████████████████████████████████████████████████████████████████████▉                                                                                                    | 441/1000 [00:20<00:26, 21.37it/s][A

tensor([[[0.0033],
         [0.0065],
         [0.0041],
         [0.0053],
         [0.0009],
         [0.0033],
         [0.0010],
         [0.0022],
         [0.0006],
         [0.0003],
         [0.0036],
         [0.0039]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0191],
         [0.0244],
         [0.0262],
         [0.0227],
         [0.0092],
         [0.0136],
         [0.0102],
         [0.0031],
         [0.0018],
         [0.0039],
         [0.0492],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0170],
         [0.0113],
         [0.0320],
         [0.0076],
         [0.0057],
         [0.0051],
         [0.0047],
         [0.0417],
         [0.0074],
         [0.0035],
         [0.0037],
         [0.0443]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0241],
         [0.0380],
         [0.0852],
         [0.0188],
         [0.0347],
         [0.0172],
         [0.0065],
         [0.0039],
         [0.0054],
    


Evaluating:  44%|███████████████████████████████████████████████████████████████████████████████▍                                                                                                   | 444/1000 [00:20<00:26, 21.36it/s][A
Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████                                                                                                   | 447/1000 [00:21<00:25, 21.40it/s][A

tensor([[[0.0394],
         [0.0314],
         [0.0353],
         [0.0186],
         [0.0224],
         [0.0161],
         [0.0114],
         [0.0153],
         [0.0098],
         [0.0064],
         [0.0321],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0380],
         [0.0159],
         [0.0422],
         [0.0146],
         [0.0229],
         [0.0093],
         [0.0311],
         [0.0134],
         [0.0181],
         [0.0178],
         [0.0146],
         [0.0403]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0435],
         [0.0267],
         [0.0272],
         [0.0123],
         [0.0361],
         [0.0227],
         [0.0243],
         [0.0103],
         [0.0207],
         [0.0128],
         [0.0219],
         [0.0138]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.1206],
         [0.1048],
         [0.0159],
         [0.0281],
         [0.0108],
         [0.0246],
         [0.0134],
         [0.0560],
    


Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████▌                                                                                                  | 450/1000 [00:21<00:25, 21.33it/s][A

tensor([[[0.0223],
         [0.0315],
         [0.0528],
         [0.0229],
         [0.0459],
         [0.0071],
         [0.0301],
         [0.1020],
         [0.0070],
         [0.0090],
         [0.0334],
         [0.0455]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0037],
         [0.0023],
         [0.0046],
         [0.0037],
         [0.0046],
         [0.0028],
         [0.0039],
         [0.0026],
         [0.0020],
         [0.0024],
         [0.0019],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0524],
         [0.0327],
         [0.0361],
         [0.0235],
         [0.0496],
         [0.0181],
         [0.0126],
         [0.0258],
         [0.0030],
         [0.0059],
         [0.0290],
         [0.0447]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0020],
         [0.0117],
         [0.0128],
         [0.0016],
         [0.0114],
         [0.0055],
         [0.0036],
         [0.0011],
         [0.0005],
    


Evaluating:  45%|█████████████████████████████████████████████████████████████████████████████████                                                                                                  | 453/1000 [00:21<00:25, 21.28it/s][A

tensor([[[0.0426],
         [0.1059],
         [0.0303],
         [0.0266],
         [0.0400],
         [0.0053],
         [0.0154],
         [0.0217],
         [0.0049],
         [0.0076],
         [0.0164],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0617],
         [0.0173],
         [0.0240],
         [0.0140],
         [0.0258],
         [0.0118],
         [0.0218],
         [0.0330],
         [0.0063],
         [0.0110],
         [0.0038],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0352],
         [0.0280],
         [0.0285],
         [0.0113],
         [0.0135],
         [0.0058],
         [0.0169],
         [0.0068],
         [0.0088],
         [0.0053],
         [0.0172],
         [0.0497]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0039],
         [0.0310],
         [0.0542],
         [0.0090],
         [0.0073],
         [0.0107],
         [0.0049],
         [0.0013],
         [0.0007],
    


Evaluating:  46%|█████████████████████████████████████████████████████████████████████████████████▌                                                                                                 | 456/1000 [00:21<00:25, 21.34it/s][A
Evaluating:  46%|██████████████████████████████████████████████████████████████████████████████████▏                                                                                                | 459/1000 [00:21<00:25, 21.29it/s][A

tensor([[[0.0599],
         [0.0244],
         [0.0306],
         [0.0132],
         [0.0238],
         [0.0067],
         [0.0225],
         [0.0363],
         [0.0084],
         [0.0119],
         [0.0112],
         [0.0161]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0700],
         [0.0560],
         [0.0476],
         [0.0218],
         [0.0332],
         [0.0317],
         [0.0497],
         [0.0259],
         [0.0496],
         [0.0172],
         [0.0114],
         [0.0410]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0107],
         [0.0296],
         [0.0084],
         [0.0076],
         [0.0070],
         [0.0054],
         [0.0102],
         [0.0020],
         [0.0012],
         [0.0026],
         [0.0399],
         [0.0151]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0198],
         [0.0423],
         [0.0437],
         [0.0085],
         [0.0222],
         [0.0067],
         [0.0106],
         [0.0017],
         [0.0054],
    


Evaluating:  46%|██████████████████████████████████████████████████████████████████████████████████▋                                                                                                | 462/1000 [00:21<00:25, 21.35it/s][A

tensor([[[0.0174],
         [0.0200],
         [0.0236],
         [0.0049],
         [0.0183],
         [0.0217],
         [0.0109],
         [0.0213],
         [0.0033],
         [0.0048],
         [0.0070],
         [0.0093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0220],
         [0.0531],
         [0.0456],
         [0.0186],
         [0.0176],
         [0.0064],
         [0.0132],
         [0.0068],
         [0.0047],
         [0.0060],
         [0.0089],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1015],
         [0.0329],
         [0.0446],
         [0.0184],
         [0.0422],
         [0.0470],
         [0.0344],
         [0.0369],
         [0.0234],
         [0.0276],
         [0.0246],
         [0.0349]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0103],
         [0.0076],
         [0.0098],
         [0.0075],
         [0.0056],
         [0.0079],
         [0.0073],
         [0.0010],
         [0.0011],
    


Evaluating:  46%|███████████████████████████████████████████████████████████████████████████████████▏                                                                                               | 465/1000 [00:21<00:25, 21.35it/s][A

tensor([[[0.0386],
         [0.0881],
         [0.1480],
         [0.0174],
         [0.0480],
         [0.0272],
         [0.0150],
         [0.0134],
         [0.0046],
         [0.0228],
         [0.0551],
         [0.0510]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0790],
         [0.0544],
         [0.0163],
         [0.0179],
         [0.0048],
         [0.0220],
         [0.0331],
         [0.0099],
         [0.0059],
         [0.0198],
         [0.0135],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0226],
         [0.0335],
         [0.0484],
         [0.0110],
         [0.0125],
         [0.0084],
         [0.0264],
         [0.0312],
         [0.0073],
         [0.0084],
         [0.0154],
         [0.0229]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0421],
         [0.0599],
         [0.2500],
         [0.0094],
         [0.0645],
         [0.0128],
         [0.0291],
         [0.0090],
         [0.0440],
    


Evaluating:  47%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 468/1000 [00:22<00:24, 21.34it/s][A
Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▎                                                                                              | 471/1000 [00:22<00:24, 21.34it/s][A

tensor([[[0.0972],
         [0.1300],
         [0.0622],
         [0.0164],
         [0.0210],
         [0.0070],
         [0.0278],
         [0.0206],
         [0.0225],
         [0.0171],
         [0.0075],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0369],
         [0.0215],
         [0.0308],
         [0.0113],
         [0.0139],
         [0.0140],
         [0.0101],
         [0.0072],
         [0.0102],
         [0.0045],
         [0.0172],
         [0.0399]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0556],
         [0.0414],
         [0.0498],
         [0.0171],
         [0.0168],
         [0.0041],
         [0.0210],
         [0.0137],
         [0.0088],
         [0.0109],
         [0.0057],
         [0.0424]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0483],
         [0.0491],
         [0.1194],
         [0.0112],
         [0.0443],
         [0.0221],
         [0.0495],
         [0.0425],
         [0.0216],
    


Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▊                                                                                              | 474/1000 [00:22<00:24, 21.20it/s][A

tensor([[[0.0757],
         [0.0324],
         [0.0269],
         [0.0184],
         [0.0323],
         [0.0263],
         [0.0288],
         [0.0267],
         [0.0375],
         [0.0171],
         [0.0092],
         [0.0086]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0462],
         [0.0526],
         [0.0320],
         [0.0136],
         [0.0212],
         [0.0066],
         [0.0143],
         [0.0146],
         [0.0066],
         [0.0108],
         [0.0252],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0039],
         [0.0115],
         [0.0473],
         [0.0071],
         [0.0254],
         [0.0067],
         [0.0020],
         [0.0048],
         [0.0003],
         [0.0006],
         [0.0406],
         [0.0508]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1604],
         [0.1611],
         [0.0274],
         [0.0117],
         [0.0089],
         [0.0083],
         [0.0329],
         [0.0076],
         [0.0043],
    


Evaluating:  48%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 477/1000 [00:22<00:24, 21.14it/s][A

tensor([[[0.0134],
         [0.0439],
         [0.0910],
         [0.0200],
         [0.0608],
         [0.0203],
         [0.0096],
         [0.0374],
         [0.0077],
         [0.0063],
         [0.0148],
         [0.0720]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0278],
         [0.0865],
         [0.0431],
         [0.0217],
         [0.0213],
         [0.0070],
         [0.0086],
         [0.0085],
         [0.0053],
         [0.0095],
         [0.0403],
         [0.0507]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0521],
         [0.0877],
         [0.1322],
         [0.0235],
         [0.0629],
         [0.1047],
         [0.0227],
         [0.0981],
         [0.0101],
         [0.0166],
         [0.0431],
         [0.0343]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0835],
         [0.0230],
         [0.0402],
         [0.0349],
         [0.0385],
         [0.0136],
         [0.0058],
         [0.0302],
         [0.0046],
    


Evaluating:  48%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                             | 480/1000 [00:22<00:24, 21.33it/s][A
Evaluating:  48%|██████████████████████████████████████████████████████████████████████████████████████▍                                                                                            | 483/1000 [00:22<00:24, 21.42it/s][A

tensor([[[0.0078],
         [0.0394],
         [0.0678],
         [0.0120],
         [0.0132],
         [0.0091],
         [0.0121],
         [0.0031],
         [0.0047],
         [0.0016],
         [0.0314],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0107],
         [0.0324],
         [0.0098],
         [0.0064],
         [0.0037],
         [0.0044],
         [0.0048],
         [0.0014],
         [0.0004],
         [0.0004],
         [0.0290],
         [0.0068]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0617],
         [0.0338],
         [0.0212],
         [0.0112],
         [0.0073],
         [0.0045],
         [0.0098],
         [0.0027],
         [0.0015],
         [0.0052],
         [0.0274],
         [0.0151]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0455],
         [0.0454],
         [0.0751],
         [0.0151],
         [0.0130],
         [0.0129],
         [0.0128],
         [0.0185],
         [0.0046],
    


Evaluating:  49%|██████████████████████████████████████████████████████████████████████████████████████▉                                                                                            | 486/1000 [00:22<00:24, 21.17it/s][A

tensor([[[0.0290],
         [0.1207],
         [0.2187],
         [0.0307],
         [0.0553],
         [0.0091],
         [0.0161],
         [0.0130],
         [0.0029],
         [0.0044],
         [0.1626],
         [0.1009]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0511],
         [0.0735],
         [0.3865],
         [0.0350],
         [0.0753],
         [0.0237],
         [0.0677],
         [0.0095],
         [0.0339],
         [0.0191],
         [0.0617],
         [0.2554]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0182],
         [0.0558],
         [0.0078],
         [0.0146],
         [0.0056],
         [0.0062],
         [0.0078],
         [0.0026],
         [0.0016],
         [0.0165],
         [0.0272]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0354],
         [0.0958],
         [0.0304],
         [0.0125],
         [0.0139],
         [0.0167],
         [0.0222],
         [0.0296],
         [0.0089],
    


Evaluating:  49%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 489/1000 [00:23<00:24, 21.17it/s][A


tensor([[[0.0502],
         [0.0775],
         [0.0043],
         [0.0061],
         [0.0042],
         [0.0153],
         [0.0134],
         [0.0008],
         [0.0017],
         [0.0087],
         [0.0041],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0184],
         [0.0045],
         [0.0195],
         [0.0124],
         [0.0168],
         [0.0095],
         [0.0174],
         [0.0211],
         [0.0103],
         [0.0079],
         [0.0018],
         [0.0115]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0289],
         [0.0397],
         [0.0957],
         [0.0602],
         [0.0359],
         [0.0094],
         [0.0035],
         [0.0308],
         [0.0038],
         [0.0054],
         [0.0483],
         [0.1014]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0140],
         [0.0326],
         [0.0419],
         [0.0083],
         [0.0170],
         [0.0105],
         [0.0117],
         [0.0016],
         [0.0062],
    

Evaluating:  49%|████████████████████████████████████████████████████████████████████████████████████████                                                                                           | 492/1000 [00:23<00:23, 21.26it/s][A
Evaluating:  50%|████████████████████████████████████████████████████████████████████████████████████████▌                                                                                          | 495/1000 [00:23<00:23, 21.33it/s][A

reg attention sum per layer
tensor([[[0.0104],
         [0.0155],
         [0.0133],
         [0.0170],
         [0.0142],
         [0.0125],
         [0.0047],
         [0.0221],
         [0.0022],
         [0.0053],
         [0.0105],
         [0.0060]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0186],
         [0.0104],
         [0.0050],
         [0.0277],
         [0.0065],
         [0.0060],
         [0.0087],
         [0.0031],
         [0.0045],
         [0.0086],
         [0.0160]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0522],
         [0.0661],
         [0.1286],
         [0.0372],
         [0.0114],
         [0.0434],
         [0.0150],
         [0.0205],
         [0.0124],
         [0.0080],
         [0.0305],
         [0.0639]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1649],
         [0.0556],
         [0.1836],
         [0.0380],
         [0.1200],
         [0.3281],
         [0.0634],
         [0.05


Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 498/1000 [00:23<00:23, 21.37it/s][A

tensor([[[0.0088],
         [0.0140],
         [0.0157],
         [0.0089],
         [0.0048],
         [0.0122],
         [0.0288],
         [0.0050],
         [0.0047],
         [0.0021],
         [0.0056],
         [0.0099]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0145],
         [0.0228],
         [0.0108],
         [0.0158],
         [0.0023],
         [0.0027],
         [0.0033],
         [0.0023],
         [0.0056],
         [0.0034],
         [0.0130]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0231],
         [0.0358],
         [0.0490],
         [0.0131],
         [0.0194],
         [0.0126],
         [0.0128],
         [0.0079],
         [0.0024],
         [0.0029],
         [0.0263],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0327],
         [0.0100],
         [0.0172],
         [0.0072],
         [0.0322],
         [0.0071],
         [0.0123],
         [0.0110],
         [0.0215],
    


Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                                         | 501/1000 [00:23<00:23, 21.36it/s][A
Evaluating:  50%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 504/1000 [00:23<00:23, 21.44it/s][A

tensor([[[0.0173],
         [0.0223],
         [0.1239],
         [0.0113],
         [0.0169],
         [0.0061],
         [0.0040],
         [0.0060],
         [0.0027],
         [0.0027],
         [0.0215],
         [0.0508]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0623],
         [0.0840],
         [0.0544],
         [0.0297],
         [0.0329],
         [0.0152],
         [0.0335],
         [0.0206],
         [0.0123],
         [0.0148],
         [0.0271],
         [0.0368]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0374],
         [0.0773],
         [0.0157],
         [0.0218],
         [0.0426],
         [0.0222],
         [0.0234],
         [0.0121],
         [0.0130],
         [0.0078],
         [0.0558]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0164],
         [0.1071],
         [0.0324],
         [0.0059],
         [0.0052],
         [0.0070],
         [0.0146],
         [0.0030],
         [0.0028],
    


Evaluating:  51%|██████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 507/1000 [00:23<00:23, 21.32it/s][A

tensor([[[0.0202],
         [0.0282],
         [0.1135],
         [0.0411],
         [0.0214],
         [0.0141],
         [0.0226],
         [0.0260],
         [0.0082],
         [0.0072],
         [0.0365],
         [0.0504]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0216],
         [0.0574],
         [0.0413],
         [0.0135],
         [0.0365],
         [0.0106],
         [0.0073],
         [0.0564],
         [0.0033],
         [0.0051],
         [0.0401],
         [0.0437]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0057],
         [0.0156],
         [0.0992],
         [0.0075],
         [0.0135],
         [0.0052],
         [0.0037],
         [0.0078],
         [0.0017],
         [0.0037],
         [0.0300],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0532],
         [0.0350],
         [0.0223],
         [0.0227],
         [0.0118],
         [0.0148],
         [0.0111],
         [0.0277],
         [0.0065],
    


Evaluating:  51%|███████████████████████████████████████████████████████████████████████████████████████████▎                                                                                       | 510/1000 [00:24<00:23, 21.27it/s][A

tensor([[[0.0125],
         [0.0124],
         [0.0266],
         [0.0127],
         [0.0140],
         [0.0046],
         [0.0031],
         [0.0142],
         [0.0012],
         [0.0032],
         [0.0086],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0408],
         [0.0663],
         [0.1333],
         [0.0199],
         [0.0445],
         [0.0199],
         [0.0233],
         [0.0266],
         [0.0133],
         [0.0186],
         [0.0481],
         [0.0658]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0142],
         [0.0144],
         [0.0272],
         [0.0071],
         [0.0081],
         [0.0063],
         [0.0131],
         [0.0040],
         [0.0028],
         [0.0053],
         [0.0298],
         [0.0138]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0330],
         [0.0182],
         [0.0190],
         [0.0103],
         [0.0306],
         [0.0137],
         [0.0729],
         [0.0590],
         [0.0134],
    


Evaluating:  51%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                       | 513/1000 [00:24<00:22, 21.34it/s][A
Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                      | 516/1000 [00:24<00:22, 21.38it/s][A

tensor([[[0.0380],
         [0.0703],
         [0.0632],
         [0.0217],
         [0.0175],
         [0.0070],
         [0.0092],
         [0.0056],
         [0.0014],
         [0.0031],
         [0.1132],
         [0.0656]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0381],
         [0.1406],
         [0.0230],
         [0.0744],
         [0.0090],
         [0.0149],
         [0.0077],
         [0.0204],
         [0.0044],
         [0.0476],
         [0.0945]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0504],
         [0.0194],
         [0.0383],
         [0.0147],
         [0.0178],
         [0.0683],
         [0.0233],
         [0.0129],
         [0.0066],
         [0.0061],
         [0.0229],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0054],
         [0.1233],
         [0.0133],
         [0.0075],
         [0.0049],
         [0.0047],
         [0.0090],
         [0.0013],
         [0.0023],
    


Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                      | 519/1000 [00:24<00:22, 21.32it/s][A

tensor([[[0.0345],
         [0.0442],
         [0.0411],
         [0.0140],
         [0.0322],
         [0.0200],
         [0.0345],
         [0.0137],
         [0.0096],
         [0.0122],
         [0.0294],
         [0.0221]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0059],
         [0.0087],
         [0.0109],
         [0.0157],
         [0.0071],
         [0.0153],
         [0.0140],
         [0.0017],
         [0.0031],
         [0.0031],
         [0.0159],
         [0.0167]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0483],
         [0.0202],
         [0.0237],
         [0.0242],
         [0.0155],
         [0.0196],
         [0.0503],
         [0.0131],
         [0.0058],
         [0.0131],
         [0.0351],
         [0.01


Evaluating:  52%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 522/1000 [00:24<00:22, 21.42it/s][A
Evaluating:  52%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 525/1000 [00:24<00:22, 21.48it/s][A

reg attention sum per layer
tensor([[[0.0388],
         [0.0180],
         [0.1271],
         [0.0102],
         [0.0410],
         [0.0097],
         [0.0243],
         [0.0043],
         [0.0144],
         [0.0166],
         [0.0079],
         [0.0569]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0417],
         [0.0625],
         [0.0422],
         [0.0159],
         [0.0091],
         [0.0128],
         [0.0200],
         [0.0126],
         [0.0061],
         [0.0059],
         [0.0071],
         [0.0413]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0429],
         [0.1175],
         [0.0745],
         [0.0197],
         [0.0444],
         [0.0502],
         [0.0352],
         [0.0048],
         [0.0017],
         [0.0079],
         [0.3425],
         [0.0804]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0771],
         [0.0129],
         [0.0262],
         [0.0092],
         [0.0205],
         [0.0089],
         [0.0247],
         [0.03


Evaluating:  53%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                    | 528/1000 [00:24<00:21, 21.53it/s][A

tensor([[[0.0484],
         [0.0270],
         [0.0278],
         [0.0141],
         [0.0203],
         [0.0095],
         [0.0648],
         [0.0212],
         [0.0434],
         [0.0195],
         [0.0032],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0408],
         [0.0278],
         [0.0446],
         [0.0116],
         [0.0471],
         [0.0145],
         [0.0188],
         [0.0041],
         [0.0093],
         [0.0129],
         [0.0181],
         [0.0185]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0068],
         [0.0034],
         [0.0168],
         [0.0019],
         [0.0078],
         [0.0056],
         [0.0039],
         [0.0021],
         [0.0075],
         [0.0028],
         [0.0054],
         [0.0060]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0521],
         [0.0055],
         [0.0039],
         [0.0012],
         [0.0066],
         [0.0270],
         [0.0008],
         [0.0010],
    


Evaluating:  53%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                    | 531/1000 [00:25<00:21, 21.38it/s][A

tensor([[[0.0155],
         [0.0245],
         [0.0679],
         [0.0115],
         [0.0112],
         [0.0053],
         [0.0150],
         [0.0038],
         [0.0061],
         [0.0029],
         [0.0313],
         [0.0363]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0450],
         [0.0617],
         [0.0395],
         [0.0371],
         [0.0899],
         [0.0244],
         [0.0316],
         [0.0188],
         [0.0191],
         [0.0199],
         [0.0226],
         [0.0349]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0199],
         [0.0701],
         [0.0690],
         [0.0138],
         [0.0615],
         [0.0189],
         [0.0271],
         [0.0113],
         [0.0233],
         [0.0113],
         [0.0253],
         [0.0632]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0356],
         [0.0165],
         [0.0161],
         [0.0134],
         [0.0056],
         [0.0058],
         [0.0308],
         [0.0063],
         [0.0050],
    


Evaluating:  53%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 534/1000 [00:25<00:21, 21.41it/s][A
Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 537/1000 [00:25<00:21, 21.52it/s][A

tensor([[[0.0095],
         [0.0490],
         [0.0685],
         [0.0109],
         [0.0377],
         [0.0292],
         [0.0178],
         [0.0117],
         [0.0036],
         [0.0041],
         [0.0876],
         [0.0312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0617],
         [0.0465],
         [0.1529],
         [0.0105],
         [0.0233],
         [0.0448],
         [0.0350],
         [0.0020],
         [0.0016],
         [0.0121],
         [0.0295],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0712],
         [0.0458],
         [0.0774],
         [0.0219],
         [0.0134],
         [0.0154],
         [0.0247],
         [0.0065],
         [0.0023],
         [0.0074],
         [0.0244],
         [0.0944]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0328],
         [0.0364],
         [0.0531],
         [0.0346],
         [0.0298],
         [0.0211],
         [0.0138],
         [0.0035],
         [0.0043],
    


Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                  | 540/1000 [00:25<00:21, 21.42it/s][A

tensor([[[0.0131],
         [0.0198],
         [0.0349],
         [0.0032],
         [0.0205],
         [0.0080],
         [0.0081],
         [0.0011],
         [0.0031],
         [0.0037],
         [0.0397],
         [0.0116]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0103],
         [0.0248],
         [0.0050],
         [0.0061],
         [0.0057],
         [0.0019],
         [0.0022],
         [0.0017],
         [0.0005],
         [0.0016],
         [0.0094],
         [0.0265]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0226],
         [0.0461],
         [0.0812],
         [0.0152],
         [0.0337],
         [0.0365],
         [0.0186],
         [0.0091],
         [0.0141],
         [0.0119],
         [0.0515],
         [0.0511]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0448],
         [0.0354],
         [0.0764],
         [0.0468],
         [0.0648],
         [0.0197],
         [0.1461],
         [0.0166],
         [0.0085],
    


Evaluating:  54%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                 | 543/1000 [00:25<00:21, 21.30it/s][A

tensor([[[0.0458],
         [0.0813],
         [0.0342],
         [0.0177],
         [0.0246],
         [0.0077],
         [0.0242],
         [0.0122],
         [0.0130],
         [0.0202],
         [0.0167],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0486],
         [0.0382],
         [0.0571],
         [0.0160],
         [0.0292],
         [0.0184],
         [0.0164],
         [0.0150],
         [0.0089],
         [0.0059],
         [0.0418],
         [0.0367]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0831],
         [0.0193],
         [0.0527],
         [0.0099],
         [0.0430],
         [0.0317],
         [0.0241],
         [0.0032],
         [0.0093],
         [0.0140],
         [0.0378],
         [0.0400]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0121],
         [0.0576],
         [0.0082],
         [0.0138],
         [0.0194],
         [0.0052],
         [0.0006],
         [0.0031],
    


Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 546/1000 [00:25<00:21, 21.31it/s][A
Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                | 549/1000 [00:25<00:21, 21.41it/s][A

tensor([[[0.0031],
         [0.0095],
         [0.0186],
         [0.0077],
         [0.0078],
         [0.0187],
         [0.0147],
         [0.0045],
         [0.0077],
         [0.0028],
         [0.0122],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0120],
         [0.0351],
         [0.0621],
         [0.0174],
         [0.0283],
         [0.0399],
         [0.0236],
         [0.0101],
         [0.0127],
         [0.0072],
         [0.0742],
         [0.0506]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0272],
         [0.0105],
         [0.0330],
         [0.0125],
         [0.0065],
         [0.0123],
         [0.0012],
         [0.0093],
         [0.0007],
         [0.0019],
         [0.0206],
         [0.0094]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0264],
         [0.0179],
         [0.0392],
         [0.0081],
         [0.0113],
         [0.0144],
         [0.0116],
         [0.0037],
         [0.0015],
    


Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 552/1000 [00:26<00:21, 21.25it/s][A

tensor([[[0.0341],
         [0.0670],
         [0.1176],
         [0.0254],
         [0.0245],
         [0.0232],
         [0.0130],
         [0.0209],
         [0.0019],
         [0.0063],
         [0.1940],
         [0.0578]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1320],
         [0.0754],
         [0.0320],
         [0.0434],
         [0.0131],
         [0.0082],
         [0.0305],
         [0.0326],
         [0.0429],
         [0.0164],
         [0.0068],
         [0.0541]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0061],
         [0.0095],
         [0.0180],
         [0.0030],
         [0.0168],
         [0.0157],
         [0.0084],
         [0.0016],
         [0.0021],
         [0.0028],
         [0.0252],
         [0.0048]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0602],
         [0.1174],
         [0.0413],
         [0.0232],
         [0.0201],
         [0.0346],
         [0.0325],
         [0.0487],
         [0.0134],
    


Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 555/1000 [00:26<00:21, 21.19it/s][A

tensor([[[0.0146],
         [0.0325],
         [0.1201],
         [0.0149],
         [0.0724],
         [0.0240],
         [0.0162],
         [0.0158],
         [0.0047],
         [0.0135],
         [0.0291],
         [0.0407]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0668],
         [0.0937],
         [0.0816],
         [0.0248],
         [0.0213],
         [0.0254],
         [0.0590],
         [0.0229],
         [0.0057],
         [0.0164],
         [0.0224],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0175],
         [0.0089],
         [0.0111],
         [0.0030],
         [0.0163],
         [0.0246],
         [0.0053],
         [0.0186],
         [0.0029],
         [0.0040],
         [0.0165],
         [0.0077]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0597],
         [0.0094],
         [0.0122],
         [0.0103],
         [0.0085],
         [0.0038],
         [0.0110],
         [0.0171],
         [0.0032],
    


Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 558/1000 [00:26<00:20, 21.37it/s][A
Evaluating:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 561/1000 [00:26<00:20, 21.40it/s][A

tensor([[[0.0260],
         [0.0591],
         [0.0299],
         [0.0146],
         [0.0290],
         [0.1297],
         [0.0131],
         [0.0175],
         [0.0008],
         [0.0128],
         [0.1197],
         [0.0420]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0478],
         [0.1162],
         [0.0308],
         [0.0392],
         [0.0053],
         [0.0053],
         [0.0138],
         [0.0017],
         [0.0031],
         [0.0747],
         [0.1291]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0076],
         [0.0709],
         [0.0399],
         [0.0154],
         [0.0112],
         [0.0155],
         [0.0058],
         [0.0037],
         [0.0028],
         [0.0242],
         [0.0426]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0415],
         [0.0133],
         [0.0105],
         [0.0073],
         [0.0124],
         [0.0045],
         [0.0254],
         [0.0275],
         [0.0348],
    


Evaluating:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                              | 564/1000 [00:26<00:20, 21.43it/s][A

tensor([[[0.0359],
         [0.0496],
         [0.0183],
         [0.0260],
         [0.0118],
         [0.0234],
         [0.0225],
         [0.0106],
         [0.0021],
         [0.0044],
         [0.0366],
         [0.0373]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1336],
         [0.0393],
         [0.0422],
         [0.0208],
         [0.0077],
         [0.0052],
         [0.0293],
         [0.0060],
         [0.0087],
         [0.0073],
         [0.0078],
         [0.0233]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0359],
         [0.0209],
         [0.1140],
         [0.0190],
         [0.0438],
         [0.0865],
         [0.0093],
         [0.0066],
         [0.0144],
         [0.0135],
         [0.0344],
         [0.05


Evaluating:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                             | 567/1000 [00:26<00:20, 21.42it/s][A

tensor([[[0.0814],
         [0.0173],
         [0.0853],
         [0.0569],
         [0.0413],
         [0.0066],
         [0.0276],
         [0.0088],
         [0.0243],
         [0.0080],
         [0.0424],
         [0.0648]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0233],
         [0.0440],
         [0.0174],
         [0.0118],
         [0.0108],
         [0.0082],
         [0.0087],
         [0.0067],
         [0.0029],
         [0.0057],
         [0.0559],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0483],
         [0.0276],
         [0.0708],
         [0.0434],
         [0.0400],
         [0.0120],
         [0.0251],
         [0.0041],
         [0.0538],
         [0.0187],
         [0.0113],
         [0.0706]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0136],
         [0.0076],
         [0.0372],
         [0.0115],
         [0.0075],
         [0.0062],
         [0.0085],
         [0.0133],
         [0.0027],
    


Evaluating:  57%|██████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 570/1000 [00:26<00:20, 21.44it/s][A
Evaluating:  57%|██████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                            | 573/1000 [00:26<00:19, 21.45it/s][A

tensor([[[0.0576],
         [0.0431],
         [0.0447],
         [0.0301],
         [0.0291],
         [0.0282],
         [0.0082],
         [0.0021],
         [0.0043],
         [0.0042],
         [0.2015],
         [0.0435]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0659],
         [0.0429],
         [0.0170],
         [0.0239],
         [0.0224],
         [0.0289],
         [0.0542],
         [0.0119],
         [0.0079],
         [0.0342],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0073],
         [0.0097],
         [0.0132],
         [0.0034],
         [0.0139],
         [0.0018],
         [0.0122],
         [0.0043],
         [0.0037],
         [0.0057],
         [0.0113],
         [0.0052]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0386],
         [0.0519],
         [0.0350],
         [0.0138],
         [0.0103],
         [0.0057],
         [0.0079],
         [0.0106],
         [0.0012],
    


Evaluating:  58%|███████████████████████████████████████████████████████████████████████████████████████████████████████                                                                            | 576/1000 [00:27<00:19, 21.42it/s][A

tensor([[[0.1165],
         [0.0524],
         [0.0337],
         [0.0114],
         [0.0353],
         [0.0036],
         [0.0901],
         [0.0135],
         [0.0065],
         [0.0092],
         [0.0073],
         [0.0213]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0706],
         [0.0757],
         [0.0098],
         [0.0241],
         [0.0276],
         [0.0090],
         [0.0036],
         [0.0045],
         [0.0076],
         [0.0464],
         [0.0358]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0360],
         [0.0288],
         [0.0224],
         [0.0262],
         [0.0123],
         [0.0106],
         [0.0075],
         [0.0146],
         [0.0041],
         [0.0105],
         [0.0230],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0388],
         [0.0563],
         [0.0456],
         [0.0099],
         [0.0175],
         [0.0112],
         [0.0249],
         [0.0370],
         [0.0115],
    


Evaluating:  58%|███████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 579/1000 [00:27<00:19, 21.17it/s][A

tensor([[[0.0141],
         [0.0282],
         [0.0388],
         [0.0110],
         [0.0143],
         [0.0093],
         [0.0136],
         [0.0083],
         [0.0048],
         [0.0150],
         [0.0398],
         [0.0215]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0311],
         [0.0085],
         [0.0908],
         [0.0166],
         [0.0076],
         [0.0110],
         [0.0050],
         [0.0047],
         [0.0010],
         [0.0059],
         [0.0157],
         [0.1303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0179],
         [0.0599],
         [0.0967],
         [0.0139],
         [0.0293],
         [0.0156],
         [0.0101],
         [0.0128],
         [0.0076],
         [0.0073],
         [0.0484],
         [0.0329]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0229],
         [0.0171],
         [0.0130],
         [0.0202],
         [0.0478],
         [0.0194],
         [0.0217],
         [0.0387],
         [0.0074],
    


Evaluating:  58%|████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 582/1000 [00:27<00:19, 21.29it/s][A
Evaluating:  58%|████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                          | 585/1000 [00:27<00:19, 21.34it/s][A

tensor([[[0.0279],
         [0.0363],
         [0.0560],
         [0.0144],
         [0.0084],
         [0.0077],
         [0.0169],
         [0.0164],
         [0.0069],
         [0.0036],
         [0.0713],
         [0.0388]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0298],
         [0.0227],
         [0.0432],
         [0.0245],
         [0.0179],
         [0.0186],
         [0.0049],
         [0.0082],
         [0.0027],
         [0.0048],
         [0.0235],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0197],
         [0.0148],
         [0.0360],
         [0.0129],
         [0.0288],
         [0.0100],
         [0.0411],
         [0.0142],
         [0.0098],
         [0.0053],
         [0.0214],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0108],
         [0.0077],
         [0.0076],
         [0.0088],
         [0.0016],
         [0.0176],
         [0.0203],
         [0.0184],
    


Evaluating:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 588/1000 [00:27<00:19, 21.39it/s][A

tensor([[[0.0201],
         [0.0323],
         [0.0640],
         [0.0146],
         [0.0324],
         [0.0132],
         [0.0174],
         [0.0197],
         [0.0038],
         [0.0152],
         [0.0498],
         [0.0162]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0227],
         [0.0321],
         [0.1146],
         [0.0381],
         [0.0142],
         [0.0163],
         [0.0099],
         [0.0134],
         [0.0047],
         [0.0019],
         [0.1929],
         [0.0837]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0298],
         [0.0102],
         [0.0447],
         [0.0243],
         [0.0312],
         [0.0063],
         [0.0143],
         [0.0074],
         [0.0033],
         [0.0084],
         [0.0487],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0320],
         [0.0855],
         [0.0505],
         [0.0291],
         [0.0123],
         [0.0152],
         [0.0247],
         [0.0069],
    


Evaluating:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 591/1000 [00:27<00:19, 21.37it/s][A

tensor([[[0.1005],
         [0.0694],
         [0.0548],
         [0.0244],
         [0.0360],
         [0.0412],
         [0.0363],
         [0.0193],
         [0.0035],
         [0.0107],
         [0.0475],
         [0.0404]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0067],
         [0.0315],
         [0.0038],
         [0.0080],
         [0.0048],
         [0.0076],
         [0.0080],
         [0.0070],
         [0.0017],
         [0.0046],
         [0.0466]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0341],
         [0.0286],
         [0.0991],
         [0.0203],
         [0.0277],
         [0.0108],
         [0.0163],
         [0.0062],
         [0.0016],
         [0.0023],
         [0.2013],
         [0.0960]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0190],
         [0.0363],
         [0.0252],
         [0.0208],
         [0.0067],
         [0.0084],
         [0.0361],
         [0.0096],
         [0.0014],
    


Evaluating:  59%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                        | 594/1000 [00:27<00:19, 21.36it/s][A
Evaluating:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                        | 597/1000 [00:28<00:18, 21.53it/s][A

tensor([[[0.0861],
         [0.2219],
         [0.1166],
         [0.0268],
         [0.0528],
         [0.0242],
         [0.0644],
         [0.0173],
         [0.0509],
         [0.0230],
         [0.0248],
         [0.1262]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0244],
         [0.0201],
         [0.0242],
         [0.0166],
         [0.0153],
         [0.0078],
         [0.0363],
         [0.0064],
         [0.0039],
         [0.0118],
         [0.0414],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0065],
         [0.0138],
         [0.0317],
         [0.0023],
         [0.0059],
         [0.0128],
         [0.0057],
         [0.0025],
         [0.0009],
         [0.0022],
         [0.0063],
         [0.0051]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0236],
         [0.0899],
         [0.0380],
         [0.0454],
         [0.0104],
         [0.0373],
         [0.0113],
         [0.0470],
    


Evaluating:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 600/1000 [00:28<00:18, 21.46it/s][A

tensor([[[0.0125],
         [0.0133],
         [0.0366],
         [0.0099],
         [0.0192],
         [0.0114],
         [0.0124],
         [0.0117],
         [0.0035],
         [0.0042],
         [0.0184],
         [0.0360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0394],
         [0.0494],
         [0.0705],
         [0.0238],
         [0.0723],
         [0.0378],
         [0.0146],
         [0.0939],
         [0.0168],
         [0.0251],
         [0.0409],
         [0.0446]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0195],
         [0.0348],
         [0.0544],
         [0.0156],
         [0.0167],
         [0.0154],
         [0.0083],
         [0.0090],
         [0.0027],
         [0.0020],
         [0.0316],
         [0.0417]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0178],
         [0.0552],
         [0.0114],
         [0.0080],
         [0.0073],
         [0.0272],
         [0.0101],
         [0.0040],
    


Evaluating:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 603/1000 [00:28<00:18, 21.43it/s][A


reg attention sum per layer
tensor([[[0.0588],
         [0.0346],
         [0.0272],
         [0.0275],
         [0.0150],
         [0.0136],
         [0.0266],
         [0.0044],
         [0.0116],
         [0.0129],
         [0.0085],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0060],
         [0.0293],
         [0.0325],
         [0.0044],
         [0.0048],
         [0.0104],
         [0.0041],
         [0.0067],
         [0.0023],
         [0.0024],
         [0.0360],
         [0.0277]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0160],
         [0.0247],
         [0.0079],
         [0.0062],
         [0.0041],
         [0.0035],
         [0.0110],
         [0.0073],
         [0.0016],
         [0.0026],
         [0.0064],
         [0.0059]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0073],
         [0.0336],
         [0.1264],
         [0.0155],
         [0.0452],
         [0.0125],
         [0.0088],
         [0.0


Evaluating:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                      | 606/1000 [00:28<00:18, 21.45it/s][A
Evaluating:  61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 609/1000 [00:28<00:18, 21.46it/s][A


reg attention sum per layer
tensor([[[0.0214],
         [0.0358],
         [0.0393],
         [0.0345],
         [0.0969],
         [0.0133],
         [0.0265],
         [0.0064],
         [0.0325],
         [0.0306],
         [0.0601],
         [0.0729]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0220],
         [0.0163],
         [0.0168],
         [0.0217],
         [0.0435],
         [0.0088],
         [0.0176],
         [0.0100],
         [0.0120],
         [0.0080],
         [0.0120],
         [0.0392]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0331],
         [0.0898],
         [0.0783],
         [0.0222],
         [0.0391],
         [0.0097],
         [0.0182],
         [0.0102],
         [0.0061],
         [0.0109],
         [0.1262],
         [0.0907]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0266],
         [0.0083],
         [0.0112],
         [0.0087],
         [0.0322],
         [0.0133],
         [0.0157],
         [0.0


Evaluating:  61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 612/1000 [00:28<00:18, 21.51it/s][A


reg attention sum per layer
tensor([[[0.0199],
         [0.0394],
         [0.1904],
         [0.0395],
         [0.0358],
         [0.0126],
         [0.0082],
         [0.0125],
         [0.0065],
         [0.0072],
         [0.1132],
         [0.1764]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1028],
         [0.0670],
         [0.0596],
         [0.0244],
         [0.0412],
         [0.0248],
         [0.0233],
         [0.0156],
         [0.0022],
         [0.0233],
         [0.0668],
         [0.0158]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0846],
         [0.0166],
         [0.0986],
         [0.0288],
         [0.0270],
         [0.0311],
         [0.0046],
         [0.0109],
         [0.0007],
         [0.0056],
         [0.0183],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0377],
         [0.0431],
         [0.1526],
         [0.0677],
         [0.0610],
         [0.0300],
         [0.0162],
         [0.0


Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 615/1000 [00:28<00:18, 21.37it/s][A
Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 618/1000 [00:29<00:17, 21.40it/s][A

reg attention sum per layer
tensor([[[0.0480],
         [0.0165],
         [0.0269],
         [0.0114],
         [0.0350],
         [0.0201],
         [0.0354],
         [0.0226],
         [0.0304],
         [0.0204],
         [0.0044],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0330],
         [0.0484],
         [0.0492],
         [0.0083],
         [0.0165],
         [0.0065],
         [0.0381],
         [0.0146],
         [0.0069],
         [0.0051],
         [0.0203],
         [0.0257]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0267],
         [0.0108],
         [0.0248],
         [0.0123],
         [0.0399],
         [0.0061],
         [0.0184],
         [0.0148],
         [0.0054],
         [0.0072],
         [0.0161],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0175],
         [0.0130],
         [0.0189],
         [0.0030],
         [0.0010],
         [0.0049],
         [0.0055],
         [0.00


Evaluating:  62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 621/1000 [00:29<00:17, 21.43it/s][A

reg attention sum per layer
tensor([[[0.0612],
         [0.0435],
         [0.0140],
         [0.0137],
         [0.0130],
         [0.0145],
         [0.0217],
         [0.0253],
         [0.0089],
         [0.0104],
         [0.0061],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0551],
         [0.0364],
         [0.0172],
         [0.0108],
         [0.0129],
         [0.0056],
         [0.0240],
         [0.0119],
         [0.0335],
         [0.0090],
         [0.0167],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0363],
         [0.0111],
         [0.0879],
         [0.0198],
         [0.0422],
         [0.0043],
         [0.0111],
         [0.0245],
         [0.0084],
         [0.0079],
         [0.0045],
         [0.0247]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0751],
         [0.0745],
         [0.0142],
         [0.0347],
         [0.0165],
         [0.0231],
         [0.01


Evaluating:  62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 624/1000 [00:29<00:17, 21.36it/s][A

tensor([[[0.0504],
         [0.0392],
         [0.1720],
         [0.0605],
         [0.0517],
         [0.0125],
         [0.0151],
         [0.0554],
         [0.0169],
         [0.0086],
         [0.0311],
         [0.0522]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0339],
         [0.0099],
         [0.0489],
         [0.0101],
         [0.0301],
         [0.0168],
         [0.0277],
         [0.0067],
         [0.0043],
         [0.0093],
         [0.0239],
         [0.0135]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0250],
         [0.0161],
         [0.0123],
         [0.0075],
         [0.0308],
         [0.0146],
         [0.0114],
         [0.0091],
         [0.0060],
         [0.0138],
         [0.0145],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0606],
         [0.0566],
         [0.0615],
         [0.0112],
         [0.0061],
         [0.0182],
         [0.0076],
         [0.0032],
         [0.0057],
    


Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                  | 627/1000 [00:29<00:17, 21.35it/s][A
Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 630/1000 [00:29<00:17, 21.48it/s][A

tensor([[[0.0165],
         [0.0405],
         [0.0376],
         [0.0107],
         [0.0338],
         [0.0085],
         [0.0160],
         [0.0078],
         [0.0191],
         [0.0058],
         [0.0479],
         [0.0632]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0273],
         [0.0148],
         [0.0207],
         [0.0134],
         [0.0237],
         [0.0154],
         [0.0226],
         [0.0076],
         [0.0337],
         [0.0109],
         [0.0218],
         [0.0282]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0056],
         [0.0248],
         [0.0543],
         [0.0034],
         [0.0106],
         [0.0159],
         [0.0100],
         [0.0036],
         [0.0018],
         [0.0026],
         [0.0612],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2511],
         [0.0632],
         [0.0161],
         [0.0206],
         [0.0351],
         [0.0307],
         [0.0387],
         [0.0195],
         [0.0240],
    


Evaluating:  63%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 633/1000 [00:29<00:17, 21.39it/s][A

tensor([[[0.0597],
         [0.0197],
         [0.0511],
         [0.0297],
         [0.0130],
         [0.0079],
         [0.0176],
         [0.0057],
         [0.0119],
         [0.0060],
         [0.0224],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0969],
         [0.0368],
         [0.0593],
         [0.0427],
         [0.0593],
         [0.0126],
         [0.0470],
         [0.0203],
         [0.0181],
         [0.0150],
         [0.0264],
         [0.0404]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0934],
         [0.0257],
         [0.0748],
         [0.0210],
         [0.0480],
         [0.0052],
         [0.0425],
         [0.0080],
         [0.0114],
         [0.0080],
         [0.0093],
         [0.0404]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0815],
         [0.0301],
         [0.0279],
         [0.0294],
         [0.0139],
         [0.0070],
         [0.0134],
         [0.0051],
         [0.0134],
    


Evaluating:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 636/1000 [00:29<00:16, 21.46it/s][A

tensor([[[0.0361],
         [0.0633],
         [0.0539],
         [0.0061],
         [0.0394],
         [0.0220],
         [0.0310],
         [0.0181],
         [0.0131],
         [0.0103],
         [0.0238],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0151],
         [0.0360],
         [0.0231],
         [0.0222],
         [0.0240],
         [0.0042],
         [0.0244],
         [0.0041],
         [0.0019],
         [0.0055],
         [0.0510],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0378],
         [0.1104],
         [0.0143],
         [0.0530],
         [0.0164],
         [0.0214],
         [0.0195],
         [0.0181],
         [0.0082],
         [0.0200],
         [0.0667]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0905],
         [0.1726],
         [0.0443],
         [0.0197],
         [0.0230],
         [0.0101],
         [0.0037],
         [0.0020],
    


Evaluating:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 639/1000 [00:30<00:16, 21.38it/s][A
Evaluating:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 642/1000 [00:30<00:16, 21.41it/s][A

tensor([[[0.0416],
         [0.0214],
         [0.0498],
         [0.0166],
         [0.0134],
         [0.0079],
         [0.0413],
         [0.0099],
         [0.0057],
         [0.0083],
         [0.0569],
         [0.0307]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0052],
         [0.0209],
         [0.0051],
         [0.0573],
         [0.0052],
         [0.0161],
         [0.0043],
         [0.0484],
         [0.0049],
         [0.0048],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0230],
         [0.0205],
         [0.0153],
         [0.0050],
         [0.0018],
         [0.0116],
         [0.0020],
         [0.0018],
         [0.0046],
         [0.0159],
         [0.0110]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0378],
         [0.0238],
         [0.0625],
         [0.0082],
         [0.0221],
         [0.0056],
         [0.0148],
         [0.0091],
         [0.0101],
    


Evaluating:  64%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 645/1000 [00:30<00:16, 21.39it/s][A

tensor([[[0.0389],
         [0.0718],
         [0.0635],
         [0.0186],
         [0.0239],
         [0.0106],
         [0.0146],
         [0.0111],
         [0.0054],
         [0.0113],
         [0.0184],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1458],
         [0.1963],
         [0.0202],
         [0.0149],
         [0.0134],
         [0.0261],
         [0.0078],
         [0.0100],
         [0.0037],
         [0.0096],
         [0.0447],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0703],
         [0.0661],
         [0.0228],
         [0.0134],
         [0.0293],
         [0.0393],
         [0.0023],
         [0.0056],
         [0.0020],
         [0.0077],
         [0.0141],
         [0.0282]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0423],
         [0.0370],
         [0.0106],
         [0.0188],
         [0.0108],
         [0.0074],
         [0.0055],
         [0.0030],
    


Evaluating:  65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 648/1000 [00:30<00:16, 21.37it/s][A

tensor([[[0.0253],
         [0.0654],
         [0.0457],
         [0.0291],
         [0.0230],
         [0.0190],
         [0.0099],
         [0.0126],
         [0.0036],
         [0.0075],
         [0.0423],
         [0.1228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.0081],
         [0.0602],
         [0.0177],
         [0.0579],
         [0.0077],
         [0.0035],
         [0.0031],
         [0.0021],
         [0.0033],
         [0.0572],
         [0.0537]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0248],
         [0.0313],
         [0.0166],
         [0.0200],
         [0.0218],
         [0.0236],
         [0.0214],
         [0.0025],
         [0.0096],
         [0.0842],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0086],
         [0.0187],
         [0.0642],
         [0.0058],
         [0.0096],
         [0.0105],
         [0.0069],
         [0.0056],
         [0.0010],
    


Evaluating:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 651/1000 [00:30<00:16, 21.50it/s][A
Evaluating:  65%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 654/1000 [00:30<00:16, 21.54it/s][A

tensor([[[0.0296],
         [0.0308],
         [0.0808],
         [0.0188],
         [0.0588],
         [0.0241],
         [0.0087],
         [0.0510],
         [0.0221],
         [0.0106],
         [0.0246],
         [0.0478]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0468],
         [0.0337],
         [0.0742],
         [0.0266],
         [0.0755],
         [0.0118],
         [0.0214],
         [0.0380],
         [0.0221],
         [0.0270],
         [0.0264],
         [0.0349]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0075],
         [0.0734],
         [0.0481],
         [0.0142],
         [0.0212],
         [0.0144],
         [0.0074],
         [0.0057],
         [0.0035],
         [0.0069],
         [0.0678],
         [0.0158]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0389],
         [0.0610],
         [0.0740],
         [0.0256],
         [0.0366],
         [0.0080],
         [0.0179],
         [0.0121],
         [0.0132],
    


Evaluating:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 657/1000 [00:30<00:16, 21.41it/s][A

tensor([[[0.0071],
         [0.0134],
         [0.0346],
         [0.0025],
         [0.0154],
         [0.0051],
         [0.0188],
         [0.0077],
         [0.0064],
         [0.0034],
         [0.0051],
         [0.0037]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0348],
         [0.0145],
         [0.0244],
         [0.0108],
         [0.0294],
         [0.0037],
         [0.0213],
         [0.0238],
         [0.0114],
         [0.0112],
         [0.0110],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0229],
         [0.0138],
         [0.0177],
         [0.0061],
         [0.0061],
         [0.0024],
         [0.0104],
         [0.0019],
         [0.0021],
         [0.0040],
         [0.0121],
         [0.0053]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1469],
         [0.1009],
         [0.0291],
         [0.0436],
         [0.0224],
         [0.0116],
         [0.1139],
         [0.0479],
         [0.0163],
    


Evaluating:  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                            | 660/1000 [00:31<00:15, 21.38it/s][A

tensor([[[0.0283],
         [0.0503],
         [0.1206],
         [0.0097],
         [0.0635],
         [0.0210],
         [0.0068],
         [0.0501],
         [0.0017],
         [0.0063],
         [0.2143],
         [0.0620]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0365],
         [0.0243],
         [0.0450],
         [0.0336],
         [0.0237],
         [0.0157],
         [0.0338],
         [0.0118],
         [0.0074],
         [0.0130],
         [0.0785],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0449],
         [0.0243],
         [0.0890],
         [0.0426],
         [0.0646],
         [0.0071],
         [0.0227],
         [0.0368],
         [0.0202],
         [0.0249],
         [0.0228],
         [0.0396]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0628],
         [0.0956],
         [0.0531],
         [0.0183],
         [0.0276],
         [0.0166],
         [0.0332],
         [0.0205],
         [0.0066],
    


Evaluating:  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 663/1000 [00:31<00:15, 21.37it/s][A
Evaluating:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 666/1000 [00:31<00:15, 21.27it/s][A

tensor([[[0.0800],
         [0.0301],
         [0.0556],
         [0.0087],
         [0.0342],
         [0.0090],
         [0.0295],
         [0.0126],
         [0.0068],
         [0.0170],
         [0.0087],
         [0.0185]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0094],
         [0.0234],
         [0.0190],
         [0.0094],
         [0.0115],
         [0.0088],
         [0.0209],
         [0.0066],
         [0.0117],
         [0.0071],
         [0.0420],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0529],
         [0.0356],
         [0.0267],
         [0.0132],
         [0.0201],
         [0.0130],
         [0.0427],
         [0.0134],
         [0.0124],
         [0.0079],
         [0.0078],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0400],
         [0.0153],
         [0.0377],
         [0.0058],
         [0.0156],
         [0.0013],
         [0.0288],
         [0.0164],
         [0.0318],
    


Evaluating:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                           | 669/1000 [00:31<00:15, 21.29it/s][A

tensor([[[0.0234],
         [0.0211],
         [0.0463],
         [0.0078],
         [0.0174],
         [0.0260],
         [0.0152],
         [0.0091],
         [0.0094],
         [0.0048],
         [0.0184],
         [0.0236]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0250],
         [0.0739],
         [0.0378],
         [0.0130],
         [0.0189],
         [0.0076],
         [0.0096],
         [0.0267],
         [0.0041],
         [0.0045],
         [0.0120],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0074],
         [0.0114],
         [0.0132],
         [0.0099],
         [0.0115],
         [0.0298],
         [0.0302],
         [0.0013],
         [0.0016],
         [0.0055],
         [0.1432],
         [0.0265]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0379],
         [0.0355],
         [0.0533],
         [0.0171],
         [0.0167],
         [0.0066],
         [0.0181],
         [0.0063],
         [0.0118],
    


Evaluating:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 672/1000 [00:31<00:15, 21.30it/s][A

tensor([[[0.0096],
         [0.0074],
         [0.0193],
         [0.0040],
         [0.0081],
         [0.0027],
         [0.0209],
         [0.0064],
         [0.0110],
         [0.0046],
         [0.0039],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0283],
         [0.0283],
         [0.0299],
         [0.0086],
         [0.0216],
         [0.0090],
         [0.0293],
         [0.0102],
         [0.0582],
         [0.0060],
         [0.0066],
         [0.0420]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0569],
         [0.0255],
         [0.0162],
         [0.0211],
         [0.0403],
         [0.0098],
         [0.0981],
         [0.0077],
         [0.0416],
         [0.0251],
         [0.0176],
         [0.0127]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0516],
         [0.0533],
         [0.0460],
         [0.0167],
         [0.0151],
         [0.0125],
         [0.0239],
         [0.0321],
         [0.0133],
    


Evaluating:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 675/1000 [00:31<00:15, 21.40it/s][A
Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 678/1000 [00:31<00:14, 21.47it/s][A

tensor([[[0.1205],
         [0.0915],
         [0.3043],
         [0.0185],
         [0.0211],
         [0.1011],
         [0.1174],
         [0.0060],
         [0.0022],
         [0.0132],
         [0.1255],
         [0.0957]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0524],
         [0.0144],
         [0.0205],
         [0.0095],
         [0.0274],
         [0.0225],
         [0.0155],
         [0.0019],
         [0.0034],
         [0.0072],
         [0.0133],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0446],
         [0.0804],
         [0.0342],
         [0.0190],
         [0.0130],
         [0.0178],
         [0.0200],
         [0.0377],
         [0.0099],
         [0.0065],
         [0.0077],
         [0.0391]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0448],
         [0.1638],
         [0.0526],
         [0.0982],
         [0.0146],
         [0.0108],
         [0.0050],
         [0.0377],
    


Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                         | 681/1000 [00:32<00:14, 21.48it/s][A

tensor([[[0.0362],
         [0.0119],
         [0.0370],
         [0.0081],
         [0.0170],
         [0.0097],
         [0.0119],
         [0.0076],
         [0.0066],
         [0.0040],
         [0.0080],
         [0.0109]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0231],
         [0.0771],
         [0.0085],
         [0.1247],
         [0.0110],
         [0.0137],
         [0.0022],
         [0.0094],
         [0.1164],
         [0.0236],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0177],
         [0.0233],
         [0.0600],
         [0.0170],
         [0.0203],
         [0.0115],
         [0.0058],
         [0.0207],
         [0.0041],
         [0.0085],
         [0.0458],
         [0.0693]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0256],
         [0.0441],
         [0.0147],
         [0.0064],
         [0.0118],
         [0.0094],
         [0.0112],
         [0.0037],
    


Evaluating:  68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 684/1000 [00:32<00:14, 21.39it/s][A

tensor([[[0.0426],
         [0.0487],
         [0.0461],
         [0.0353],
         [0.0247],
         [0.0071],
         [0.0196],
         [0.0173],
         [0.0132],
         [0.0126],
         [0.0322],
         [0.0285]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1802],
         [0.1714],
         [0.1161],
         [0.0208],
         [0.0477],
         [0.0133],
         [0.1134],
         [0.0436],
         [0.0383],
         [0.0355],
         [0.0135],
         [0.1069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0398],
         [0.1517],
         [0.0091],
         [0.0238],
         [0.0106],
         [0.0053],
         [0.0069],
         [0.0015],
         [0.0049],
         [0.0593],
         [0.0384]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1105],
         [0.0798],
         [0.0288],
         [0.0205],
         [0.0166],
         [0.0180],
         [0.0199],
         [0.0214],
         [0.0060],
    


Evaluating:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 687/1000 [00:32<00:14, 21.37it/s][A
Evaluating:  69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                       | 690/1000 [00:32<00:14, 21.36it/s][A

tensor([[[0.0787],
         [0.0370],
         [0.0410],
         [0.0138],
         [0.0819],
         [0.0233],
         [0.0191],
         [0.0184],
         [0.0175],
         [0.0250],
         [0.0408],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1499],
         [0.0921],
         [0.0260],
         [0.0827],
         [0.0193],
         [0.0148],
         [0.0755],
         [0.0352],
         [0.0064],
         [0.0196],
         [0.0087],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0393],
         [0.0947],
         [0.0260],
         [0.0186],
         [0.0070],
         [0.0392],
         [0.0395],
         [0.0236],
         [0.0203],
         [0.0117],
         [0.0370]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0365],
         [0.0155],
         [0.0667],
         [0.0220],
         [0.0462],
         [0.0078],
         [0.0044],
         [0.0106],
         [0.0022],
    


Evaluating:  69%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                       | 693/1000 [00:32<00:14, 21.44it/s][A

tensor([[[0.0084],
         [0.0377],
         [0.0652],
         [0.0129],
         [0.0090],
         [0.0086],
         [0.0065],
         [0.0063],
         [0.0016],
         [0.0030],
         [0.0426],
         [0.0538]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0444],
         [0.0434],
         [0.0299],
         [0.0312],
         [0.0160],
         [0.0177],
         [0.0222],
         [0.0029],
         [0.0038],
         [0.0067],
         [0.0839],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0124],
         [0.0061],
         [0.0248],
         [0.0092],
         [0.0245],
         [0.0029],
         [0.0209],
         [0.0024],
         [0.0065],
         [0.0048],
         [0.0171],
         [0.0239]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0232],
         [0.0154],
         [0.0151],
         [0.0071],
         [0.0175],
         [0.0102],
         [0.0198],
         [0.0043],
         [0.0210],
    


Evaluating:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 696/1000 [00:32<00:14, 21.41it/s][A

tensor([[[0.0158],
         [0.0257],
         [0.1074],
         [0.0273],
         [0.0638],
         [0.0113],
         [0.0554],
         [0.0038],
         [0.0091],
         [0.0109],
         [0.0942],
         [0.0850]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0412],
         [0.0211],
         [0.0416],
         [0.0151],
         [0.0427],
         [0.0115],
         [0.0106],
         [0.0073],
         [0.0188],
         [0.0109],
         [0.0321],
         [0.0607]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0791],
         [0.0976],
         [0.0350],
         [0.0215],
         [0.0167],
         [0.0180],
         [0.0345],
         [0.0070],
         [0.0021],
         [0.0164],
         [0.0171],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0331],
         [0.0109],
         [0.0164],
         [0.0077],
         [0.0102],
         [0.0031],
         [0.0151],
         [0.0027],
         [0.0064],
    


Evaluating:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 699/1000 [00:32<00:14, 21.39it/s][A
Evaluating:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                     | 702/1000 [00:33<00:13, 21.46it/s][A


reg attention sum per layer
tensor([[[0.0941],
         [0.0664],
         [0.0198],
         [0.0250],
         [0.0175],
         [0.0209],
         [0.0358],
         [0.0114],
         [0.0024],
         [0.0147],
         [0.0258],
         [0.0249]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0072],
         [0.0440],
         [0.0026],
         [0.0023],
         [0.0005],
         [0.0025],
         [0.0076],
         [0.0010],
         [0.0002],
         [0.0017],
         [0.0257],
         [0.0060]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0651],
         [0.0715],
         [0.0158],
         [0.0242],
         [0.0183],
         [0.0179],
         [0.0272],
         [0.0062],
         [0.0098],
         [0.0387],
         [0.0327]]], device='cuda:0')
reg attention sum per layer



Evaluating:  70%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 705/1000 [00:33<00:13, 21.38it/s][A

tensor([[[0.0053],
         [0.0039],
         [0.0178],
         [0.0074],
         [0.0069],
         [0.0042],
         [0.0060],
         [0.0006],
         [0.0006],
         [0.0006],
         [0.0116],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0476],
         [0.0364],
         [0.0276],
         [0.0173],
         [0.0221],
         [0.0042],
         [0.0264],
         [0.0051],
         [0.0036],
         [0.0055],
         [0.0317],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0214],
         [0.0495],
         [0.0097],
         [0.0135],
         [0.0045],
         [0.0239],
         [0.0043],
         [0.0045],
         [0.0053],
         [0.0064],
         [0.0154]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0414],
         [0.0434],
         [0.0306],
         [0.0110],
         [0.0157],
         [0.0027],
         [0.0159],
         [0.0429],
         [0.0226],
    


Evaluating:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 708/1000 [00:33<00:13, 21.37it/s][A


reg attention sum per layer
tensor([[[0.0849],
         [0.0426],
         [0.0296],
         [0.0242],
         [0.0245],
         [0.0099],
         [0.0822],
         [0.0156],
         [0.0043],
         [0.0202],
         [0.0138],
         [0.0511]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0189],
         [0.0341],
         [0.0322],
         [0.0136],
         [0.0090],
         [0.0043],
         [0.0055],
         [0.0069],
         [0.0024],
         [0.0040],
         [0.0164],
         [0.0169]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0961],
         [0.0705],
         [0.2089],
         [0.0548],
         [0.0250],
         [0.0150],
         [0.0654],
         [0.0167],
         [0.0140],
         [0.0236],
         [0.1380],
         [0.0408]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0434],
         [0.0799],
         [0.0597],
         [0.0219],
         [0.0205],
         [0.0135],
         [0.0444],
         [0.0


Evaluating:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 711/1000 [00:33<00:13, 21.35it/s][A
Evaluating:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                   | 714/1000 [00:33<00:13, 21.48it/s][A


reg attention sum per layer
tensor([[[0.0249],
         [0.0902],
         [0.0520],
         [0.0238],
         [0.0068],
         [0.0068],
         [0.0392],
         [0.0032],
         [0.0052],
         [0.0088],
         [0.0639],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0333],
         [0.0404],
         [0.0142],
         [0.0214],
         [0.0088],
         [0.0109],
         [0.0174],
         [0.0099],
         [0.0093],
         [0.0155],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0299],
         [0.0334],
         [0.0270],
         [0.0196],
         [0.0046],
         [0.0062],
         [0.0214],
         [0.0126],
         [0.0026],
         [0.0054],
         [0.0055],
         [0.0108]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0306],
         [0.0039],
         [0.0032],
         [0.0032],
         [0.0056],
         [0.0034],
         [0.0


Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 717/1000 [00:33<00:13, 21.39it/s][A


reg attention sum per layer
tensor([[[0.0081],
         [0.0044],
         [0.0253],
         [0.0078],
         [0.0232],
         [0.0013],
         [0.0045],
         [0.0029],
         [0.0027],
         [0.0056],
         [0.0112],
         [0.0087]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0554],
         [0.0176],
         [0.0736],
         [0.0283],
         [0.0335],
         [0.0130],
         [0.0217],
         [0.0172],
         [0.0047],
         [0.0146],
         [0.0221],
         [0.0430]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0113],
         [0.0208],
         [0.0076],
         [0.0038],
         [0.0037],
         [0.0078],
         [0.0041],
         [0.0024],
         [0.0025],
         [0.0158],
         [0.0126]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0308],
         [0.0576],
         [0.0217],
         [0.0084],
         [0.0086],
         [0.0103],
         [0.0160],
         [0.0


Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 720/1000 [00:33<00:13, 21.37it/s][A
Evaluating:  72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 723/1000 [00:33<00:12, 21.45it/s]


reg attention sum per layer
tensor([[[0.0254],
         [0.0257],
         [0.0471],
         [0.0119],
         [0.0209],
         [0.0079],
         [0.0207],
         [0.0127],
         [0.0069],
         [0.0046],
         [0.0205],
         [0.0833]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0385],
         [0.0517],
         [0.0969],
         [0.0105],
         [0.0177],
         [0.0264],
         [0.0363],
         [0.0141],
         [0.0179],
         [0.0085],
         [0.0451],
         [0.0319]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0275],
         [0.0497],
         [0.0085],
         [0.0405],
         [0.0101],
         [0.0184],
         [0.0110],
         [0.0106],
         [0.0134],
         [0.0079],
         [0.0130]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0102],
         [0.0557],
         [0.0079],
         [0.0134],
         [0.0122],
         [0.0098],
         [0.0

[A
Evaluating:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 726/1000 [00:34<00:12, 21.51it/s][A

reg attention sum per layer
tensor([[[0.0355],
         [0.0153],
         [0.0092],
         [0.0146],
         [0.0142],
         [0.0085],
         [0.0447],
         [0.0160],
         [0.0147],
         [0.0181],
         [0.0046],
         [0.0102]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0669],
         [0.0665],
         [0.1251],
         [0.0235],
         [0.0329],
         [0.0144],
         [0.0404],
         [0.0069],
         [0.0083],
         [0.0050],
         [0.0638],
         [0.0540]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0042],
         [0.0565],
         [0.0344],
         [0.0211],
         [0.0227],
         [0.0137],
         [0.0088],
         [0.0053],
         [0.0035],
         [0.0021],
         [0.0381],
         [0.0079]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0160],
         [0.0385],
         [0.0991],
         [0.0299],
         [0.0575],
         [0.0156],
         [0.0148],
         [0.01


Evaluating:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 729/1000 [00:34<00:12, 21.45it/s][A

tensor([[[0.0482],
         [0.0193],
         [0.0219],
         [0.0071],
         [0.0119],
         [0.0078],
         [0.0163],
         [0.0118],
         [0.0203],
         [0.0357],
         [0.0039],
         [0.0081]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0205],
         [0.0194],
         [0.0538],
         [0.0098],
         [0.0340],
         [0.0078],
         [0.0079],
         [0.0027],
         [0.0015],
         [0.0035],
         [0.0340],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0857],
         [0.0386],
         [0.0975],
         [0.0144],
         [0.0292],
         [0.0214],
         [0.0118],
         [0.0185],
         [0.0078],
         [0.0205],
         [0.0478],
         [0.0810]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0706],
         [0.0121],
         [0.0117],
         [0.0094],
         [0.0160],
         [0.0088],
         [0.0097],
         [0.0011],
    


Evaluating:  73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 732/1000 [00:34<00:12, 21.41it/s][A
Evaluating:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 735/1000 [00:34<00:12, 21.43it/s][A

tensor([[[0.0295],
         [0.0702],
         [0.3912],
         [0.0347],
         [0.0228],
         [0.0089],
         [0.0104],
         [0.0139],
         [0.0028],
         [0.0069],
         [0.1172],
         [0.1393]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0579],
         [0.0619],
         [0.1056],
         [0.0321],
         [0.0620],
         [0.0169],
         [0.0242],
         [0.0335],
         [0.0103],
         [0.0245],
         [0.0318],
         [0.0966]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0202],
         [0.0286],
         [0.0201],
         [0.0095],
         [0.0352],
         [0.0075],
         [0.0145],
         [0.0230],
         [0.0131],
         [0.0050],
         [0.0064],
         [0.0308]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0563],
         [0.0420],
         [0.0281],
         [0.0106],
         [0.0062],
         [0.0021],
         [0.0366],
         [0.0182],
         [0.0159],
    


Evaluating:  74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 738/1000 [00:34<00:12, 21.27it/s][A

tensor([[[0.0108],
         [0.0212],
         [0.0170],
         [0.0068],
         [0.0136],
         [0.0040],
         [0.0116],
         [0.0069],
         [0.0029],
         [0.0037],
         [0.0114],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0697],
         [0.0373],
         [0.0517],
         [0.0103],
         [0.0220],
         [0.0143],
         [0.0261],
         [0.0104],
         [0.0128],
         [0.0193],
         [0.0135],
         [0.0389]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0135],
         [0.0242],
         [0.0351],
         [0.0076],
         [0.0050],
         [0.0101],
         [0.0060],
         [0.0060],
         [0.0041],
         [0.0041],
         [0.0204],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0842],
         [0.0457],
         [0.0643],
         [0.0156],
         [0.0214],
         [0.0155],
         [0.0335],
         [0.0217],
         [0.0079],
    


Evaluating:  74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 741/1000 [00:34<00:12, 21.32it/s][A

tensor([[[0.0306],
         [0.0218],
         [0.0495],
         [0.0267],
         [0.0299],
         [0.0104],
         [0.0108],
         [0.0127],
         [0.0089],
         [0.0063],
         [0.0327],
         [0.0348]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0202],
         [0.0105],
         [0.0070],
         [0.0089],
         [0.0103],
         [0.0063],
         [0.0070],
         [0.0141],
         [0.0018],
         [0.0056],
         [0.0061],
         [0.0059]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0500],
         [0.1114],
         [0.0468],
         [0.0130],
         [0.0596],
         [0.0062],
         [0.0121],
         [0.0311],
         [0.0014],
         [0.0144],
         [0.0217],
         [0.0409]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0525],
         [0.0536],
         [0.0982],
         [0.0349],
         [0.0299],
         [0.0128],
         [0.0406],
         [0.0700],
         [0.0249],
    


Evaluating:  74%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 744/1000 [00:34<00:11, 21.37it/s][A
Evaluating:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 747/1000 [00:35<00:11, 21.45it/s][A

tensor([[[0.0666],
         [0.0216],
         [0.0219],
         [0.0108],
         [0.0469],
         [0.0116],
         [0.0226],
         [0.0206],
         [0.0206],
         [0.0246],
         [0.0043],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0175],
         [0.0253],
         [0.0807],
         [0.0278],
         [0.0267],
         [0.0109],
         [0.0118],
         [0.0090],
         [0.0030],
         [0.0049],
         [0.0078],
         [0.0602]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0718],
         [0.0424],
         [0.0674],
         [0.0341],
         [0.0339],
         [0.0085],
         [0.0358],
         [0.0458],
         [0.0105],
         [0.0150],
         [0.0034],
         [0.0682]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1681],
         [0.0307],
         [0.0118],
         [0.0228],
         [0.0048],
         [0.0128],
         [0.0423],
         [0.0099],
         [0.0006],
    


Evaluating:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                            | 750/1000 [00:35<00:11, 21.37it/s][A

tensor([[[0.1246],
         [0.0152],
         [0.0475],
         [0.0141],
         [0.0279],
         [0.0360],
         [0.0118],
         [0.0162],
         [0.0028],
         [0.0231],
         [0.0150],
         [0.0106]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0460],
         [0.0164],
         [0.0805],
         [0.0218],
         [0.0207],
         [0.0208],
         [0.0197],
         [0.0013],
         [0.0013],
         [0.0166],
         [0.1717],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0557],
         [0.0429],
         [0.0114],
         [0.0113],
         [0.0268],
         [0.0246],
         [0.0321],
         [0.0110],
         [0.0104],
         [0.0110],
         [0.0266]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0063],
         [0.0026],
         [0.0183],
         [0.0040],
         [0.0050],
         [0.0012],
         [0.0289],
         [0.0030],
         [0.0092],
    


Evaluating:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 753/1000 [00:35<00:11, 21.36it/s][A

tensor([[[0.0197],
         [0.0177],
         [0.0197],
         [0.0171],
         [0.0125],
         [0.0052],
         [0.0278],
         [0.0086],
         [0.0335],
         [0.0064],
         [0.0035],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0236],
         [0.0489],
         [0.0180],
         [0.0068],
         [0.0166],
         [0.0083],
         [0.0055],
         [0.0066],
         [0.0028],
         [0.0403],
         [0.0374]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0292],
         [0.0615],
         [0.0716],
         [0.0244],
         [0.0135],
         [0.0118],
         [0.0133],
         [0.0313],
         [0.0033],
         [0.0026],
         [0.0620],
         [0.0593]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0550],
         [0.0833],
         [0.0408],
         [0.0237],
         [0.0613],
         [0.0257],
         [0.0827],
         [0.0320],
         [0.0135],
    


Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 756/1000 [00:35<00:11, 21.35it/s][A
Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 759/1000 [00:35<00:11, 21.39it/s][A

tensor([[[0.0507],
         [0.0498],
         [0.0508],
         [0.0187],
         [0.0226],
         [0.0208],
         [0.0197],
         [0.0661],
         [0.0114],
         [0.0062],
         [0.0236],
         [0.0554]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0619],
         [0.0641],
         [0.0294],
         [0.0102],
         [0.0239],
         [0.0082],
         [0.0336],
         [0.0130],
         [0.0153],
         [0.0135],
         [0.0066],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0315],
         [0.1533],
         [0.0081],
         [0.0543],
         [0.0153],
         [0.0140],
         [0.0049],
         [0.0195],
         [0.0052],
         [0.0873],
         [0.08


Evaluating:  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 762/1000 [00:35<00:11, 21.33it/s][A

tensor([[[0.0424],
         [0.0140],
         [0.0207],
         [0.0186],
         [0.0223],
         [0.0175],
         [0.0235],
         [0.0060],
         [0.0059],
         [0.0127],
         [0.0215],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0323],
         [0.0087],
         [0.0322],
         [0.0078],
         [0.0300],
         [0.0262],
         [0.0182],
         [0.0178],
         [0.0046],
         [0.0060],
         [0.0085],
         [0.0310]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0256],
         [0.0131],
         [0.0496],
         [0.0179],
         [0.0237],
         [0.0167],
         [0.0407],
         [0.0112],
         [0.0228],
         [0.0128],
         [0.0108],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0318],
         [0.0141],
         [0.0156],
         [0.0296],
         [0.0164],
         [0.0083],
         [0.0049],
         [0.0058],
    


Evaluating:  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 765/1000 [00:35<00:11, 21.33it/s][A

tensor([[[0.0305],
         [0.0196],
         [0.0255],
         [0.0133],
         [0.0169],
         [0.0253],
         [0.0281],
         [0.0204],
         [0.0055],
         [0.0127],
         [0.0094],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1344],
         [0.0944],
         [0.0567],
         [0.0275],
         [0.0164],
         [0.0087],
         [0.0349],
         [0.0165],
         [0.0082],
         [0.0110],
         [0.0146],
         [0.0479]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.0316],
         [0.0545],
         [0.0113],
         [0.0240],
         [0.0146],
         [0.0106],
         [0.0319],
         [0.0205],
         [0.0074],
         [0.0098],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1319],
         [0.0692],
         [0.0550],
         [0.0147],
         [0.0156],
         [0.0258],
         [0.0173],
         [0.0044],
         [0.0012],
    


Evaluating:  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                         | 768/1000 [00:36<00:10, 21.38it/s][A
Evaluating:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                         | 771/1000 [00:36<00:10, 21.47it/s][A

tensor([[[0.0099],
         [0.0075],
         [0.0186],
         [0.0027],
         [0.0068],
         [0.0055],
         [0.0071],
         [0.0051],
         [0.0084],
         [0.0059],
         [0.0039],
         [0.0066]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0853],
         [0.0593],
         [0.0584],
         [0.0250],
         [0.0554],
         [0.0082],
         [0.1224],
         [0.0543],
         [0.0285],
         [0.0279],
         [0.0103],
         [0.1106]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0511],
         [0.0316],
         [0.0821],
         [0.0152],
         [0.0168],
         [0.0066],
         [0.0442],
         [0.0057],
         [0.0062],
         [0.0093],
         [0.0376],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1741],
         [0.0513],
         [0.0918],
         [0.0363],
         [0.0258],
         [0.0238],
         [0.0505],
         [0.0759],
         [0.0102],
    


Evaluating:  77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 774/1000 [00:36<00:10, 21.43it/s][A

tensor([[[0.0711],
         [0.0261],
         [0.0909],
         [0.0294],
         [0.0438],
         [0.0123],
         [0.0198],
         [0.0128],
         [0.0285],
         [0.0208],
         [0.0116],
         [0.0174]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0834],
         [0.0561],
         [0.0154],
         [0.0217],
         [0.0219],
         [0.0121],
         [0.0349],
         [0.0080],
         [0.0054],
         [0.0066],
         [0.0054],
         [0.0141]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0544],
         [0.1625],
         [0.0647],
         [0.0595],
         [0.0547],
         [0.0339],
         [0.0248],
         [0.0099],
         [0.0142],
         [0.0098],
         [0.0697],
         [0.0982]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0641],
         [0.0147],
         [0.0871],
         [0.0145],
         [0.0425],
         [0.0124],
         [0.0175],
         [0.0039],
         [0.0078],
    


Evaluating:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                        | 777/1000 [00:36<00:10, 21.31it/s][A

tensor([[[0.0474],
         [0.0687],
         [0.0841],
         [0.0344],
         [0.0383],
         [0.0120],
         [0.0113],
         [0.0298],
         [0.0074],
         [0.0079],
         [0.0410],
         [0.0713]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0085],
         [0.0214],
         [0.0379],
         [0.0074],
         [0.0098],
         [0.0041],
         [0.0072],
         [0.0026],
         [0.0024],
         [0.0027],
         [0.0530],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0202],
         [0.0108],
         [0.0053],
         [0.0030],
         [0.0021],
         [0.0098],
         [0.0031],
         [0.0007],
         [0.0037],
         [0.0082],
         [0.0055]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0475],
         [0.0402],
         [0.0134],
         [0.0079],
         [0.0328],
         [0.0142],
         [0.0443],
         [0.0386],
         [0.0416],
    


Evaluating:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 780/1000 [00:36<00:10, 21.41it/s][A
Evaluating:  78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                      | 783/1000 [00:36<00:10, 21.43it/s][A

tensor([[[0.0493],
         [0.0101],
         [0.0198],
         [0.0067],
         [0.0115],
         [0.0500],
         [0.0058],
         [0.0139],
         [0.0026],
         [0.0049],
         [0.0238],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0367],
         [0.0459],
         [0.1141],
         [0.0148],
         [0.0659],
         [0.0826],
         [0.0243],
         [0.0654],
         [0.0099],
         [0.0155],
         [0.0339],
         [0.0568]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0454],
         [0.0910],
         [0.0535],
         [0.0168],
         [0.0167],
         [0.0064],
         [0.0182],
         [0.0234],
         [0.0173],
         [0.0095],
         [0.0091],
         [0.0739]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0218],
         [0.0630],
         [0.0820],
         [0.0230],
         [0.0292],
         [0.0035],
         [0.0200],
         [0.0065],
         [0.0354],
    


Evaluating:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                      | 786/1000 [00:36<00:10, 21.31it/s][A

tensor([[[0.0464],
         [0.0896],
         [0.0084],
         [0.0095],
         [0.0019],
         [0.0037],
         [0.0775],
         [0.0183],
         [0.0143],
         [0.0199],
         [0.0057],
         [0.0052]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0019],
         [0.0080],
         [0.0099],
         [0.0020],
         [0.0045],
         [0.0050],
         [0.0033],
         [0.0012],
         [0.0035],
         [0.0010],
         [0.0058],
         [0.0086]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1193],
         [0.0333],
         [0.1291],
         [0.0283],
         [0.0336],
         [0.0203],
         [0.0246],
         [0.0058],
         [0.0117],
         [0.0209],
         [0.0485],
         [0.0373]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0233],
         [0.0189],
         [0.0296],
         [0.0207],
         [0.0222],
         [0.0111],
         [0.0184],
         [0.0022],
         [0.0016],
    


Evaluating:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 789/1000 [00:37<00:09, 21.36it/s][A

tensor([[[0.0595],
         [0.0216],
         [0.0125],
         [0.0080],
         [0.0032],
         [0.0303],
         [0.0152],
         [0.0121],
         [0.0023],
         [0.0039],
         [0.0057],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0895],
         [0.1163],
         [0.1758],
         [0.0837],
         [0.0297],
         [0.0158],
         [0.0459],
         [0.0419],
         [0.0143],
         [0.0105],
         [0.0298],
         [0.0687]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0121],
         [0.0127],
         [0.0061],
         [0.0090],
         [0.0073],
         [0.0059],
         [0.0014],
         [0.0014],
         [0.0013],
         [0.0404],
         [0.0249]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0786],
         [0.0739],
         [0.0392],
         [0.0107],
         [0.0231],
         [0.0220],
         [0.0224],
         [0.0103],
         [0.0040],
    


Evaluating:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                     | 792/1000 [00:37<00:09, 21.45it/s][A
Evaluating:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                    | 795/1000 [00:37<00:09, 21.41it/s][A

tensor([[[0.0416],
         [0.0563],
         [0.0505],
         [0.0137],
         [0.0178],
         [0.0049],
         [0.0153],
         [0.0053],
         [0.0061],
         [0.0038],
         [0.0183],
         [0.0429]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0368],
         [0.0735],
         [0.0909],
         [0.0272],
         [0.0235],
         [0.0094],
         [0.0198],
         [0.0531],
         [0.0092],
         [0.0197],
         [0.0168],
         [0.0308]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0187],
         [0.0267],
         [0.0041],
         [0.0059],
         [0.0101],
         [0.0263],
         [0.0033],
         [0.0066],
         [0.0087],
         [0.0180],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0781],
         [0.0631],
         [0.0240],
         [0.0163],
         [0.0085],
         [0.0239],
         [0.0051],
         [0.0010],
    


Evaluating:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                    | 798/1000 [00:37<00:09, 21.39it/s][A

tensor([[[0.0195],
         [0.0335],
         [0.0209],
         [0.0074],
         [0.0040],
         [0.0116],
         [0.0036],
         [0.0031],
         [0.0007],
         [0.0021],
         [0.0127],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0191],
         [0.0635],
         [0.0280],
         [0.0084],
         [0.0161],
         [0.0098],
         [0.0160],
         [0.0140],
         [0.0048],
         [0.0031],
         [0.0066],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0237],
         [0.0327],
         [0.0527],
         [0.0062],
         [0.0117],
         [0.0082],
         [0.0343],
         [0.0116],
         [0.0083],
         [0.0036],
         [0.0150],
         [0.0280]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0082],
         [0.1588],
         [0.0460],
         [0.0171],
         [0.0108],
         [0.0130],
         [0.0134],
         [0.0048],
         [0.0040],
    


Evaluating:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 801/1000 [00:37<00:09, 21.36it/s][A

tensor([[[0.0261],
         [0.0161],
         [0.0459],
         [0.0076],
         [0.0056],
         [0.0270],
         [0.0057],
         [0.0050],
         [0.0084],
         [0.0056],
         [0.0133],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0379],
         [0.0220],
         [0.2591],
         [0.0120],
         [0.0295],
         [0.0065],
         [0.0218],
         [0.0084],
         [0.0109],
         [0.0096],
         [0.0121],
         [0.0551]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1459],
         [0.0375],
         [0.0243],
         [0.0390],
         [0.0387],
         [0.0149],
         [0.0487],
         [0.0310],
         [0.0192],
         [0.0305],
         [0.0136],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0267],
         [0.0365],
         [0.0586],
         [0.0220],
         [0.0675],
         [0.0163],
         [0.0207],
         [0.0040],
         [0.0151],
    


Evaluating:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                   | 804/1000 [00:37<00:09, 21.40it/s][A
Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                  | 807/1000 [00:37<00:09, 21.38it/s][A

tensor([[[0.0807],
         [0.0334],
         [0.0291],
         [0.0336],
         [0.0518],
         [0.0394],
         [0.0646],
         [0.0117],
         [0.0214],
         [0.0336],
         [0.0305],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1152],
         [0.0330],
         [0.0423],
         [0.0155],
         [0.0495],
         [0.0276],
         [0.0302],
         [0.0099],
         [0.0053],
         [0.0327],
         [0.0077],
         [0.0483]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0409],
         [0.1129],
         [0.0888],
         [0.0334],
         [0.0353],
         [0.0407],
         [0.0208],
         [0.0749],
         [0.0119],
         [0.0202],
         [0.0592],
         [0.0489]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.0346],
         [0.0345],
         [0.0098],
         [0.0307],
         [0.0076],
         [0.0316],
         [0.0072],
         [0.0112],
    


Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                  | 810/1000 [00:38<00:08, 21.27it/s][A

tensor([[[0.0381],
         [0.0195],
         [0.0517],
         [0.0136],
         [0.0195],
         [0.0272],
         [0.0068],
         [0.0043],
         [0.0014],
         [0.0088],
         [0.0100],
         [0.0467]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0159],
         [0.0092],
         [0.0187],
         [0.0061],
         [0.0105],
         [0.0116],
         [0.0112],
         [0.0051],
         [0.0388],
         [0.0052],
         [0.0031],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0344],
         [0.0422],
         [0.0767],
         [0.0194],
         [0.0280],
         [0.0274],
         [0.0284],
         [0.0073],
         [0.0099],
         [0.0109],
         [0.0201],
         [0.0565]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0110],
         [0.0416],
         [0.0271],
         [0.0079],
         [0.0015],
         [0.0167],
         [0.0014],
         [0.0035],
    


Evaluating:  81%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                 | 813/1000 [00:38<00:08, 21.34it/s][A

tensor([[[0.0036],
         [0.0108],
         [0.0691],
         [0.0055],
         [0.0041],
         [0.0020],
         [0.0034],
         [0.0009],
         [0.0027],
         [0.0024],
         [0.0108],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0801],
         [0.0486],
         [0.0934],
         [0.0203],
         [0.0298],
         [0.0205],
         [0.0487],
         [0.0463],
         [0.0210],
         [0.0169],
         [0.0055],
         [0.1196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0804],
         [0.0530],
         [0.0208],
         [0.0258],
         [0.0190],
         [0.0183],
         [0.0397],
         [0.0343],
         [0.0088],
         [0.0068],
         [0.0082],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0249],
         [0.0268],
         [0.0581],
         [0.0215],
         [0.0072],
         [0.0085],
         [0.0078],
         [0.0053],
         [0.0017],
    


Evaluating:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                 | 816/1000 [00:38<00:08, 21.34it/s][A
Evaluating:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 819/1000 [00:38<00:08, 21.43it/s][A

tensor([[[0.0041],
         [0.0036],
         [0.0209],
         [0.0090],
         [0.0179],
         [0.0027],
         [0.0024],
         [0.0029],
         [0.0024],
         [0.0031],
         [0.0116],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0577],
         [0.0357],
         [0.0274],
         [0.0112],
         [0.0567],
         [0.0141],
         [0.0356],
         [0.0705],
         [0.0088],
         [0.0311],
         [0.0128],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0434],
         [0.0670],
         [0.0120],
         [0.0097],
         [0.0143],
         [0.0075],
         [0.0091],
         [0.0175],
         [0.0057],
         [0.0145],
         [0.0270],
         [0.0291]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0117],
         [0.0171],
         [0.0125],
         [0.0067],
         [0.0076],
         [0.0021],
         [0.0082],
         [0.0158],
         [0.0035],
    


Evaluating:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 822/1000 [00:38<00:08, 21.35it/s][A

tensor([[[0.0113],
         [0.0242],
         [0.0272],
         [0.0076],
         [0.0232],
         [0.0142],
         [0.0355],
         [0.0100],
         [0.0187],
         [0.0070],
         [0.0082],
         [0.0144]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0773],
         [0.0897],
         [0.0334],
         [0.0118],
         [0.0174],
         [0.0156],
         [0.0144],
         [0.0357],
         [0.0034],
         [0.0109],
         [0.0152],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0193],
         [0.0356],
         [0.0201],
         [0.0057],
         [0.0193],
         [0.0191],
         [0.0174],
         [0.0333],
         [0.0088],
         [0.0091],
         [0.0317],
         [0.0347]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0365],
         [0.2158],
         [0.0358],
         [0.0220],
         [0.0226],
         [0.0129],
         [0.0141],
         [0.0105],
         [0.0042],
    


Evaluating:  82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                               | 825/1000 [00:38<00:08, 21.35it/s][A

tensor([[[0.0068],
         [0.0152],
         [0.0212],
         [0.0044],
         [0.0052],
         [0.0134],
         [0.0062],
         [0.0016],
         [0.0006],
         [0.0008],
         [0.0290],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0682],
         [0.0329],
         [0.0928],
         [0.0144],
         [0.0177],
         [0.0358],
         [0.0188],
         [0.0129],
         [0.0049],
         [0.0127],
         [0.0294],
         [0.0362]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0227],
         [0.0429],
         [0.0083],
         [0.0167],
         [0.0079],
         [0.0136],
         [0.0084],
         [0.0020],
         [0.0051],
         [0.0119],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0501],
         [0.0355],
         [0.0353],
         [0.0246],
         [0.0414],
         [0.0185],
         [0.0551],
         [0.0113],
         [0.0246],
    


Evaluating:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                              | 828/1000 [00:38<00:08, 21.34it/s][A
Evaluating:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                              | 831/1000 [00:39<00:07, 21.42it/s][A

tensor([[[0.0620],
         [0.0544],
         [0.0685],
         [0.0275],
         [0.0498],
         [0.0228],
         [0.0271],
         [0.0428],
         [0.0120],
         [0.0213],
         [0.0251],
         [0.1372]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0268],
         [0.0496],
         [0.0114],
         [0.0127],
         [0.0044],
         [0.0221],
         [0.0140],
         [0.0120],
         [0.0070],
         [0.0038],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0677],
         [0.0128],
         [0.0264],
         [0.0118],
         [0.0189],
         [0.0398],
         [0.0136],
         [0.0139],
         [0.0015],
         [0.0097],
         [0.0213],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0301],
         [0.0238],
         [0.0249],
         [0.0108],
         [0.0145],
         [0.0123],
         [0.0065],
         [0.0080],
         [0.0029],
    


Evaluating:  83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 834/1000 [00:39<00:07, 21.35it/s][A

tensor([[[0.2814],
         [0.0459],
         [0.0212],
         [0.0110],
         [0.0436],
         [0.0174],
         [0.0401],
         [0.0334],
         [0.0140],
         [0.0823],
         [0.0085],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0434],
         [0.0211],
         [0.0134],
         [0.0279],
         [0.0044],
         [0.0155],
         [0.0304],
         [0.0094],
         [0.0101],
         [0.0153],
         [0.0259]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0276],
         [0.0243],
         [0.0531],
         [0.0110],
         [0.0348],
         [0.0203],
         [0.0119],
         [0.0134],
         [0.0034],
         [0.0071],
         [0.0199],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0910],
         [0.0394],
         [0.0946],
         [0.0342],
         [0.0261],
         [0.0132],
         [0.0448],
         [0.1306],
         [0.0066],
    


Evaluating:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                             | 837/1000 [00:39<00:07, 21.29it/s][A

tensor([[[0.0299],
         [0.0495],
         [0.0242],
         [0.0171],
         [0.0185],
         [0.0053],
         [0.0247],
         [0.0102],
         [0.0040],
         [0.0143],
         [0.0144],
         [0.0095]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0207],
         [0.0458],
         [0.0169],
         [0.0148],
         [0.0166],
         [0.0149],
         [0.0291],
         [0.0089],
         [0.0085],
         [0.0275],
         [0.0406]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0356],
         [0.0196],
         [0.0056],
         [0.0051],
         [0.0035],
         [0.0220],
         [0.0042],
         [0.0025],
         [0.0122],
         [0.0190],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0440],
         [0.0384],
         [0.0354],
         [0.0282],
         [0.0269],
         [0.0209],
         [0.0102],
         [0.0142],
         [0.0090],
    


Evaluating:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                            | 840/1000 [00:39<00:07, 21.31it/s][A
Evaluating:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 843/1000 [00:39<00:07, 21.31it/s][A

tensor([[[0.0112],
         [0.0193],
         [0.0459],
         [0.0113],
         [0.0099],
         [0.0047],
         [0.0067],
         [0.0084],
         [0.0017],
         [0.0025],
         [0.0186],
         [0.0309]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0035],
         [0.0187],
         [0.0172],
         [0.0086],
         [0.0017],
         [0.0031],
         [0.0024],
         [0.0017],
         [0.0014],
         [0.0014],
         [0.0169],
         [0.0042]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0746],
         [0.0421],
         [0.1632],
         [0.0265],
         [0.0161],
         [0.0184],
         [0.0644],
         [0.0065],
         [0.0103],
         [0.0129],
         [0.0363],
         [0.1502]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1076],
         [0.0458],
         [0.0426],
         [0.0220],
         [0.0153],
         [0.0098],
         [0.0497],
         [0.0529],
         [0.0108],
    


Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 846/1000 [00:39<00:07, 21.32it/s][A

tensor([[[0.0076],
         [0.0276],
         [0.0631],
         [0.0263],
         [0.0193],
         [0.0086],
         [0.0116],
         [0.0047],
         [0.0112],
         [0.0023],
         [0.0117],
         [0.0525]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0362],
         [0.0733],
         [0.0511],
         [0.0391],
         [0.0208],
         [0.0085],
         [0.0205],
         [0.0093],
         [0.0137],
         [0.0039],
         [0.0152],
         [0.0584]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0338],
         [0.0922],
         [0.0352],
         [0.0305],
         [0.0138],
         [0.0129],
         [0.0220],
         [0.0165],
         [0.0047],
         [0.0392],
         [0.0454]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0449],
         [0.1761],
         [0.1100],
         [0.0546],
         [0.0187],
         [0.0139],
         [0.0598],
         [0.0108],
         [0.0427],
    


Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                           | 849/1000 [00:39<00:07, 21.23it/s][A

tensor([[[0.0708],
         [0.1783],
         [0.0407],
         [0.0095],
         [0.0093],
         [0.0146],
         [0.0289],
         [0.0265],
         [0.0066],
         [0.0132],
         [0.0030],
         [0.0309]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0547],
         [0.0674],
         [0.0381],
         [0.0065],
         [0.0198],
         [0.0174],
         [0.0128],
         [0.0173],
         [0.0041],
         [0.0099],
         [0.0253],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0747],
         [0.1717],
         [0.0483],
         [0.0201],
         [0.0146],
         [0.0046],
         [0.0466],
         [0.0402],
         [0.0232],
         [0.0218],
         [0.0185],
         [0.0303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1116],
         [0.0328],
         [0.0611],
         [0.0152],
         [0.0163],
         [0.0315],
         [0.0461],
         [0.0668],
         [0.0091],
    


Evaluating:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 852/1000 [00:40<00:06, 21.22it/s][A
Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 855/1000 [00:40<00:06, 21.34it/s][A

tensor([[[0.0582],
         [0.0345],
         [0.1032],
         [0.0272],
         [0.0495],
         [0.0189],
         [0.0097],
         [0.0081],
         [0.0042],
         [0.0068],
         [0.0353],
         [0.0621]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0164],
         [0.0244],
         [0.0646],
         [0.0105],
         [0.0109],
         [0.0173],
         [0.0180],
         [0.0086],
         [0.0058],
         [0.0038],
         [0.0929],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0679],
         [0.0279],
         [0.0085],
         [0.0157],
         [0.0127],
         [0.0139],
         [0.0281],
         [0.0054],
         [0.0047],
         [0.0142],
         [0.0024],
         [0.0104]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0655],
         [0.0297],
         [0.0236],
         [0.0170],
         [0.0351],
         [0.0402],
         [0.0165],
         [0.0115],
         [0.0060],
    


Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                         | 858/1000 [00:40<00:06, 21.25it/s][A


tensor([[[0.0271],
         [0.0113],
         [0.0351],
         [0.0078],
         [0.0224],
         [0.0063],
         [0.0114],
         [0.0032],
         [0.0072],
         [0.0123],
         [0.0035],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0367],
         [0.0330],
         [0.0270],
         [0.0128],
         [0.0086],
         [0.0182],
         [0.0170],
         [0.0031],
         [0.0032],
         [0.0056],
         [0.0311],
         [0.0073]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0258],
         [0.0458],
         [0.0868],
         [0.0128],
         [0.0335],
         [0.0355],
         [0.0216],
         [0.0329],
         [0.0218],
         [0.0069],
         [0.0569],
         [0.0482]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0427],
         [0.0421],
         [0.0492],
         [0.0174],
         [0.0234],
         [0.0530],
         [0.1260],
         [0.0142],
         [0.0340],
   


Evaluating:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                         | 861/1000 [00:40<00:06, 21.32it/s][A

tensor([[[0.0428],
         [0.0213],
         [0.0194],
         [0.0097],
         [0.0143],
         [0.0094],
         [0.0092],
         [0.0061],
         [0.0011],
         [0.0138],
         [0.0139],
         [0.0107]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0388],
         [0.0644],
         [0.0092],
         [0.0221],
         [0.0029],
         [0.0053],
         [0.0063],
         [0.0015],
         [0.0031],
         [0.0351],
         [0.0341]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0311],
         [0.1293],
         [0.0439],
         [0.0215],
         [0.0188],
         [0.0180],
         [0.0316],
         [0.0083],
         [0.0085],
         [0.1384],
         [0.0361]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0142],
         [0.0063],
         [0.0233],
         [0.0371],
         [0.0091],
         [0.0045],
         [0.0143],
         [0.0060],
         [0.0039],
    


Evaluating:  86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 864/1000 [00:40<00:06, 21.37it/s][A
Evaluating:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 867/1000 [00:40<00:06, 21.36it/s][A

tensor([[[0.0313],
         [0.0129],
         [0.0380],
         [0.0148],
         [0.0267],
         [0.0414],
         [0.0668],
         [0.0075],
         [0.0051],
         [0.0066],
         [0.0248],
         [0.0545]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0524],
         [0.1533],
         [0.0457],
         [0.0239],
         [0.0165],
         [0.0122],
         [0.0255],
         [0.0267],
         [0.0100],
         [0.0090],
         [0.0165],
         [0.0650]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0442],
         [0.0185],
         [0.0434],
         [0.0184],
         [0.0711],
         [0.0138],
         [0.0318],
         [0.0047],
         [0.0074],
         [0.0128],
         [0.0179],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0219],
         [0.0166],
         [0.1615],
         [0.0235],
         [0.0647],
         [0.0121],
         [0.0133],
         [0.0028],
         [0.0214],
    


Evaluating:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                       | 870/1000 [00:40<00:06, 21.30it/s][A

tensor([[[0.0390],
         [0.0702],
         [0.0390],
         [0.0262],
         [0.0219],
         [0.0266],
         [0.0220],
         [0.0150],
         [0.0088],
         [0.0124],
         [0.0139],
         [0.0262]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0125],
         [0.0471],
         [0.0728],
         [0.0085],
         [0.0163],
         [0.0264],
         [0.0101],
         [0.0117],
         [0.0094],
         [0.0066],
         [0.0144],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0537],
         [0.0267],
         [0.1528],
         [0.0201],
         [0.0312],
         [0.0062],
         [0.0124],
         [0.0090],
         [0.0172],
         [0.0056],
         [0.0137],
         [0.0659]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0568],
         [0.0368],
         [0.0105],
         [0.0450],
         [0.0131],
         [0.0785],
         [0.0062],
         [0.0116],
    


Evaluating:  87%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 873/1000 [00:41<00:05, 21.18it/s][A

tensor([[[0.0175],
         [0.0190],
         [0.1135],
         [0.0214],
         [0.0406],
         [0.0168],
         [0.0143],
         [0.0145],
         [0.0065],
         [0.0039],
         [0.1640],
         [0.0494]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0080],
         [0.0339],
         [0.0360],
         [0.0047],
         [0.0052],
         [0.0061],
         [0.0020],
         [0.0022],
         [0.0007],
         [0.0006],
         [0.0407],
         [0.0485]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0436],
         [0.0313],
         [0.0510],
         [0.0278],
         [0.0231],
         [0.0152],
         [0.0241],
         [0.0148],
         [0.0051],
         [0.0102],
         [0.0347],
         [0.0341]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0327],
         [0.0299],
         [0.1484],
         [0.0313],
         [0.0418],
         [0.0171],
         [0.0059],
         [0.0040],
         [0.0058],
    


Evaluating:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 876/1000 [00:41<00:05, 21.27it/s][A
Evaluating:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 879/1000 [00:41<00:05, 21.29it/s][A

tensor([[[0.0347],
         [0.0639],
         [0.0278],
         [0.0162],
         [0.0107],
         [0.0322],
         [0.0167],
         [0.0203],
         [0.0095],
         [0.0087],
         [0.0442],
         [0.0251]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0205],
         [0.0401],
         [0.0282],
         [0.0183],
         [0.0275],
         [0.0108],
         [0.0130],
         [0.0065],
         [0.0017],
         [0.0055],
         [0.0502],
         [0.0464]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0106],
         [0.0442],
         [0.0175],
         [0.0060],
         [0.0080],
         [0.0110],
         [0.0015],
         [0.0016],
         [0.0052],
         [0.0138],
         [0.0537]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0143],
         [0.0873],
         [0.0077],
         [0.0256],
         [0.0049],
         [0.0149],
         [0.0068],
         [0.0066],
    


Evaluating:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 882/1000 [00:41<00:05, 21.30it/s][A

tensor([[[0.0424],
         [0.0386],
         [0.0277],
         [0.0120],
         [0.0074],
         [0.0092],
         [0.0253],
         [0.0100],
         [0.0126],
         [0.0181],
         [0.0074],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0479],
         [0.0198],
         [0.0421],
         [0.0141],
         [0.0375],
         [0.0085],
         [0.0316],
         [0.0135],
         [0.0159],
         [0.0127],
         [0.0096],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0059],
         [0.0063],
         [0.1002],
         [0.0178],
         [0.0144],
         [0.0025],
         [0.0031],
         [0.0015],
         [0.0027],
         [0.0026],
         [0.0151],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0231],
         [0.0407],
         [0.0159],
         [0.0151],
         [0.0088],
         [0.0410],
         [0.0303],
         [0.0122],
    


Evaluating:  88%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 885/1000 [00:41<00:05, 21.36it/s][A

tensor([[[0.0299],
         [0.0508],
         [0.0426],
         [0.0109],
         [0.0199],
         [0.0258],
         [0.0337],
         [0.0269],
         [0.0286],
         [0.0172],
         [0.0199],
         [0.0341]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0793],
         [0.0424],
         [0.0266],
         [0.0266],
         [0.0211],
         [0.0741],
         [0.0112],
         [0.0021],
         [0.0033],
         [0.0240],
         [0.0357],
         [0.0532]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0202],
         [0.0119],
         [0.0271],
         [0.0137],
         [0.0443],
         [0.0110],
         [0.0072],
         [0.0183],
         [0.0077],
         [0.0054],
         [0.0181],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0538],
         [0.0726],
         [0.1358],
         [0.0502],
         [0.0330],
         [0.0102],
         [0.0314],
         [0.0091],
         [0.0078],
    


Evaluating:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 888/1000 [00:41<00:05, 21.35it/s][A
Evaluating:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                   | 891/1000 [00:41<00:05, 21.30it/s][A

tensor([[[0.0314],
         [0.0175],
         [0.0240],
         [0.0082],
         [0.0090],
         [0.0094],
         [0.0121],
         [0.0059],
         [0.0089],
         [0.0052],
         [0.0102],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0106],
         [0.0357],
         [0.0082],
         [0.0234],
         [0.0054],
         [0.0064],
         [0.0143],
         [0.0046],
         [0.0096],
         [0.0070],
         [0.0145]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0232],
         [0.0282],
         [0.0262],
         [0.0121],
         [0.0194],
         [0.0089],
         [0.0173],
         [0.0096],
         [0.0275],
         [0.0107],
         [0.0137],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0359],
         [0.0378],
         [0.0597],
         [0.0284],
         [0.0799],
         [0.0265],
         [0.0204],
         [0.0608],
         [0.0195],
    


Evaluating:  89%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 894/1000 [00:42<00:04, 21.35it/s][A

tensor([[[0.0205],
         [0.0304],
         [0.0701],
         [0.0116],
         [0.0109],
         [0.0082],
         [0.0058],
         [0.0041],
         [0.0005],
         [0.0027],
         [0.0409],
         [0.0256]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0315],
         [0.0340],
         [0.0140],
         [0.0234],
         [0.0146],
         [0.0212],
         [0.0136],
         [0.0134],
         [0.0052],
         [0.0049],
         [0.0326],
         [0.0293]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0785],
         [0.0422],
         [0.1154],
         [0.0293],
         [0.0663],
         [0.0222],
         [0.0228],
         [0.0195],
         [0.0030],
         [0.0085],
         [0.0604],
         [0.0790]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0365],
         [0.0160],
         [0.0537],
         [0.0153],
         [0.0400],
         [0.0046],
         [0.0149],
         [0.0081],
         [0.0050],
    


Evaluating:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 897/1000 [00:42<00:04, 21.21it/s][A

tensor([[[0.1242],
         [0.0551],
         [0.0737],
         [0.0230],
         [0.0402],
         [0.0072],
         [0.0660],
         [0.0232],
         [0.0142],
         [0.0509],
         [0.0127],
         [0.0692]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0140],
         [0.0293],
         [0.0702],
         [0.0169],
         [0.0375],
         [0.0037],
         [0.0063],
         [0.0082],
         [0.0033],
         [0.0041],
         [0.0157],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0094],
         [0.0325],
         [0.0043],
         [0.0022],
         [0.0005],
         [0.0025],
         [0.0088],
         [0.0009],
         [0.0009],
         [0.0023],
         [0.0102],
         [0.0041]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1452],
         [0.0209],
         [0.0122],
         [0.0203],
         [0.0189],
         [0.0089],
         [0.0996],
         [0.0112],
         [0.0160],
    


Evaluating:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 900/1000 [00:42<00:04, 21.30it/s][A
Evaluating:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 903/1000 [00:42<00:04, 21.35it/s][A

tensor([[[0.0352],
         [0.0512],
         [0.0387],
         [0.0194],
         [0.0198],
         [0.0157],
         [0.0141],
         [0.0408],
         [0.0151],
         [0.0070],
         [0.0128],
         [0.0386]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0114],
         [0.0404],
         [0.1723],
         [0.0220],
         [0.0461],
         [0.0140],
         [0.0054],
         [0.0165],
         [0.0082],
         [0.0069],
         [0.0313],
         [0.0449]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0203],
         [0.0347],
         [0.0275],
         [0.0149],
         [0.0170],
         [0.0141],
         [0.0182],
         [0.0049],
         [0.0044],
         [0.0054],
         [0.0194],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0205],
         [0.0342],
         [0.0121],
         [0.0123],
         [0.0125],
         [0.0168],
         [0.0138],
         [0.0094],
    


Evaluating:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 906/1000 [00:42<00:04, 21.35it/s][A

tensor([[[0.0365],
         [0.0440],
         [0.1504],
         [0.0245],
         [0.0379],
         [0.0283],
         [0.0144],
         [0.0326],
         [0.0081],
         [0.0161],
         [0.0496],
         [0.0556]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1167],
         [0.0966],
         [0.0679],
         [0.0336],
         [0.1112],
         [0.0352],
         [0.0904],
         [0.0370],
         [0.0179],
         [0.0370],
         [0.0891],
         [0.0447]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0101],
         [0.0410],
         [0.0311],
         [0.0976],
         [0.0430],
         [0.0166],
         [0.0079],
         [0.0107],
         [0.0111],
         [0.0123],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0525],
         [0.0940],
         [0.0384],
         [0.0272],
         [0.0117],
         [0.0203],
         [0.0213],
         [0.0139],
         [0.0028],
    


Evaluating:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 909/1000 [00:42<00:04, 21.25it/s][A

tensor([[[0.0420],
         [0.0244],
         [0.0138],
         [0.0137],
         [0.0085],
         [0.0277],
         [0.0327],
         [0.0103],
         [0.0034],
         [0.0213],
         [0.0485],
         [0.0420]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0323],
         [0.0254],
         [0.0130],
         [0.0057],
         [0.0181],
         [0.0081],
         [0.0087],
         [0.0035],
         [0.0030],
         [0.0097],
         [0.0052],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0417],
         [0.0906],
         [0.0862],
         [0.0653],
         [0.0414],
         [0.0144],
         [0.0470],
         [0.0310],
         [0.0766],
         [0.0121],
         [0.0565],
         [0.0798]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0116],
         [0.0156],
         [0.0158],
         [0.0069],
         [0.0417],
         [0.0241],
         [0.0061],
         [0.0103],
         [0.0013],
    


Evaluating:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 912/1000 [00:42<00:04, 21.32it/s][A
Evaluating:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 915/1000 [00:42<00:03, 21.37it/s][A

tensor([[[0.0293],
         [0.0346],
         [0.1136],
         [0.0157],
         [0.0351],
         [0.0074],
         [0.0247],
         [0.0450],
         [0.0612],
         [0.0282],
         [0.0366],
         [0.0414]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0455],
         [0.0476],
         [0.0179],
         [0.0099],
         [0.0183],
         [0.0054],
         [0.0170],
         [0.0191],
         [0.0019],
         [0.0137],
         [0.0191],
         [0.0230]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0150],
         [0.0124],
         [0.0618],
         [0.0260],
         [0.0059],
         [0.0054],
         [0.0015],
         [0.0066],
         [0.0008],
         [0.0026],
         [0.0265],
         [0.0671]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0476],
         [0.0362],
         [0.0096],
         [0.0155],
         [0.0070],
         [0.0097],
         [0.0116],
         [0.0424],
    


Evaluating:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 918/1000 [00:43<00:03, 21.30it/s][A

tensor([[[0.0424],
         [0.0343],
         [0.0372],
         [0.0193],
         [0.0167],
         [0.0124],
         [0.0201],
         [0.0068],
         [0.0056],
         [0.0106],
         [0.0130],
         [0.0154]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0646],
         [0.0108],
         [0.0790],
         [0.0123],
         [0.0169],
         [0.0074],
         [0.0357],
         [0.0020],
         [0.0111],
         [0.0091],
         [0.0223],
         [0.0683]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1246],
         [0.1206],
         [0.0406],
         [0.0173],
         [0.0258],
         [0.0068],
         [0.1155],
         [0.0187],
         [0.0068],
         [0.0170],
         [0.0065],
         [0.05


Evaluating:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 921/1000 [00:43<00:03, 21.32it/s][A

tensor([[[0.0415],
         [0.0363],
         [0.0109],
         [0.0102],
         [0.0183],
         [0.0153],
         [0.0171],
         [0.0096],
         [0.0083],
         [0.0100],
         [0.0041],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0282],
         [0.0641],
         [0.0502],
         [0.0143],
         [0.0232],
         [0.0180],
         [0.0265],
         [0.0174],
         [0.0124],
         [0.0062],
         [0.0201],
         [0.0397]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0123],
         [0.0159],
         [0.0388],
         [0.0135],
         [0.0094],
         [0.0055],
         [0.0085],
         [0.0410],
         [0.0119],
         [0.0185],
         [0.0038],
         [0.0637]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0298],
         [0.0283],
         [0.0443],
         [0.0202],
         [0.0157],
         [0.0069],
         [0.0071],
         [0.0070],
         [0.0029],
    


Evaluating:  92%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 924/1000 [00:43<00:03, 21.28it/s][A
Evaluating:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 927/1000 [00:43<00:03, 21.38it/s][A

tensor([[[0.0129],
         [0.0202],
         [0.0110],
         [0.0057],
         [0.0148],
         [0.0073],
         [0.0185],
         [0.0008],
         [0.0082],
         [0.0073],
         [0.0204],
         [0.0061]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0215],
         [0.0870],
         [0.0308],
         [0.0812],
         [0.0097],
         [0.0188],
         [0.0098],
         [0.0199],
         [0.0159],
         [0.0246],
         [0.0317]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0127],
         [0.0419],
         [0.0113],
         [0.0190],
         [0.0296],
         [0.0135],
         [0.0095],
         [0.0075],
         [0.0077],
         [0.0221],
         [0.0367]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0160],
         [0.0135],
         [0.0177],
         [0.0081],
         [0.0096],
         [0.0072],
         [0.0204],
         [0.0090],
         [0.0063],
    


Evaluating:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 930/1000 [00:43<00:03, 21.36it/s][A

tensor([[[0.0293],
         [0.0694],
         [0.0352],
         [0.0195],
         [0.0266],
         [0.0077],
         [0.0127],
         [0.0413],
         [0.0036],
         [0.0118],
         [0.0588],
         [0.0360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0599],
         [0.0637],
         [0.0615],
         [0.0561],
         [0.0293],
         [0.0083],
         [0.0141],
         [0.0161],
         [0.0734],
         [0.0165],
         [0.0102],
         [0.0351]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.1055],
         [0.1279],
         [0.0326],
         [0.0148],
         [0.0172],
         [0.0243],
         [0.0082],
         [0.0030],
         [0.0125],
         [0.0821],
         [0.0599]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0152],
         [0.0174],
         [0.0119],
         [0.0170],
         [0.0140],
         [0.0116],
         [0.0062],
         [0.0017],
    


Evaluating:  93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 933/1000 [00:43<00:03, 21.26it/s][A

tensor([[[0.0413],
         [0.0562],
         [0.0277],
         [0.0102],
         [0.0133],
         [0.0078],
         [0.0190],
         [0.0309],
         [0.0098],
         [0.0172],
         [0.0177],
         [0.0407]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0483],
         [0.0717],
         [0.0190],
         [0.0595],
         [0.0131],
         [0.0204],
         [0.0328],
         [0.0138],
         [0.0150],
         [0.0211],
         [0.0306]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0845],
         [0.0933],
         [0.0470],
         [0.0199],
         [0.0187],
         [0.0095],
         [0.0538],
         [0.0293],
         [0.0076],
         [0.0235],
         [0.0140],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0088],
         [0.0122],
         [0.0796],
         [0.0218],
         [0.0282],
         [0.0073],
         [0.0047],
         [0.0139],
         [0.0059],
    


Evaluating:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 936/1000 [00:43<00:03, 21.28it/s][A
Evaluating:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████           | 939/1000 [00:44<00:02, 21.25it/s][A

tensor([[[0.1163],
         [0.0566],
         [0.0615],
         [0.0276],
         [0.0478],
         [0.0211],
         [0.0406],
         [0.0504],
         [0.0144],
         [0.0224],
         [0.0125],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0048],
         [0.0100],
         [0.0140],
         [0.0104],
         [0.0041],
         [0.0090],
         [0.0069],
         [0.0027],
         [0.0019],
         [0.0014],
         [0.0248],
         [0.0059]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0526],
         [0.1154],
         [0.0710],
         [0.0203],
         [0.0335],
         [0.0149],
         [0.0235],
         [0.0600],
         [0.0097],
         [0.0213],
         [0.0187],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0353],
         [0.0402],
         [0.0541],
         [0.0500],
         [0.0728],
         [0.0094],
         [0.0355],
         [0.0091],
         [0.0340],
    


Evaluating:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌          | 942/1000 [00:44<00:02, 21.22it/s][A

tensor([[[0.0136],
         [0.0379],
         [0.0396],
         [0.0050],
         [0.0216],
         [0.0086],
         [0.0184],
         [0.0380],
         [0.0190],
         [0.0172],
         [0.0146],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0391],
         [0.0370],
         [0.0120],
         [0.0169],
         [0.0134],
         [0.0269],
         [0.0097],
         [0.0029],
         [0.0023],
         [0.0026],
         [0.0377],
         [0.0267]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0432],
         [0.0738],
         [0.0881],
         [0.0160],
         [0.0373],
         [0.0054],
         [0.0267],
         [0.0080],
         [0.0183],
         [0.0162],
         [0.0297],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0519],
         [0.0464],
         [0.0220],
         [0.0206],
         [0.0221],
         [0.0102],
         [0.0258],
         [0.0400],
         [0.0109],
    


Evaluating:  94%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏         | 945/1000 [00:44<00:02, 21.17it/s][A
Evaluating:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋         | 948/1000 [00:44<00:02, 21.31it/s]

reg attention sum per layer
tensor([[[0.0280],
         [0.0150],
         [0.0150],
         [0.0185],
         [0.0166],
         [0.0032],
         [0.0289],
         [0.0072],
         [0.0052],
         [0.0073],
         [0.0060],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0282],
         [0.0318],
         [0.0238],
         [0.0081],
         [0.0471],
         [0.0200],
         [0.0445],
         [0.0162],
         [0.0130],
         [0.0136],
         [0.0053],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0607],
         [0.0736],
         [0.0260],
         [0.0107],
         [0.0169],
         [0.0200],
         [0.0687],
         [0.0265],
         [0.0181],
         [0.0119],
         [0.0089],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0149],
         [0.0352],
         [0.0645],
         [0.0079],
         [0.0126],
         [0.0129],
         [0.0315],
         [0.01

[A
Evaluating:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏        | 951/1000 [00:44<00:02, 21.41it/s][A

reg attention sum per layer
tensor([[[0.0587],
         [0.0698],
         [0.1154],
         [0.0541],
         [0.0291],
         [0.0075],
         [0.0470],
         [0.0214],
         [0.0190],
         [0.0259],
         [0.0667],
         [0.0479]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0997],
         [0.0467],
         [0.0210],
         [0.0224],
         [0.0070],
         [0.0242],
         [0.0198],
         [0.0155],
         [0.0431],
         [0.0615],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0539],
         [0.0778],
         [0.0468],
         [0.0226],
         [0.0280],
         [0.0072],
         [0.0186],
         [0.0355],
         [0.0121],
         [0.0103],
         [0.0220],
         [0.0223]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0184],
         [0.0149],
         [0.0077],
         [0.0047],
         [0.0051],
         [0.0188],
         [0.0067],
         [0.02


Evaluating:  95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊        | 954/1000 [00:44<00:02, 21.34it/s][A

tensor([[[0.0186],
         [0.0261],
         [0.0980],
         [0.0190],
         [0.0410],
         [0.0219],
         [0.0126],
         [0.0168],
         [0.0138],
         [0.0119],
         [0.0481],
         [0.0183]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0218],
         [0.0563],
         [0.0151],
         [0.0159],
         [0.0078],
         [0.0106],
         [0.0078],
         [0.0664],
         [0.0050],
         [0.0272],
         [0.0974]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2810],
         [0.0583],
         [0.0236],
         [0.0250],
         [0.0352],
         [0.0121],
         [0.0204],
         [0.0194],
         [0.0144],
         [0.0125],
         [0.0153],
         [0.0192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0328],
         [0.0721],
         [0.1048],
         [0.0417],
         [0.0221],
         [0.0121],
         [0.0389],
         [0.0025],
         [0.0127],
    


Evaluating:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎       | 957/1000 [00:44<00:02, 21.29it/s][A
Evaluating:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊       | 960/1000 [00:45<00:01, 21.39it/s][A

tensor([[[0.0063],
         [0.0078],
         [0.0609],
         [0.0188],
         [0.0077],
         [0.0025],
         [0.0098],
         [0.0126],
         [0.0033],
         [0.0015],
         [0.0300],
         [0.0465]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0953],
         [0.0840],
         [0.0402],
         [0.0363],
         [0.0373],
         [0.0123],
         [0.1563],
         [0.0124],
         [0.0167],
         [0.0299],
         [0.0500],
         [0.0404]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0064],
         [0.0141],
         [0.0337],
         [0.0154],
         [0.0348],
         [0.0080],
         [0.0032],
         [0.0072],
         [0.0071],
         [0.0036],
         [0.0106],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0059],
         [0.0434],
         [0.0112],
         [0.0150],
         [0.0338],
         [0.0108],
         [0.0048],
         [0.0052],
    


Evaluating:  96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍      | 963/1000 [00:45<00:01, 21.38it/s][A

tensor([[[0.0329],
         [0.0256],
         [0.0240],
         [0.0173],
         [0.0092],
         [0.0176],
         [0.0079],
         [0.0012],
         [0.0014],
         [0.0161],
         [0.0359],
         [0.0099]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.0239],
         [0.0668],
         [0.0142],
         [0.0231],
         [0.0134],
         [0.0541],
         [0.0238],
         [0.0442],
         [0.0174],
         [0.0079],
         [0.0214]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0277],
         [0.0398],
         [0.0362],
         [0.0095],
         [0.0089],
         [0.0061],
         [0.0327],
         [0.0195],
         [0.0150],
         [0.0101],
         [0.0061],
         [0.0152]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0382],
         [0.0236],
         [0.0345],
         [0.0154],
         [0.0189],
         [0.0073],
         [0.0312],
         [0.0153],
         [0.0070],
    


Evaluating:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉      | 966/1000 [00:45<00:01, 21.32it/s][A

tensor([[[0.0384],
         [0.0116],
         [0.0240],
         [0.0100],
         [0.0056],
         [0.0073],
         [0.0098],
         [0.0006],
         [0.0018],
         [0.0023],
         [0.0096],
         [0.0114]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0548],
         [0.1127],
         [0.1054],
         [0.0337],
         [0.0509],
         [0.0167],
         [0.0108],
         [0.0496],
         [0.0068],
         [0.0121],
         [0.0367],
         [0.0516]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0148],
         [0.0581],
         [0.0291],
         [0.0152],
         [0.0359],
         [0.0288],
         [0.0062],
         [0.0117],
         [0.0011],
         [0.0015],
         [0.1272],
         [0.0658]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0169],
         [0.0430],
         [0.0200],
         [0.0214],
         [0.0035],
         [0.0229],
         [0.0257],
         [0.0119],
    


Evaluating:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍     | 969/1000 [00:45<00:01, 21.28it/s][A
Evaluating:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉     | 972/1000 [00:45<00:01, 21.38it/s][A

tensor([[[0.0415],
         [0.0356],
         [0.0425],
         [0.0117],
         [0.0158],
         [0.0561],
         [0.0130],
         [0.0039],
         [0.0021],
         [0.0094],
         [0.0260],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0457],
         [0.0228],
         [0.0134],
         [0.0165],
         [0.0163],
         [0.0086],
         [0.0097],
         [0.0149],
         [0.0048],
         [0.0171],
         [0.0164],
         [0.0154]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0347],
         [0.0722],
         [0.0198],
         [0.0070],
         [0.0625],
         [0.0676],
         [0.0245],
         [0.0103],
         [0.0105],
         [0.0112],
         [0.0538],
         [0.0143]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0438],
         [0.0176],
         [0.0176],
         [0.0082],
         [0.0458],
         [0.0101],
         [0.0262],
         [0.0076],
         [0.0075],
    


Evaluating:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌    | 975/1000 [00:45<00:01, 21.36it/s][A

tensor([[[0.0154],
         [0.0156],
         [0.0836],
         [0.0061],
         [0.0150],
         [0.0139],
         [0.0296],
         [0.0012],
         [0.0022],
         [0.0029],
         [0.0262],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0255],
         [0.0191],
         [0.1039],
         [0.0083],
         [0.0204],
         [0.0140],
         [0.0185],
         [0.0074],
         [0.0103],
         [0.0053],
         [0.0198],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0334],
         [0.0441],
         [0.0141],
         [0.0059],
         [0.0066],
         [0.0104],
         [0.0424],
         [0.0110],
         [0.0201],
         [0.0079],
         [0.0015],
         [0.0314]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0255],
         [0.0277],
         [0.0278],
         [0.0258],
         [0.0437],
         [0.0455],
         [0.0230],
         [0.0157],
         [0.0056],
    


Evaluating:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████    | 978/1000 [00:45<00:01, 21.26it/s][A

tensor([[[0.0184],
         [0.0565],
         [0.0308],
         [0.0113],
         [0.0092],
         [0.0061],
         [0.0220],
         [0.0014],
         [0.0018],
         [0.0069],
         [0.0331],
         [0.0224]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0367],
         [0.1399],
         [0.0311],
         [0.0240],
         [0.0129],
         [0.0119],
         [0.0075],
         [0.0067],
         [0.0077],
         [0.0513],
         [0.0244]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0959],
         [0.0413],
         [0.0456],
         [0.0117],
         [0.0150],
         [0.0118],
         [0.0252],
         [0.0276],
         [0.0115],
         [0.0103],
         [0.0222],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0348],
         [0.1055],
         [0.0588],
         [0.0264],
         [0.0160],
         [0.0055],
         [0.0134],
         [0.0104],
         [0.0040],
    


Evaluating:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌   | 981/1000 [00:46<00:00, 21.33it/s][A
Evaluating:  98%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 984/1000 [00:46<00:00, 21.33it/s][A

tensor([[[0.0333],
         [0.0259],
         [0.0155],
         [0.0063],
         [0.0148],
         [0.0139],
         [0.0024],
         [0.0026],
         [0.0016],
         [0.0025],
         [0.0040],
         [0.0072]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0431],
         [0.0390],
         [0.0815],
         [0.0510],
         [0.0795],
         [0.0069],
         [0.0221],
         [0.0160],
         [0.0282],
         [0.0121],
         [0.0306],
         [0.0481]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0281],
         [0.0256],
         [0.0491],
         [0.0161],
         [0.0180],
         [0.0109],
         [0.0077],
         [0.0108],
         [0.0042],
         [0.0116],
         [0.0188],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0170],
         [0.0334],
         [0.2516],
         [0.0418],
         [0.0377],
         [0.0126],
         [0.0143],
         [0.0209],
         [0.0160],
    


Evaluating:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋  | 987/1000 [00:46<00:00, 21.29it/s][A

tensor([[[0.1425],
         [0.1981],
         [0.0421],
         [0.0278],
         [0.0095],
         [0.0154],
         [0.0473],
         [0.0385],
         [0.0068],
         [0.0329],
         [0.0046],
         [0.0232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0299],
         [0.0779],
         [0.0223],
         [0.0170],
         [0.0050],
         [0.0225],
         [0.0053],
         [0.0065],
         [0.0041],
         [0.0461],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0184],
         [0.0282],
         [0.0162],
         [0.0166],
         [0.0122],
         [0.0105],
         [0.0208],
         [0.0134],
         [0.0048],
         [0.0093],
         [0.0327],
         [0.0032]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0179],
         [0.0176],
         [0.0077],
         [0.0023],
         [0.0066],
         [0.0047],
         [0.0089],
         [0.0013],
         [0.0015],
    


Evaluating:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 990/1000 [00:46<00:00, 21.30it/s][A
Evaluating:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 993/1000 [00:46<00:00, 21.22it/s][A

reg attention sum per layer
tensor([[[0.0551],
         [0.0298],
         [0.0279],
         [0.0103],
         [0.0158],
         [0.0051],
         [0.0834],
         [0.0042],
         [0.0058],
         [0.0129],
         [0.0147],
         [0.0204]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0131],
         [0.0457],
         [0.0214],
         [0.0152],
         [0.0034],
         [0.0049],
         [0.0087],
         [0.0050],
         [0.0036],
         [0.0169],
         [0.0301]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0050],
         [0.0084],
         [0.0185],
         [0.0041],
         [0.0083],
         [0.0036],
         [0.0022],
         [0.0010],
         [0.0005],
         [0.0010],
         [0.0293],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0596],
         [0.0476],
         [0.0430],
         [0.0284],
         [0.0235],
         [0.0183],
         [0.0073],
         [0.08


Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎| 996/1000 [00:46<00:00, 21.30it/s][A

tensor([[[0.0577],
         [0.0150],
         [0.1352],
         [0.0053],
         [0.0564],
         [0.0104],
         [0.0618],
         [0.0009],
         [0.0118],
         [0.0167],
         [0.0120],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1120],
         [0.0639],
         [0.0907],
         [0.0415],
         [0.0289],
         [0.0422],
         [0.0531],
         [0.1143],
         [0.0081],
         [0.0622],
         [0.0285],
         [0.0709]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0344],
         [0.0147],
         [0.0311],
         [0.0576],
         [0.0444],
         [0.0223],
         [0.0206],
         [0.0045],
         [0.0260],
         [0.0030],
         [0.0544],
         [0.0386]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0173],
         [0.0121],
         [0.0498],
         [0.0159],
         [0.0102],
         [0.0036],
         [0.0069],
         [0.0274],
         [0.0022],
    


Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:46<00:00, 21.28it/s][A
01/28/2021 14:27:03 - INFO - __main__ -   ***** dev results *****
01/28/2021 14:27:03 - INFO - __main__ -     acc = 0.099
01/28/2021 14:27:03 - INFO - __main__ -     ar_eval_loss = 0.0
01/28/2021 14:27:03 - INFO - __main__ -     attmaxidx = [0.9693290591239929, 0.9699538946151733, 0.9792768359184265, 0.9724921584129333, 0.9697535037994385, 0.987305760383606, 0.9636990427970886, 0.988562822341919, 0.9659745097160339, 0.9961649775505066, 0.9785057902336121, 0.9845894575119019, 0.9624667167663574, 0.9610791206359863, 0.9799982905387878, 0.9793119430541992, 0.965787410736084, 0.9807645678520203, 0.9899136424064636, 0.9671565294265747, 0.9926559925079346, 1.0, 0.9709612727165222, 0.9837052822113037, 0.992339015007019, 0.977260947227478, 0.9805528521537781, 0.

01/28/2021 14:27:03 - INFO - __main__ -     avg_max_attention_mass = 0.0787351134289056
01/28/2021 14:27:03 - INFO - __main__ -     avg_max_attention_mass_non_reg = 0.9951181895136834
01/28/2021 14:27:03 - INFO - __main__ -     avg_max_value_norm = 0.08244639066141099
01/28/2021 14:27:03 - INFO - __main__ -     avg_mean_attention_mass = 0.026573443678207697
01/28/2021 14:27:03 - INFO - __main__ -     avg_mean_value_norm = 0.07423354543093592
01/28/2021 14:27:03 - INFO - __main__ -     avg_min_value_norm = 0.06596634841803461
01/28/2021 14:27:03 - INFO - __main__ -     avg_non_reg_attention_mass = 0.9734265779852868
01/28/2021 14:27:03 - INFO - __main__ -     avg_pad_attention_mass = 0.0
01/28/2021 14:27:03 - INFO - __main__ -     ce_eval_loss = 1.8069672226905822
01/28/2021 14:27:03 - INFO - __main__ -     eval_loss = 1.8069672226905822
01/28/2021 14:27:03 - INFO - __main__ -     global_step = 0
01/28/2021 14:27:03 - INFO - __main__ -     label_match_score = 0.0
01/28/2021 14:27:03 - I

01/28/2021 14:27:03 - INFO - __main__ -   segment_ids: 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

01/28/2021 14:27:03 - INFO - __main__ -   label: 5 (id = 4)
01/28/2021 14:27:03 - INFO - __main__ -   *** Example ***
01/28/2021 14:27:03 - INFO - __main__ -   guid: test-4
01/28/2021 14:27:03 - INFO - __main__ -   tokens: [CLS] i decided to check this place out after getting a coup ##on in my sa ##v ##vy shop ##per magazine for 25 % off . i am a fan of all of the little self serve yo ##gur ##t places popping up but i must be honest that this place is no different than all of the rest . be careful that you don ' t over ##load your cup or you will end up paying a lot by the time they weigh your dessert ! [SEP] [SEP]
01/28/2021 14:27:03 - INFO - __main__ -   input_ids: 101 1045 2787 2000 4638 2023 2173 2041 2044 2893 1037 8648 2239 1999 2026 7842 2615 10736 4497 4842 2932 2005 2423 1003 2125 1012 1045 2572 1037 5470 1997 2035 1997 1996 2210 2969 3710 10930 27390 2102 3182 20095 2039 2021 1045 2442 2022 7481 2008 2023 2173 2003 2053 2367 2084 2035 1997 1996 2717 1012 2022 6176 2008 2017 2

reg attention sum per layer
tensor([[[0.0276],
         [0.0162],
         [0.0243],
         [0.0159],
         [0.0134],
         [0.0208],
         [0.0188],
         [0.0010],
         [0.0030],
         [0.0054],
         [0.0217],
         [0.0191]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0148],
         [0.0197],
         [0.2280],
         [0.0103],
         [0.0097],
         [0.0077],
         [0.0067],
         [0.0031],
         [0.0019],
         [0.0018],
         [0.0451],
         [0.0578]]], device='cuda:0')
attetnion
count    1000.000000
mean        0.973427
std         0.014371
min         0.893804
25%         0.966594
50%         0.976273
75%         0.982778
max         1.000000
dtype: float64
dev	0	0.099	0.026573443678207697	0.0787351134289056	0.9951181895136834	1.8069672226905822	0.0	0.0	0.07423354543093592	0.08244639066141099	0.06596634841803461	[0.9693290591239929, 0.9699538946151733, 0.9792768359184265, 0.9724921584129333, 0.9697535037994385

01/28/2021 14:27:37 - INFO - __main__ -   ***** Running evaluation on test data*****
01/28/2021 14:27:37 - INFO - __main__ -     Num examples = 2000
01/28/2021 14:27:37 - INFO - __main__ -     Batch size = 1

Evaluating:   0%|                                                                                                                                                                                             | 0/2000 [00:00<?, ?it/s][A
Evaluating:   0%|▏                                                                                                                                                                                    | 2/2000 [00:00<01:58, 16.85it/s][A
Evaluating:   0%|▎                                                                                                                                                                                    | 4/2000 [00:00<01:53, 17.65it/s][A

reg attention sum per layer
tensor([[[0.0235],
         [0.0123],
         [0.0211],
         [0.0117],
         [0.0250],
         [0.0078],
         [0.0078],
         [0.0143],
         [0.0070],
         [0.0104],
         [0.0111],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0388],
         [0.0464],
         [0.0090],
         [0.0121],
         [0.0199],
         [0.0128],
         [0.0075],
         [0.0058],
         [0.0040],
         [0.0150],
         [0.0224]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0177],
         [0.0247],
         [0.0184],
         [0.0230],
         [0.0078],
         [0.0208],
         [0.0135],
         [0.0065],
         [0.0082],
         [0.0236],
         [0.0086]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0071],
         [0.0197],
         [0.0462],
         [0.0049],
         [0.0105],
         [0.0190],
         [0.0254],
         [0.00


Evaluating:   0%|▋                                                                                                                                                                                    | 7/2000 [00:00<01:46, 18.65it/s][A

tensor([[[0.0158],
         [0.0058],
         [0.0094],
         [0.0024],
         [0.0058],
         [0.0029],
         [0.0170],
         [0.0026],
         [0.0072],
         [0.0018],
         [0.0094],
         [0.0062]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0131],
         [0.0113],
         [0.0141],
         [0.0063],
         [0.0085],
         [0.0102],
         [0.0148],
         [0.0084],
         [0.0071],
         [0.0033],
         [0.0102],
         [0.0090]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0283],
         [0.0535],
         [0.0560],
         [0.0168],
         [0.0469],
         [0.0191],
         [0.0266],
         [0.0174],
         [0.0075],
         [0.0103],
         [0.0235],
         [0.0249]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0245],
         [0.0324],
         [0.0446],
         [0.0094],
         [0.0149],
         [0.0209],
         [0.0201],
         [0.0195],
         [0.0085],
    


Evaluating:   0%|▉                                                                                                                                                                                   | 10/2000 [00:00<01:42, 19.49it/s][A
Evaluating:   1%|█▏                                                                                                                                                                                  | 13/2000 [00:00<01:38, 20.13it/s][A

reg attention sum per layer
tensor([[[0.0292],
         [0.0226],
         [0.0254],
         [0.0038],
         [0.0068],
         [0.0147],
         [0.0144],
         [0.0050],
         [0.0136],
         [0.0102],
         [0.0162],
         [0.0088]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0955],
         [0.0340],
         [0.0441],
         [0.0176],
         [0.0541],
         [0.0150],
         [0.0455],
         [0.0125],
         [0.0311],
         [0.0409],
         [0.0094],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0344],
         [0.0605],
         [0.1110],
         [0.0168],
         [0.0261],
         [0.0421],
         [0.0409],
         [0.0268],
         [0.0115],
         [0.0067],
         [0.0701],
         [0.0373]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0675],
         [0.0221],
         [0.0886],
         [0.0143],
         [0.0151],
         [0.0305],
         [0.0591],
         [0.01


Evaluating:   1%|█▍                                                                                                                                                                                  | 16/2000 [00:00<01:37, 20.44it/s][A

tensor([[[0.0061],
         [0.0120],
         [0.0407],
         [0.0109],
         [0.0114],
         [0.0060],
         [0.0050],
         [0.0019],
         [0.0037],
         [0.0020],
         [0.0371],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0170],
         [0.0225],
         [0.0498],
         [0.0071],
         [0.0145],
         [0.0251],
         [0.0208],
         [0.0246],
         [0.0265],
         [0.0185],
         [0.0143],
         [0.0233]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1273],
         [0.0255],
         [0.0648],
         [0.0234],
         [0.0209],
         [0.0177],
         [0.0313],
         [0.0313],
         [0.0131],
         [0.0200],
         [0.0202],
         [0.0388]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0180],
         [0.0176],
         [0.0988],
         [0.0172],
         [0.0235],
         [0.0021],
         [0.0123],
         [0.0172],
         [0.0044],
    


Evaluating:   1%|█▋                                                                                                                                                                                  | 19/2000 [00:00<01:35, 20.70it/s][A

tensor([[[0.0426],
         [0.0169],
         [0.0301],
         [0.0129],
         [0.0166],
         [0.0310],
         [0.0588],
         [0.0218],
         [0.0356],
         [0.0311],
         [0.0053],
         [0.0301]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0117],
         [0.0066],
         [0.0536],
         [0.0061],
         [0.0057],
         [0.0048],
         [0.0043],
         [0.0019],
         [0.0008],
         [0.0016],
         [0.0311],
         [0.0259]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0261],
         [0.0224],
         [0.0098],
         [0.0158],
         [0.0120],
         [0.0054],
         [0.0039],
         [0.0016],
         [0.0056],
         [0.0171],
         [0.0237]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0040],
         [0.0054],
         [0.0318],
         [0.0056],
         [0.0102],
         [0.0044],
         [0.0034],
         [0.0113],
         [0.0014],
    


Evaluating:   1%|█▉                                                                                                                                                                                  | 22/2000 [00:01<01:34, 21.02it/s][A
Evaluating:   1%|██▎                                                                                                                                                                                 | 25/2000 [00:01<01:33, 21.20it/s][A

tensor([[[0.0148],
         [0.0575],
         [0.0130],
         [0.0157],
         [0.0306],
         [0.0294],
         [0.0172],
         [0.0018],
         [0.0018],
         [0.0091],
         [0.3653],
         [0.1016]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0042],
         [0.0126],
         [0.0624],
         [0.0068],
         [0.0244],
         [0.0198],
         [0.0117],
         [0.0031],
         [0.0059],
         [0.0015],
         [0.0316],
         [0.0187]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0744],
         [0.0663],
         [0.0816],
         [0.0177],
         [0.0273],
         [0.0335],
         [0.0250],
         [0.0057],
         [0.0041],
         [0.0172],
         [0.0737],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0125],
         [0.0184],
         [0.0561],
         [0.0060],
         [0.0024],
         [0.0016],
         [0.0072],
         [0.0019],
         [0.0014],
    


Evaluating:   1%|██▌                                                                                                                                                                                 | 28/2000 [00:01<01:32, 21.28it/s][A


tensor([[[0.0085],
         [0.0081],
         [0.0286],
         [0.0062],
         [0.0089],
         [0.0123],
         [0.0181],
         [0.0013],
         [0.0006],
         [0.0011],
         [0.0339],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0248],
         [0.0273],
         [0.0660],
         [0.0115],
         [0.0304],
         [0.0188],
         [0.0203],
         [0.0087],
         [0.0037],
         [0.0034],
         [0.0188],
         [0.0471]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0131],
         [0.0315],
         [0.1204],
         [0.0088],
         [0.0510],
         [0.0247],
         [0.0307],
         [0.0296],
         [0.0173],
         [0.0084],
         [0.0798],
         [0.0326]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0038],
         [0.0255],
         [0.0802],
         [0.0125],
         [0.0152],
         [0.0033],
         [0.0229],
         [0.0039],
         [0.0035],
   


Evaluating:   2%|██▊                                                                                                                                                                                 | 31/2000 [00:01<01:32, 21.34it/s][A
Evaluating:   2%|███                                                                                                                                                                                 | 34/2000 [00:01<01:31, 21.52it/s][A


reg attention sum per layer
tensor([[[0.0464],
         [0.0493],
         [0.0687],
         [0.0303],
         [0.0434],
         [0.0116],
         [0.0134],
         [0.0104],
         [0.0132],
         [0.0075],
         [0.0343],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0640],
         [0.0641],
         [0.0260],
         [0.0242],
         [0.0385],
         [0.0085],
         [0.0285],
         [0.0064],
         [0.0347],
         [0.0129],
         [0.0128],
         [0.0302]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0802],
         [0.0189],
         [0.0682],
         [0.0224],
         [0.0297],
         [0.0205],
         [0.0160],
         [0.0083],
         [0.0173],
         [0.0071],
         [0.0134],
         [0.0909]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0117],
         [0.0382],
         [0.0241],
         [0.0091],
         [0.0120],
         [0.0108],
         [0.0197],
         [0.0


Evaluating:   2%|███▎                                                                                                                                                                                | 37/2000 [00:01<01:31, 21.51it/s][A

reg attention sum per layer
tensor([[[0.0182],
         [0.0170],
         [0.0179],
         [0.0057],
         [0.0066],
         [0.0062],
         [0.0209],
         [0.0056],
         [0.0045],
         [0.0051],
         [0.0057],
         [0.0098]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.3219],
         [0.1812],
         [0.0145],
         [0.0397],
         [0.0302],
         [0.0111],
         [0.0943],
         [0.0263],
         [0.0034],
         [0.0594],
         [0.0131],
         [0.0105]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0091],
         [0.0173],
         [0.0097],
         [0.0162],
         [0.0172],
         [0.0163],
         [0.0020],
         [0.0042],
         [0.0058],
         [0.0129],
         [0.0102]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0392],
         [0.0479],
         [0.0394],
         [0.0141],
         [0.0575],
         [0.0045],
         [0.0566],
         [0.06


Evaluating:   2%|███▌                                                                                                                                                                                | 40/2000 [00:01<01:31, 21.46it/s][A

tensor([[[0.0101],
         [0.0103],
         [0.0189],
         [0.0042],
         [0.0167],
         [0.0177],
         [0.0040],
         [0.0010],
         [0.0015],
         [0.0018],
         [0.0159],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0296],
         [0.0645],
         [0.0058],
         [0.0162],
         [0.0063],
         [0.0190],
         [0.0058],
         [0.0135],
         [0.0116],
         [0.0102],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0812],
         [0.0658],
         [0.0470],
         [0.0381],
         [0.0152],
         [0.0052],
         [0.0198],
         [0.0186],
         [0.0042],
         [0.0158],
         [0.0286],
         [0.0115]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0204],
         [0.0287],
         [0.0045],
         [0.0071],
         [0.0054],
         [0.0202],
         [0.0153],
         [0.0376],
    


Evaluating:   2%|███▊                                                                                                                                                                                | 43/2000 [00:02<01:31, 21.47it/s][A
Evaluating:   2%|████▏                                                                                                                                                                               | 46/2000 [00:02<01:31, 21.43it/s][A

tensor([[[0.0171],
         [0.0088],
         [0.0122],
         [0.0032],
         [0.0190],
         [0.0043],
         [0.0126],
         [0.0065],
         [0.0091],
         [0.0109],
         [0.0025],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0274],
         [0.0366],
         [0.0409],
         [0.0107],
         [0.0096],
         [0.0069],
         [0.0446],
         [0.0028],
         [0.0060],
         [0.0060],
         [0.0450],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0473],
         [0.0145],
         [0.0447],
         [0.0149],
         [0.0150],
         [0.0025],
         [0.0095],
         [0.0256],
         [0.0108],
         [0.0107],
         [0.0078],
         [0.0810]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1328],
         [0.0164],
         [0.0684],
         [0.0123],
         [0.0196],
         [0.0139],
         [0.0420],
         [0.0029],
         [0.0031],
    


Evaluating:   2%|████▍                                                                                                                                                                               | 49/2000 [00:02<01:30, 21.44it/s][A

tensor([[[0.0495],
         [0.0150],
         [0.0120],
         [0.0080],
         [0.0044],
         [0.0013],
         [0.0102],
         [0.0052],
         [0.0021],
         [0.0252],
         [0.0370],
         [0.0072]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0376],
         [0.0629],
         [0.0406],
         [0.0381],
         [0.0341],
         [0.0128],
         [0.0265],
         [0.0057],
         [0.0066],
         [0.0088],
         [0.0567],
         [0.0563]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0749],
         [0.0157],
         [0.0177],
         [0.0125],
         [0.0342],
         [0.0093],
         [0.0172],
         [0.0106],
         [0.0195],
         [0.0144],
         [0.0048],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0054],
         [0.0105],
         [0.0091],
         [0.0048],
         [0.0031],
         [0.0010],
         [0.0118],
         [0.0129],
         [0.0032],
    


Evaluating:   3%|████▋                                                                                                                                                                               | 52/2000 [00:02<01:30, 21.41it/s][A

tensor([[[0.0221],
         [0.0168],
         [0.0046],
         [0.0060],
         [0.0007],
         [0.0032],
         [0.0055],
         [0.0031],
         [0.0004],
         [0.0024],
         [0.0018],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0391],
         [0.1430],
         [0.0689],
         [0.0156],
         [0.0069],
         [0.0087],
         [0.0218],
         [0.0144],
         [0.0058],
         [0.0060],
         [0.0168],
         [0.0677]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0049],
         [0.0213],
         [0.0449],
         [0.0084],
         [0.0271],
         [0.0133],
         [0.0056],
         [0.0116],
         [0.0009],
         [0.0013],
         [0.0201],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0666],
         [0.0332],
         [0.0342],
         [0.0160],
         [0.0338],
         [0.0308],
         [0.0201],
         [0.0125],
         [0.0152],
    


Evaluating:   3%|████▉                                                                                                                                                                               | 55/2000 [00:02<01:30, 21.53it/s][A
Evaluating:   3%|█████▏                                                                                                                                                                              | 58/2000 [00:02<01:30, 21.47it/s][A

tensor([[[0.0200],
         [0.0407],
         [0.0585],
         [0.0156],
         [0.0299],
         [0.0191],
         [0.0438],
         [0.0288],
         [0.0059],
         [0.0117],
         [0.0248],
         [0.0294]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0078],
         [0.0064],
         [0.0291],
         [0.0100],
         [0.0091],
         [0.0016],
         [0.0086],
         [0.0051],
         [0.0030],
         [0.0035],
         [0.0131],
         [0.0169]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0410],
         [0.0422],
         [0.0832],
         [0.0077],
         [0.0142],
         [0.0119],
         [0.0119],
         [0.0103],
         [0.0011],
         [0.0030],
         [0.0264],
         [0.0363]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0109],
         [0.0160],
         [0.0529],
         [0.0081],
         [0.0296],
         [0.0375],
         [0.0226],
         [0.0592],
         [0.0150],
    


Evaluating:   3%|█████▍                                                                                                                                                                              | 61/2000 [00:02<01:30, 21.47it/s][A

tensor([[[0.0302],
         [0.0773],
         [0.0311],
         [0.0116],
         [0.0103],
         [0.0083],
         [0.0353],
         [0.0056],
         [0.0043],
         [0.0062],
         [0.0461],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0656],
         [0.0373],
         [0.0303],
         [0.0178],
         [0.0341],
         [0.0207],
         [0.0252],
         [0.0172],
         [0.0042],
         [0.0148],
         [0.0120],
         [0.0278]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1630],
         [0.0904],
         [0.0647],
         [0.0295],
         [0.0424],
         [0.0143],
         [0.0575],
         [0.0444],
         [0.0276],
         [0.0256],
         [0.0227],
         [0.0469]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0130],
         [0.0299],
         [0.0721],
         [0.0100],
         [0.0745],
         [0.0162],
         [0.0074],
         [0.0191],
         [0.0066],
    


Evaluating:   3%|█████▊                                                                                                                                                                              | 64/2000 [00:03<01:30, 21.34it/s][A

tensor([[[0.0110],
         [0.1077],
         [0.0398],
         [0.0087],
         [0.0112],
         [0.0062],
         [0.0112],
         [0.0051],
         [0.0094],
         [0.0065],
         [0.1435],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0084],
         [0.0070],
         [0.0214],
         [0.0067],
         [0.0023],
         [0.0062],
         [0.0034],
         [0.0027],
         [0.0014],
         [0.0006],
         [0.0265],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0393],
         [0.0568],
         [0.1873],
         [0.0127],
         [0.0264],
         [0.0115],
         [0.0159],
         [0.0075],
         [0.0079],
         [0.0051],
         [0.0161],
         [0.0774]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0284],
         [0.0181],
         [0.0617],
         [0.0517],
         [0.0155],
         [0.0067],
         [0.0066],
         [0.0615],
         [0.0046],
    


Evaluating:   3%|██████                                                                                                                                                                              | 67/2000 [00:03<01:30, 21.43it/s][A
Evaluating:   4%|██████▎                                                                                                                                                                             | 70/2000 [00:03<01:29, 21.54it/s][A

tensor([[[0.0281],
         [0.0159],
         [0.0136],
         [0.0069],
         [0.0134],
         [0.0090],
         [0.0166],
         [0.0040],
         [0.0026],
         [0.0035],
         [0.0078],
         [0.0127]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0708],
         [0.0373],
         [0.0489],
         [0.0163],
         [0.0210],
         [0.0213],
         [0.0327],
         [0.0335],
         [0.0249],
         [0.0248],
         [0.0172],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0613],
         [0.1916],
         [0.0276],
         [0.0707],
         [0.0156],
         [0.0120],
         [0.0242],
         [0.0214],
         [0.0104],
         [0.1427],
         [0.1477]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0522],
         [0.0207],
         [0.0064],
         [0.0091],
         [0.0068],
         [0.0079],
         [0.0164],
         [0.0109],
    


Evaluating:   4%|██████▌                                                                                                                                                                             | 73/2000 [00:03<01:30, 21.38it/s][A

tensor([[[0.0191],
         [0.0255],
         [0.0231],
         [0.0165],
         [0.0194],
         [0.0067],
         [0.0125],
         [0.0101],
         [0.0032],
         [0.0057],
         [0.0069],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0176],
         [0.0047],
         [0.0311],
         [0.0118],
         [0.0080],
         [0.0056],
         [0.0141],
         [0.0103],
         [0.0018],
         [0.0063],
         [0.0213],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0550],
         [0.0312],
         [0.0608],
         [0.0178],
         [0.0170],
         [0.0034],
         [0.0499],
         [0.0180],
         [0.0082],
         [0.0095],
         [0.0195],
         [0.0439]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0269],
         [0.0197],
         [0.0175],
         [0.0723],
         [0.0409],
         [0.0130],
         [0.0298],
         [0.0052],
    


Evaluating:   4%|██████▊                                                                                                                                                                             | 76/2000 [00:03<01:30, 21.37it/s][A

tensor([[[0.0369],
         [0.0276],
         [0.0288],
         [0.0184],
         [0.0188],
         [0.0550],
         [0.0130],
         [0.0067],
         [0.0022],
         [0.0050],
         [0.0240],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0315],
         [0.0239],
         [0.1201],
         [0.0224],
         [0.0248],
         [0.0239],
         [0.0247],
         [0.0226],
         [0.0397],
         [0.0083],
         [0.0162],
         [0.1088]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0399],
         [0.0423],
         [0.0780],
         [0.0200],
         [0.0195],
         [0.0140],
         [0.0113],
         [0.0427],
         [0.0109],
         [0.0081],
         [0.0578],
         [0.0407]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0443],
         [0.0265],
         [0.0147],
         [0.0232],
         [0.0047],
         [0.0478],
         [0.0085],
         [0.0413],
    


Evaluating:   4%|███████                                                                                                                                                                             | 79/2000 [00:03<01:29, 21.45it/s][A
Evaluating:   4%|███████▍                                                                                                                                                                            | 82/2000 [00:03<01:29, 21.46it/s][A


reg attention sum per layer
tensor([[[0.0129],
         [0.0257],
         [0.0405],
         [0.0097],
         [0.0237],
         [0.0119],
         [0.0094],
         [0.0090],
         [0.0138],
         [0.0095],
         [0.0318],
         [0.0206]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0401],
         [0.0386],
         [0.0111],
         [0.0200],
         [0.0776],
         [0.0411],
         [0.0227],
         [0.0053],
         [0.0231],
         [0.0722],
         [0.0423]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0121],
         [0.0253],
         [0.0652],
         [0.0569],
         [0.0043],
         [0.0058],
         [0.0105],
         [0.0049],
         [0.0029],
         [0.0055],
         [0.0243],
         [0.0243]]], device='cuda:0')
reg attention sum per layer



Evaluating:   4%|███████▋                                                                                                                                                                            | 85/2000 [00:03<01:29, 21.51it/s][A

tensor([[[0.0158],
         [0.0286],
         [0.0245],
         [0.0356],
         [0.0154],
         [0.0790],
         [0.0247],
         [0.0033],
         [0.0023],
         [0.0072],
         [0.3054],
         [0.0500]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0654],
         [0.0183],
         [0.0252],
         [0.0226],
         [0.0140],
         [0.0049],
         [0.0205],
         [0.0598],
         [0.0034],
         [0.0194],
         [0.0122],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0558],
         [0.1576],
         [0.0603],
         [0.0383],
         [0.0356],
         [0.0189],
         [0.0732],
         [0.0197],
         [0.0278],
         [0.0458],
         [0.1565]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0161],
         [0.0508],
         [0.0312],
         [0.0150],
         [0.0151],
         [0.0091],
         [0.0342],
         [0.0081],
         [0.0071],
    


Evaluating:   4%|███████▉                                                                                                                                                                            | 88/2000 [00:04<01:28, 21.55it/s][A
Evaluating:   5%|████████▏                                                                                                                                                                           | 91/2000 [00:04<01:28, 21.62it/s][A

reg attention sum per layer
tensor([[[0.0407],
         [0.0286],
         [0.0714],
         [0.0180],
         [0.0296],
         [0.0119],
         [0.0117],
         [0.0150],
         [0.0032],
         [0.0074],
         [0.0414],
         [0.0453]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0343],
         [0.0438],
         [0.0198],
         [0.0166],
         [0.0047],
         [0.0107],
         [0.0115],
         [0.0067],
         [0.0010],
         [0.0042],
         [0.0234],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0788],
         [0.0177],
         [0.0623],
         [0.0152],
         [0.0522],
         [0.0340],
         [0.0154],
         [0.0082],
         [0.0529],
         [0.0171],
         [0.0156],
         [0.0259]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0288],
         [0.0272],
         [0.0948],
         [0.0189],
         [0.0562],
         [0.0132],
         [0.0387],
         [0.03


Evaluating:   5%|████████▍                                                                                                                                                                           | 94/2000 [00:04<01:28, 21.58it/s][A

tensor([[[0.0473],
         [0.0909],
         [0.1425],
         [0.0698],
         [0.0943],
         [0.0262],
         [0.0067],
         [0.0012],
         [0.0033],
         [0.0031],
         [0.3450],
         [0.0729]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0104],
         [0.0068],
         [0.0537],
         [0.0048],
         [0.0191],
         [0.0114],
         [0.0113],
         [0.0052],
         [0.0069],
         [0.0074],
         [0.0422],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0785],
         [0.0870],
         [0.0355],
         [0.0242],
         [0.0190],
         [0.0114],
         [0.0156],
         [0.0152],
         [0.0016],
         [0.0077],
         [0.0208],
         [0.0621]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0617],
         [0.0741],
         [0.0824],
         [0.0209],
         [0.0350],
         [0.0090],
         [0.0101],
         [0.0072],
         [0.0063],
    


Evaluating:   5%|████████▋                                                                                                                                                                           | 97/2000 [00:04<01:28, 21.60it/s][A

tensor([[[0.0175],
         [0.0384],
         [0.0288],
         [0.0066],
         [0.0124],
         [0.0162],
         [0.0242],
         [0.0159],
         [0.0061],
         [0.0051],
         [0.0190],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0191],
         [0.0286],
         [0.0332],
         [0.0128],
         [0.0083],
         [0.0064],
         [0.0092],
         [0.0093],
         [0.0045],
         [0.0031],
         [0.0168],
         [0.0489]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0701],
         [0.0658],
         [0.0284],
         [0.0153],
         [0.0260],
         [0.0183],
         [0.0131],
         [0.0087],
         [0.0018],
         [0.0079],
         [0.0379],
         [0.0807]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0345],
         [0.0472],
         [0.0441],
         [0.0091],
         [0.0078],
         [0.0036],
         [0.0134],
         [0.0122],
         [0.0188],
    


Evaluating:   5%|████████▉                                                                                                                                                                          | 100/2000 [00:04<01:28, 21.47it/s][A
Evaluating:   5%|█████████▏                                                                                                                                                                         | 103/2000 [00:04<01:28, 21.52it/s][A

tensor([[[0.0032],
         [0.0103],
         [0.0312],
         [0.0062],
         [0.0071],
         [0.0046],
         [0.0075],
         [0.0082],
         [0.0045],
         [0.0024],
         [0.0104],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1277],
         [0.0799],
         [0.0521],
         [0.0464],
         [0.0398],
         [0.0211],
         [0.0281],
         [0.0811],
         [0.0059],
         [0.0117],
         [0.0275],
         [0.0772]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0083],
         [0.0061],
         [0.0816],
         [0.0047],
         [0.0388],
         [0.0092],
         [0.0365],
         [0.0013],
         [0.0041],
         [0.0064],
         [0.0254],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0259],
         [0.0199],
         [0.0099],
         [0.0103],
         [0.0054],
         [0.0281],
         [0.0317],
         [0.0030],
    


Evaluating:   5%|█████████▍                                                                                                                                                                         | 106/2000 [00:04<01:27, 21.56it/s][A

tensor([[[0.0131],
         [0.0271],
         [0.0333],
         [0.0058],
         [0.0253],
         [0.0376],
         [0.0222],
         [0.0153],
         [0.0084],
         [0.0219],
         [0.0277],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0048],
         [0.0564],
         [0.0689],
         [0.0196],
         [0.0412],
         [0.0284],
         [0.0131],
         [0.0019],
         [0.0022],
         [0.0074],
         [0.2367],
         [0.0951]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0339],
         [0.0312],
         [0.0303],
         [0.0093],
         [0.0134],
         [0.0050],
         [0.0499],
         [0.0331],
         [0.0050],
         [0.0151],
         [0.0080],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0798],
         [0.0532],
         [0.0380],
         [0.0238],
         [0.0368],
         [0.0216],
         [0.0191],
         [0.0137],
         [0.0077],
    


Evaluating:   5%|█████████▊                                                                                                                                                                         | 109/2000 [00:05<01:28, 21.44it/s][A

tensor([[[0.0799],
         [0.0710],
         [0.0746],
         [0.0258],
         [0.0496],
         [0.0215],
         [0.0330],
         [0.0204],
         [0.0047],
         [0.0092],
         [0.0117],
         [0.0591]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1102],
         [0.0328],
         [0.0165],
         [0.0289],
         [0.0187],
         [0.0146],
         [0.0267],
         [0.0156],
         [0.0043],
         [0.0137],
         [0.0315],
         [0.0187]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0321],
         [0.0188],
         [0.0237],
         [0.0210],
         [0.0780],
         [0.0534],
         [0.0195],
         [0.0116],
         [0.0220],
         [0.0136],
         [0.0231],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0398],
         [0.0215],
         [0.0054],
         [0.0074],
         [0.0592],
         [0.0102],
         [0.0041],
         [0.0005],
    


Evaluating:   6%|██████████                                                                                                                                                                         | 112/2000 [00:05<01:27, 21.50it/s][A
Evaluating:   6%|██████████▎                                                                                                                                                                        | 115/2000 [00:05<01:27, 21.64it/s][A


reg attention sum per layer
tensor([[[0.0184],
         [0.0097],
         [0.0339],
         [0.0087],
         [0.0279],
         [0.0077],
         [0.0130],
         [0.0077],
         [0.0126],
         [0.0061],
         [0.0099],
         [0.0349]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0106],
         [0.0282],
         [0.0734],
         [0.0113],
         [0.0044],
         [0.0052],
         [0.0024],
         [0.0028],
         [0.0008],
         [0.0013],
         [0.0173],
         [0.0149]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0295],
         [0.2203],
         [0.0182],
         [0.0489],
         [0.0362],
         [0.0353],
         [0.0117],
         [0.0269],
         [0.0149],
         [0.0194],
         [0.0376]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0776],
         [0.0292],
         [0.0283],
         [0.0308],
         [0.0257],
         [0.0117],
         [0.0113],
         [0.0


Evaluating:   6%|██████████▌                                                                                                                                                                        | 118/2000 [00:05<01:27, 21.54it/s][A


reg attention sum per layer
tensor([[[0.0107],
         [0.0111],
         [0.0255],
         [0.0076],
         [0.0060],
         [0.0042],
         [0.0123],
         [0.0062],
         [0.0015],
         [0.0016],
         [0.0413],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1339],
         [0.0389],
         [0.0429],
         [0.0220],
         [0.0209],
         [0.0078],
         [0.0414],
         [0.0106],
         [0.0205],
         [0.0186],
         [0.0073],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0567],
         [0.0088],
         [0.0081],
         [0.0037],
         [0.0089],
         [0.0087],
         [0.0214],
         [0.0020],
         [0.0030],
         [0.0052],
         [0.0114]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0167],
         [0.0545],
         [0.0100],
         [0.0144],
         [0.0096],
         [0.0294],
         [0.0


Evaluating:   6%|██████████▊                                                                                                                                                                        | 121/2000 [00:05<01:27, 21.53it/s][A
Evaluating:   6%|███████████                                                                                                                                                                        | 124/2000 [00:05<01:27, 21.47it/s][A

reg attention sum per layer
tensor([[[0.0221],
         [0.0055],
         [0.0113],
         [0.0258],
         [0.0186],
         [0.0029],
         [0.0065],
         [0.0062],
         [0.0019],
         [0.0069],
         [0.0077],
         [0.0050]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0739],
         [0.0562],
         [0.0041],
         [0.0149],
         [0.0155],
         [0.0093],
         [0.0058],
         [0.0036],
         [0.0104],
         [0.1127],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0697],
         [0.0421],
         [0.2403],
         [0.0183],
         [0.0831],
         [0.0494],
         [0.0390],
         [0.0499],
         [0.0394],
         [0.0177],
         [0.0273],
         [0.0951]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0053],
         [0.0074],
         [0.0134],
         [0.0040],
         [0.0059],
         [0.0137],
         [0.0106],
         [0.01


Evaluating:   6%|███████████▎                                                                                                                                                                       | 127/2000 [00:05<01:27, 21.38it/s][A

tensor([[[0.0132],
         [0.0181],
         [0.0180],
         [0.0061],
         [0.0061],
         [0.0437],
         [0.0166],
         [0.0018],
         [0.0038],
         [0.0043],
         [0.0330],
         [0.0079]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0227],
         [0.0230],
         [0.0076],
         [0.0048],
         [0.0039],
         [0.0074],
         [0.0101],
         [0.0010],
         [0.0038],
         [0.0163],
         [0.0088]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0679],
         [0.0520],
         [0.0414],
         [0.0218],
         [0.0408],
         [0.0156],
         [0.0182],
         [0.0206],
         [0.0099],
         [0.0095],
         [0.0692],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0355],
         [0.0777],
         [0.0308],
         [0.0250],
         [0.0154],
         [0.0097],
         [0.0176],
         [0.0024],
    


Evaluating:   6%|███████████▋                                                                                                                                                                       | 130/2000 [00:06<01:27, 21.46it/s][A
Evaluating:   7%|███████████▉                                                                                                                                                                       | 133/2000 [00:06<01:26, 21.47it/s][A

reg attention sum per layer
tensor([[[0.0605],
         [0.0802],
         [0.0272],
         [0.0206],
         [0.0301],
         [0.0282],
         [0.0141],
         [0.0141],
         [0.0053],
         [0.0118],
         [0.0224],
         [0.0289]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0574],
         [0.0392],
         [0.0142],
         [0.0073],
         [0.0096],
         [0.0064],
         [0.0215],
         [0.0204],
         [0.0048],
         [0.0082],
         [0.0060],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0762],
         [0.0306],
         [0.0287],
         [0.0235],
         [0.0385],
         [0.0279],
         [0.0434],
         [0.0099],
         [0.0053],
         [0.0109],
         [0.0566],
         [0.0271]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0541],
         [0.0310],
         [0.0509],
         [0.0188],
         [0.0514],
         [0.0191],
         [0.0915],
         [0.01


Evaluating:   7%|████████████▏                                                                                                                                                                      | 136/2000 [00:06<01:26, 21.47it/s][A

tensor([[[0.0506],
         [0.0182],
         [0.0434],
         [0.0077],
         [0.0069],
         [0.0089],
         [0.0341],
         [0.0065],
         [0.0074],
         [0.0066],
         [0.0159],
         [0.0373]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0578],
         [0.0684],
         [0.0314],
         [0.0228],
         [0.0212],
         [0.0056],
         [0.0421],
         [0.0451],
         [0.0122],
         [0.0123],
         [0.0200],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0659],
         [0.0424],
         [0.0197],
         [0.0152],
         [0.0192],
         [0.0034],
         [0.0413],
         [0.0337],
         [0.0095],
         [0.0171],
         [0.0060],
         [0.0312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0311],
         [0.0203],
         [0.0296],
         [0.0134],
         [0.0738],
         [0.0146],
         [0.0127],
         [0.0207],
         [0.0057],
    


Evaluating:   7%|████████████▍                                                                                                                                                                      | 139/2000 [00:06<01:26, 21.48it/s][A

tensor([[[0.0525],
         [0.0259],
         [0.0414],
         [0.0114],
         [0.0113],
         [0.0058],
         [0.0121],
         [0.0096],
         [0.0213],
         [0.0071],
         [0.0084],
         [0.0452]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0487],
         [0.0306],
         [0.0430],
         [0.0150],
         [0.0267],
         [0.0065],
         [0.0179],
         [0.0187],
         [0.0215],
         [0.0120],
         [0.0067],
         [0.0522]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0324],
         [0.0373],
         [0.0137],
         [0.0150],
         [0.0303],
         [0.0234],
         [0.0034],
         [0.0105],
         [0.0052],
         [0.1067],
         [0.0411]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0199],
         [0.0556],
         [0.0517],
         [0.0142],
         [0.0544],
         [0.0276],
         [0.0398],
         [0.0505],
         [0.0366],
    


Evaluating:   7%|████████████▋                                                                                                                                                                      | 142/2000 [00:06<01:26, 21.48it/s][A
Evaluating:   7%|████████████▉                                                                                                                                                                      | 145/2000 [00:06<01:26, 21.48it/s][A

tensor([[[0.0116],
         [0.0167],
         [0.0178],
         [0.0101],
         [0.0121],
         [0.0054],
         [0.0118],
         [0.0139],
         [0.0025],
         [0.0050],
         [0.0116],
         [0.0084]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0216],
         [0.0265],
         [0.0702],
         [0.0201],
         [0.0197],
         [0.0081],
         [0.0120],
         [0.0067],
         [0.0041],
         [0.0098],
         [0.0326],
         [0.0056]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0488],
         [0.0255],
         [0.0122],
         [0.0104],
         [0.0702],
         [0.0174],
         [0.0192],
         [0.0308],
         [0.0084],
         [0.0196],
         [0.0051],
         [0.0189]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0604],
         [0.0199],
         [0.0100],
         [0.0065],
         [0.0132],
         [0.0175],
         [0.0148],
         [0.0154],
    


Evaluating:   7%|█████████████▏                                                                                                                                                                     | 148/2000 [00:06<01:26, 21.34it/s][A

tensor([[[0.0135],
         [0.0179],
         [0.0404],
         [0.0115],
         [0.0145],
         [0.0163],
         [0.0217],
         [0.0076],
         [0.0037],
         [0.0036],
         [0.0254],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0346],
         [0.0129],
         [0.0318],
         [0.0065],
         [0.0224],
         [0.0080],
         [0.0290],
         [0.0024],
         [0.0054],
         [0.0078],
         [0.0086],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0151],
         [0.0150],
         [0.0682],
         [0.0142],
         [0.0183],
         [0.0050],
         [0.0091],
         [0.0104],
         [0.0063],
         [0.0034],
         [0.0352],
         [0.0924]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0301],
         [0.0112],
         [0.0589],
         [0.0187],
         [0.0083],
         [0.0043],
         [0.0160],
         [0.0147],
         [0.0012],
    


Evaluating:   8%|█████████████▌                                                                                                                                                                     | 151/2000 [00:07<01:26, 21.30it/s][A

tensor([[[0.0226],
         [0.0244],
         [0.0084],
         [0.0144],
         [0.0372],
         [0.0094],
         [0.0159],
         [0.0073],
         [0.0026],
         [0.0127],
         [0.0172],
         [0.0057]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0765],
         [0.0403],
         [0.0722],
         [0.0369],
         [0.1100],
         [0.0157],
         [0.0717],
         [0.0278],
         [0.0517],
         [0.0296],
         [0.0162],
         [0.0574]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0140],
         [0.0177],
         [0.0501],
         [0.0172],
         [0.0282],
         [0.0060],
         [0.0234],
         [0.0024],
         [0.0018],
         [0.0049],
         [0.0382],
         [0.0585]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0745],
         [0.0661],
         [0.0219],
         [0.0218],
         [0.0081],
         [0.0100],
         [0.0230],
         [0.0087],
         [0.0031],
    


Evaluating:   8%|█████████████▊                                                                                                                                                                     | 154/2000 [00:07<01:26, 21.40it/s][A
Evaluating:   8%|██████████████                                                                                                                                                                     | 157/2000 [00:07<01:25, 21.47it/s][A

tensor([[[0.0567],
         [0.0494],
         [0.0531],
         [0.0164],
         [0.0231],
         [0.0201],
         [0.0554],
         [0.0103],
         [0.0107],
         [0.0117],
         [0.0198],
         [0.0562]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0509],
         [0.0263],
         [0.0102],
         [0.0069],
         [0.0112],
         [0.0043],
         [0.0299],
         [0.0108],
         [0.0180],
         [0.0136],
         [0.0051],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0611],
         [0.1041],
         [0.0353],
         [0.0131],
         [0.0612],
         [0.0408],
         [0.0636],
         [0.0332],
         [0.0170],
         [0.0281],
         [0.0246],
         [0.0336]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0358],
         [0.0539],
         [0.0114],
         [0.0166],
         [0.0123],
         [0.0259],
         [0.0028],
         [0.0040],
    


Evaluating:   8%|██████████████▎                                                                                                                                                                    | 160/2000 [00:07<01:25, 21.48it/s][A

tensor([[[0.0430],
         [0.0786],
         [0.0288],
         [0.0070],
         [0.0132],
         [0.0098],
         [0.0479],
         [0.0055],
         [0.0100],
         [0.0158],
         [0.0531],
         [0.0350]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0957],
         [0.0582],
         [0.0388],
         [0.0568],
         [0.0196],
         [0.0037],
         [0.0655],
         [0.0121],
         [0.0082],
         [0.0211],
         [0.0296],
         [0.0090]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0538],
         [0.0522],
         [0.0325],
         [0.0080],
         [0.0068],
         [0.0079],
         [0.0190],
         [0.0067],
         [0.0019],
         [0.0068],
         [0.0059],
         [0.0163]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0825],
         [0.0158],
         [0.0325],
         [0.0056],
         [0.0204],
         [0.0207],
         [0.0220],
         [0.0089],
         [0.0142],
    


Evaluating:   8%|██████████████▌                                                                                                                                                                    | 163/2000 [00:07<01:25, 21.43it/s][A

tensor([[[0.0734],
         [0.0607],
         [0.0096],
         [0.0184],
         [0.0250],
         [0.0085],
         [0.0223],
         [0.0105],
         [0.0028],
         [0.0271],
         [0.0115],
         [0.0132]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0374],
         [0.0437],
         [0.0205],
         [0.0134],
         [0.0291],
         [0.0214],
         [0.0209],
         [0.0117],
         [0.0071],
         [0.0086],
         [0.0468],
         [0.0107]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0504],
         [0.0214],
         [0.0401],
         [0.0090],
         [0.0072],
         [0.0151],
         [0.0641],
         [0.0048],
         [0.0084],
         [0.0152],
         [0.0084],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0112],
         [0.0217],
         [0.0602],
         [0.0144],
         [0.0066],
         [0.0063],
         [0.0075],
         [0.0060],
         [0.0057],
    


Evaluating:   8%|██████████████▊                                                                                                                                                                    | 166/2000 [00:07<01:25, 21.45it/s][A
Evaluating:   8%|███████████████▏                                                                                                                                                                   | 169/2000 [00:07<01:25, 21.51it/s][A

tensor([[[0.0154],
         [0.0445],
         [0.0447],
         [0.0322],
         [0.0216],
         [0.0136],
         [0.0979],
         [0.0049],
         [0.0054],
         [0.0180],
         [0.0854],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0259],
         [0.0226],
         [0.0466],
         [0.0226],
         [0.0112],
         [0.0095],
         [0.0109],
         [0.0155],
         [0.0019],
         [0.0035],
         [0.0528],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0526],
         [0.0870],
         [0.0920],
         [0.0143],
         [0.1400],
         [0.0178],
         [0.0540],
         [0.0111],
         [0.0115],
         [0.0192],
         [0.0367],
         [0.0515]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0996],
         [0.0389],
         [0.0250],
         [0.0169],
         [0.0246],
         [0.0224],
         [0.1116],
         [0.0176],
         [0.0084],
    


Evaluating:   9%|███████████████▍                                                                                                                                                                   | 172/2000 [00:08<01:25, 21.45it/s][A

tensor([[[0.0335],
         [0.0468],
         [0.0611],
         [0.0175],
         [0.0242],
         [0.0160],
         [0.0246],
         [0.0228],
         [0.0122],
         [0.0211],
         [0.0173],
         [0.0257]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0131],
         [0.0914],
         [0.1501],
         [0.0318],
         [0.0308],
         [0.0106],
         [0.0086],
         [0.0200],
         [0.0094],
         [0.0044],
         [0.1955],
         [0.1852]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0810],
         [0.0268],
         [0.0112],
         [0.0101],
         [0.0161],
         [0.0221],
         [0.0127],
         [0.0078],
         [0.0050],
         [0.0146],
         [0.0067],
         [0.0184]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0376],
         [0.0242],
         [0.0292],
         [0.0239],
         [0.0133],
         [0.0177],
         [0.0104],
         [0.0097],
         [0.0030],
    


Evaluating:   9%|███████████████▋                                                                                                                                                                   | 175/2000 [00:08<01:25, 21.39it/s][A

tensor([[[0.0174],
         [0.0807],
         [0.0270],
         [0.0150],
         [0.0121],
         [0.0087],
         [0.0081],
         [0.0077],
         [0.0011],
         [0.0042],
         [0.0137],
         [0.0266]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0435],
         [0.0144],
         [0.0473],
         [0.0266],
         [0.0478],
         [0.0104],
         [0.0135],
         [0.1294],
         [0.0049],
         [0.0133],
         [0.0254],
         [0.0389]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0416],
         [0.1229],
         [0.1160],
         [0.0296],
         [0.0242],
         [0.0118],
         [0.0403],
         [0.0150],
         [0.0149],
         [0.0099],
         [0.0111],
         [0.0694]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0448],
         [0.0715],
         [0.0249],
         [0.0236],
         [0.0320],
         [0.0160],
         [0.0342],
         [0.0042],
         [0.0251],
    


Evaluating:   9%|███████████████▉                                                                                                                                                                   | 178/2000 [00:08<01:25, 21.42it/s][A
Evaluating:   9%|████████████████▏                                                                                                                                                                  | 181/2000 [00:08<01:24, 21.58it/s][A


reg attention sum per layer
tensor([[[0.0572],
         [0.0264],
         [0.0323],
         [0.0222],
         [0.0182],
         [0.0188],
         [0.0122],
         [0.0046],
         [0.0019],
         [0.0050],
         [0.0585],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0180],
         [0.1003],
         [0.0189],
         [0.0090],
         [0.0091],
         [0.0163],
         [0.0201],
         [0.0077],
         [0.0024],
         [0.0034],
         [0.0328],
         [0.0317]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0438],
         [0.0114],
         [0.0636],
         [0.0197],
         [0.0473],
         [0.0071],
         [0.0116],
         [0.0212],
         [0.0150],
         [0.0083],
         [0.0176],
         [0.0493]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0158],
         [0.0277],
         [0.0081],
         [0.0084],
         [0.0101],
         [0.0061],
         [0.0


Evaluating:   9%|████████████████▍                                                                                                                                                                  | 184/2000 [00:08<01:24, 21.55it/s][A

reg attention sum per layer
tensor([[[0.0049],
         [0.0050],
         [0.0023],
         [0.0018],
         [0.0031],
         [0.0036],
         [0.0070],
         [0.0024],
         [0.0014],
         [0.0010],
         [0.0007],
         [0.0039]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0358],
         [0.0303],
         [0.0382],
         [0.0184],
         [0.0113],
         [0.0221],
         [0.0087],
         [0.0137],
         [0.0024],
         [0.0103],
         [0.0064],
         [0.0129]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0168],
         [0.0157],
         [0.0351],
         [0.0077],
         [0.0153],
         [0.0692],
         [0.0312],
         [0.0037],
         [0.0011],
         [0.0035],
         [0.0294],
         [0.0226]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0082],
         [0.0263],
         [0.0085],
         [0.0151],
         [0.0068],
         [0.0135],
         [0.01


Evaluating:   9%|████████████████▋                                                                                                                                                                  | 187/2000 [00:08<01:24, 21.49it/s][A
Evaluating:  10%|█████████████████                                                                                                                                                                  | 190/2000 [00:08<01:24, 21.53it/s][A

tensor([[[0.0269],
         [0.0236],
         [0.0310],
         [0.0071],
         [0.0120],
         [0.0212],
         [0.0171],
         [0.0052],
         [0.0050],
         [0.0065],
         [0.0218],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0474],
         [0.0754],
         [0.1441],
         [0.0225],
         [0.0535],
         [0.0167],
         [0.0136],
         [0.0663],
         [0.0057],
         [0.0121],
         [0.0689],
         [0.0417]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0224],
         [0.0770],
         [0.0585],
         [0.0125],
         [0.0151],
         [0.0230],
         [0.0175],
         [0.0094],
         [0.0151],
         [0.0309],
         [0.0948],
         [0.0410]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0088],
         [0.0338],
         [0.0054],
         [0.0080],
         [0.0051],
         [0.0069],
         [0.0046],
         [0.0031],
    


Evaluating:  10%|█████████████████▎                                                                                                                                                                 | 193/2000 [00:09<01:23, 21.52it/s][A

tensor([[[0.0176],
         [0.0373],
         [0.0198],
         [0.0112],
         [0.0066],
         [0.0048],
         [0.0242],
         [0.0013],
         [0.0005],
         [0.0034],
         [0.0402],
         [0.0298]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0567],
         [0.1527],
         [0.0666],
         [0.0265],
         [0.0308],
         [0.0180],
         [0.0244],
         [0.0149],
         [0.0091],
         [0.0066],
         [0.0356],
         [0.0508]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0783],
         [0.0580],
         [0.1466],
         [0.0332],
         [0.0681],
         [0.0443],
         [0.0315],
         [0.2108],
         [0.0136],
         [0.0235],
         [0.0511],
         [0.0742]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0218],
         [0.0197],
         [0.0366],
         [0.0086],
         [0.0097],
         [0.0050],
         [0.0108],
         [0.0070],
         [0.0034],
    


Evaluating:  10%|█████████████████▌                                                                                                                                                                 | 196/2000 [00:09<01:24, 21.42it/s][A

tensor([[[0.0137],
         [0.0104],
         [0.0051],
         [0.0153],
         [0.0077],
         [0.0129],
         [0.0237],
         [0.0048],
         [0.0005],
         [0.0023],
         [0.0190],
         [0.0323]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0429],
         [0.0103],
         [0.0151],
         [0.0240],
         [0.0079],
         [0.0105],
         [0.0178],
         [0.0022],
         [0.0012],
         [0.0023],
         [0.0193],
         [0.0186]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0719],
         [0.0602],
         [0.0517],
         [0.0175],
         [0.0291],
         [0.0119],
         [0.0286],
         [0.0252],
         [0.0286],
         [0.0083],
         [0.0086],
         [0.0206]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0201],
         [0.0133],
         [0.0200],
         [0.0135],
         [0.0203],
         [0.0178],
         [0.0203],
         [0.0097],
         [0.0195],
    


Evaluating:  10%|█████████████████▊                                                                                                                                                                 | 199/2000 [00:09<01:24, 21.35it/s][A
Evaluating:  10%|██████████████████                                                                                                                                                                 | 202/2000 [00:09<01:23, 21.53it/s][A

tensor([[[0.0547],
         [0.0716],
         [0.0424],
         [0.0327],
         [0.0223],
         [0.0350],
         [0.0124],
         [0.0431],
         [0.0107],
         [0.0144],
         [0.0106],
         [0.0444]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0450],
         [0.0625],
         [0.0712],
         [0.0349],
         [0.0487],
         [0.0140],
         [0.0360],
         [0.0063],
         [0.0056],
         [0.0129],
         [0.0300],
         [0.0199]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0189],
         [0.0318],
         [0.0339],
         [0.0153],
         [0.0083],
         [0.0101],
         [0.0094],
         [0.0080],
         [0.0018],
         [0.0035],
         [0.0105],
         [0.0082]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1506],
         [0.0499],
         [0.0159],
         [0.0108],
         [0.0130],
         [0.0076],
         [0.0634],
         [0.0140],
         [0.0451],
    


Evaluating:  10%|██████████████████▎                                                                                                                                                                | 205/2000 [00:09<01:23, 21.42it/s][A

tensor([[[0.0833],
         [0.0306],
         [0.0468],
         [0.0173],
         [0.0297],
         [0.0219],
         [0.0544],
         [0.0297],
         [0.0264],
         [0.0182],
         [0.0205],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0167],
         [0.0481],
         [0.0084],
         [0.0284],
         [0.0119],
         [0.0162],
         [0.0351],
         [0.0079],
         [0.0058],
         [0.0152],
         [0.0219]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0257],
         [0.0388],
         [0.1292],
         [0.0304],
         [0.0411],
         [0.0142],
         [0.0225],
         [0.0472],
         [0.0254],
         [0.0120],
         [0.0188],
         [0.1184]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0541],
         [0.0780],
         [0.0347],
         [0.0147],
         [0.0166],
         [0.0139],
         [0.0099],
         [0.0416],
         [0.0085],
    


Evaluating:  10%|██████████████████▌                                                                                                                                                                | 208/2000 [00:09<01:23, 21.35it/s][A

tensor([[[0.0321],
         [0.0114],
         [0.0366],
         [0.0047],
         [0.0070],
         [0.0124],
         [0.0137],
         [0.0062],
         [0.0096],
         [0.0019],
         [0.0184],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0128],
         [0.0209],
         [0.0187],
         [0.0135],
         [0.0053],
         [0.0276],
         [0.0164],
         [0.0019],
         [0.0060],
         [0.0080],
         [0.0154],
         [0.0132]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0333],
         [0.0429],
         [0.0215],
         [0.0206],
         [0.0148],
         [0.0033],
         [0.0296],
         [0.0154],
         [0.0112],
         [0.0054],
         [0.0173],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0905],
         [0.0477],
         [0.0243],
         [0.0240],
         [0.0162],
         [0.0090],
         [0.0823],
         [0.0218],
         [0.0156],
    


Evaluating:  11%|██████████████████▉                                                                                                                                                                | 211/2000 [00:09<01:23, 21.30it/s][A
Evaluating:  11%|███████████████████▏                                                                                                                                                               | 214/2000 [00:09<01:23, 21.40it/s][A

tensor([[[0.0254],
         [0.0423],
         [0.0273],
         [0.0145],
         [0.0183],
         [0.0218],
         [0.0316],
         [0.0370],
         [0.0183],
         [0.0237],
         [0.0120],
         [0.0266]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0872],
         [0.0702],
         [0.0767],
         [0.0305],
         [0.0587],
         [0.0152],
         [0.0417],
         [0.0108],
         [0.0049],
         [0.0102],
         [0.0645],
         [0.0687]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0125],
         [0.0187],
         [0.0900],
         [0.0160],
         [0.0055],
         [0.0158],
         [0.0148],
         [0.0028],
         [0.0025],
         [0.0042],
         [0.0787],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0192],
         [0.0224],
         [0.0233],
         [0.0165],
         [0.0081],
         [0.0054],
         [0.0108],
         [0.0149],
         [0.0027],
    


Evaluating:  11%|███████████████████▍                                                                                                                                                               | 217/2000 [00:10<01:23, 21.43it/s][A

tensor([[[0.0438],
         [0.0160],
         [0.0320],
         [0.0098],
         [0.0064],
         [0.0199],
         [0.0202],
         [0.0059],
         [0.0031],
         [0.0046],
         [0.0730],
         [0.0273]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0218],
         [0.0036],
         [0.0202],
         [0.0112],
         [0.0308],
         [0.0061],
         [0.0137],
         [0.0082],
         [0.0054],
         [0.0139],
         [0.0069],
         [0.0104]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0432],
         [0.0167],
         [0.0111],
         [0.0032],
         [0.0058],
         [0.0216],
         [0.0025],
         [0.0011],
         [0.0032],
         [0.0219],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0414],
         [0.0306],
         [0.0210],
         [0.0156],
         [0.0116],
         [0.0196],
         [0.0275],
         [0.0047],
         [0.0018],
    


Evaluating:  11%|███████████████████▋                                                                                                                                                               | 220/2000 [00:10<01:23, 21.44it/s][A

tensor([[[0.0111],
         [0.0230],
         [0.0602],
         [0.0126],
         [0.0307],
         [0.0167],
         [0.0090],
         [0.0124],
         [0.0060],
         [0.0061],
         [0.0269],
         [0.0266]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0206],
         [0.0906],
         [0.0679],
         [0.0378],
         [0.0097],
         [0.0159],
         [0.0133],
         [0.0059],
         [0.0030],
         [0.0128],
         [0.3625],
         [0.0629]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0215],
         [0.0965],
         [0.2555],
         [0.0117],
         [0.0145],
         [0.0072],
         [0.0191],
         [0.0187],
         [0.0053],
         [0.0052],
         [0.0460],
         [0.0630]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0457],
         [0.0587],
         [0.0264],
         [0.0118],
         [0.0202],
         [0.0142],
         [0.0098],
         [0.0546],
         [0.0095],
    


Evaluating:  11%|███████████████████▉                                                                                                                                                               | 223/2000 [00:10<01:23, 21.27it/s][A
Evaluating:  11%|████████████████████▏                                                                                                                                                              | 226/2000 [00:10<01:23, 21.34it/s][A


tensor([[[0.0058],
         [0.0106],
         [0.0190],
         [0.0066],
         [0.0090],
         [0.0098],
         [0.0147],
         [0.0027],
         [0.0032],
         [0.0028],
         [0.0223],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0284],
         [0.0072],
         [0.0069],
         [0.0086],
         [0.0080],
         [0.0138],
         [0.0046],
         [0.0067],
         [0.0035],
         [0.0048],
         [0.0072],
         [0.0063]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0632],
         [0.0372],
         [0.0685],
         [0.0124],
         [0.0358],
         [0.0384],
         [0.0125],
         [0.0245],
         [0.0134],
         [0.0119],
         [0.0683],
         [0.0293]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0143],
         [0.0193],
         [0.0050],
         [0.0160],
         [0.0127],
         [0.0125],
         [0.0139],
         [0.0025],
   


Evaluating:  11%|████████████████████▍                                                                                                                                                              | 229/2000 [00:10<01:23, 21.34it/s][A

tensor([[[0.0230],
         [0.0290],
         [0.0354],
         [0.0088],
         [0.0170],
         [0.0303],
         [0.0374],
         [0.0047],
         [0.0066],
         [0.0053],
         [0.0375],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2470],
         [0.1135],
         [0.0309],
         [0.0217],
         [0.0190],
         [0.0121],
         [0.1160],
         [0.0163],
         [0.0066],
         [0.0687],
         [0.0135],
         [0.0204]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0874],
         [0.0480],
         [0.0155],
         [0.0260],
         [0.0123],
         [0.0255],
         [0.0150],
         [0.0103],
         [0.0116],
         [0.0286],
         [0.0235]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0236],
         [0.1383],
         [0.0115],
         [0.0216],
         [0.0132],
         [0.0050],
         [0.0049],
         [0.0013],
    


Evaluating:  12%|████████████████████▊                                                                                                                                                              | 232/2000 [00:10<01:22, 21.33it/s][A

tensor([[[0.0348],
         [0.0307],
         [0.1026],
         [0.0349],
         [0.0706],
         [0.0113],
         [0.0155],
         [0.0601],
         [0.0131],
         [0.0124],
         [0.0162],
         [0.0529]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0279],
         [0.0578],
         [0.0575],
         [0.0200],
         [0.0550],
         [0.0341],
         [0.0093],
         [0.0139],
         [0.0077],
         [0.0123],
         [0.0300],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0413],
         [0.0102],
         [0.0130],
         [0.0153],
         [0.0411],
         [0.0352],
         [0.0109],
         [0.0059],
         [0.0085],
         [0.0040],
         [0.0224],
         [0.0045]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0078],
         [0.0480],
         [0.0247],
         [0.0388],
         [0.0122],
         [0.0130],
         [0.0099],
         [0.0103],
    


Evaluating:  12%|█████████████████████                                                                                                                                                              | 235/2000 [00:10<01:23, 21.24it/s][A
Evaluating:  12%|█████████████████████▎                                                                                                                                                             | 238/2000 [00:11<01:22, 21.45it/s][A

tensor([[[0.0199],
         [0.0092],
         [0.0141],
         [0.0031],
         [0.0121],
         [0.0289],
         [0.0086],
         [0.0101],
         [0.0025],
         [0.0039],
         [0.0096],
         [0.0044]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0172],
         [0.0251],
         [0.0096],
         [0.0109],
         [0.0164],
         [0.0147],
         [0.0088],
         [0.0112],
         [0.0048],
         [0.0082],
         [0.0260],
         [0.0105]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.1027],
         [0.0314],
         [0.0262],
         [0.0297],
         [0.0717],
         [0.0913],
         [0.0128],
         [0.0295],
         [0.0193],
         [0.0460],
         [0.0546]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0467],
         [0.0363],
         [0.0258],
         [0.0088],
         [0.0093],
         [0.0028],
         [0.0105],
         [0.0033],
         [0.0027],
    


Evaluating:  12%|█████████████████████▌                                                                                                                                                             | 241/2000 [00:11<01:22, 21.42it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0093],
         [0.0069],
         [0.0103],
         [0.0027],
         [0.0032],
         [0.0068],
         [0.0091],
         [0.0011],
         [0.0018],
         [0.0016],
         [0.0059],
         [0.0058]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0940],
         [0.0679],
         [0.0819],
         [0.0154],
         [0.0322],
         [0.0178],
         [0.0365],
         [0.0112],
         [0.0134],
         [0.0093],
         [0.0204],
         [0.0889]]], device='cuda:0')
reg attention sum per layer
tensor([[[6.3243e-03],
         [2.2321e-03],
         [1.2074e-03],
         [1.4905e-03],
         [3.2049e-03],
         [5.4602e-03],
         [1.9055e-03],
         [4.0296e-03],
         [8.8789e-05],
         [2.9412


Evaluating:  12%|█████████████████████▊                                                                                                                                                             | 244/2000 [00:11<01:22, 21.30it/s][A

tensor([[[0.0255],
         [0.0545],
         [0.0405],
         [0.0253],
         [0.0460],
         [0.0409],
         [0.0069],
         [0.0035],
         [0.0033],
         [0.0046],
         [0.1469],
         [0.1360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0784],
         [0.0612],
         [0.0309],
         [0.0111],
         [0.0092],
         [0.0050],
         [0.1141],
         [0.0097],
         [0.0119],
         [0.0112],
         [0.0069],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0107],
         [0.0121],
         [0.0315],
         [0.0076],
         [0.0061],
         [0.0140],
         [0.0090],
         [0.0023],
         [0.0018],
         [0.0031],
         [0.0238],
         [0.0158]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0076],
         [0.0328],
         [0.0040],
         [0.0193],
         [0.0042],
         [0.0129],
         [0.0008],
         [0.0023],
    


Evaluating:  12%|██████████████████████                                                                                                                                                             | 247/2000 [00:11<01:21, 21.40it/s][A
Evaluating:  12%|██████████████████████▍                                                                                                                                                            | 250/2000 [00:11<01:21, 21.52it/s][A

tensor([[[0.0578],
         [0.1349],
         [0.0480],
         [0.0255],
         [0.0301],
         [0.0149],
         [0.0192],
         [0.1023],
         [0.0133],
         [0.0172],
         [0.0273],
         [0.0498]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0541],
         [0.0291],
         [0.0403],
         [0.0443],
         [0.0422],
         [0.0055],
         [0.0021],
         [0.0039],
         [0.0011],
         [0.0035],
         [0.1422],
         [0.0222]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0460],
         [0.0223],
         [0.0865],
         [0.0160],
         [0.0355],
         [0.0150],
         [0.0200],
         [0.0130],
         [0.0176],
         [0.0098],
         [0.0195],
         [0.0427]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0407],
         [0.2298],
         [0.1536],
         [0.0648],
         [0.1210],
         [0.0043],
         [0.0244],
         [0.0127],
         [0.0037],
    


Evaluating:  13%|██████████████████████▋                                                                                                                                                            | 253/2000 [00:11<01:21, 21.46it/s][A

tensor([[[0.0372],
         [0.0284],
         [0.0336],
         [0.0212],
         [0.0267],
         [0.0136],
         [0.0188],
         [0.0169],
         [0.0095],
         [0.0079],
         [0.0213],
         [0.0210]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0248],
         [0.0267],
         [0.0867],
         [0.0357],
         [0.0436],
         [0.0055],
         [0.0153],
         [0.0092],
         [0.0100],
         [0.0202],
         [0.0074],
         [0.0413]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0465],
         [0.0485],
         [0.0390],
         [0.0143],
         [0.0304],
         [0.0116],
         [0.0269],
         [0.0298],
         [0.0681],
         [0.0180],
         [0.0082],
         [0.0715]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0069],
         [0.0194],
         [0.0463],
         [0.0140],
         [0.0109],
         [0.0042],
         [0.0037],
         [0.0029],
         [0.0011],
    


Evaluating:  13%|██████████████████████▉                                                                                                                                                            | 256/2000 [00:11<01:21, 21.52it/s][A

tensor([[[0.0347],
         [0.0215],
         [0.0488],
         [0.0204],
         [0.0105],
         [0.0098],
         [0.0384],
         [0.0067],
         [0.0124],
         [0.0105],
         [0.0458],
         [0.0189]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0808],
         [0.0750],
         [0.0988],
         [0.0444],
         [0.0542],
         [0.0145],
         [0.0369],
         [0.0092],
         [0.0152],
         [0.0115],
         [0.0951],
         [0.0414]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0652],
         [0.0565],
         [0.0580],
         [0.0350],
         [0.0283],
         [0.0134],
         [0.0354],
         [0.0376],
         [0.0555],
         [0.0221],
         [0.0152],
         [0.07


Evaluating:  13%|███████████████████████▏                                                                                                                                                           | 259/2000 [00:12<01:21, 21.39it/s][A
Evaluating:  13%|███████████████████████▍                                                                                                                                                           | 262/2000 [00:12<01:20, 21.51it/s][A


reg attention sum per layer
tensor([[[0.0327],
         [0.0228],
         [0.0166],
         [0.0078],
         [0.0190],
         [0.0110],
         [0.0175],
         [0.0182],
         [0.0099],
         [0.0092],
         [0.0037],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0824],
         [0.0298],
         [0.0993],
         [0.0140],
         [0.0384],
         [0.1074],
         [0.0087],
         [0.0325],
         [0.0059],
         [0.0057],
         [0.0740],
         [0.0530]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0740],
         [0.0661],
         [0.0455],
         [0.0200],
         [0.0385],
         [0.0139],
         [0.0237],
         [0.0319],
         [0.0042],
         [0.0156],
         [0.0163],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2115],
         [0.1892],
         [0.0287],
         [0.0229],
         [0.0380],
         [0.0068],
         [0.1123],
         [0.0


Evaluating:  13%|███████████████████████▋                                                                                                                                                           | 265/2000 [00:12<01:20, 21.46it/s][A


reg attention sum per layer
tensor([[[0.1002],
         [0.0606],
         [0.0495],
         [0.0150],
         [0.0233],
         [0.0118],
         [0.0336],
         [0.0415],
         [0.0227],
         [0.0384],
         [0.0109],
         [0.0425]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0569],
         [0.1321],
         [0.0548],
         [0.0152],
         [0.0261],
         [0.0298],
         [0.0202],
         [0.0118],
         [0.0057],
         [0.0098],
         [0.0447],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0078],
         [0.0067],
         [0.0297],
         [0.0066],
         [0.0103],
         [0.0014],
         [0.0046],
         [0.0077],
         [0.0007],
         [0.0054],
         [0.0174],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0169],
         [0.0908],
         [0.0159],
         [0.0139],
         [0.0314],
         [0.0216],
         [0.0


Evaluating:  13%|███████████████████████▉                                                                                                                                                           | 268/2000 [00:12<01:20, 21.51it/s][A
Evaluating:  14%|████████████████████████▎                                                                                                                                                          | 271/2000 [00:12<01:20, 21.55it/s][A

reg attention sum per layer
tensor([[[0.2119],
         [0.1665],
         [0.0460],
         [0.0155],
         [0.0112],
         [0.0089],
         [0.0868],
         [0.0247],
         [0.0070],
         [0.0164],
         [0.0084],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0293],
         [0.0707],
         [0.0176],
         [0.0636],
         [0.0121],
         [0.0153],
         [0.0158],
         [0.0103],
         [0.0058],
         [0.0166],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0260],
         [0.0768],
         [0.0778],
         [0.0203],
         [0.0656],
         [0.0506],
         [0.0132],
         [0.0463],
         [0.0122],
         [0.0142],
         [0.0445],
         [0.0545]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0167],
         [0.0242],
         [0.0123],
         [0.0145],
         [0.0072],
         [0.0214],
         [0.02


Evaluating:  14%|████████████████████████▌                                                                                                                                                          | 274/2000 [00:12<01:20, 21.49it/s][A

tensor([[[0.0178],
         [0.0851],
         [0.0147],
         [0.0105],
         [0.0045],
         [0.0130],
         [0.0049],
         [0.0251],
         [0.0009],
         [0.0022],
         [0.0089],
         [0.0138]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.0304],
         [0.0371],
         [0.0113],
         [0.0218],
         [0.0077],
         [0.0200],
         [0.0085],
         [0.0097],
         [0.0092],
         [0.0180],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0247],
         [0.0180],
         [0.0115],
         [0.0038],
         [0.0037],
         [0.0028],
         [0.0244],
         [0.0027],
         [0.0153],
         [0.0117],
         [0.0141],
         [0.0046]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0286],
         [0.0630],
         [0.0051],
         [0.0054],
         [0.0081],
         [0.0037],
         [0.0022],
         [0.0011],
    


Evaluating:  14%|████████████████████████▊                                                                                                                                                          | 277/2000 [00:12<01:20, 21.44it/s][A

tensor([[[0.0044],
         [0.0148],
         [0.0509],
         [0.0068],
         [0.0131],
         [0.0261],
         [0.0075],
         [0.0009],
         [0.0022],
         [0.0013],
         [0.0621],
         [0.0298]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0567],
         [0.0218],
         [0.0165],
         [0.0176],
         [0.0158],
         [0.0221],
         [0.0302],
         [0.0099],
         [0.0035],
         [0.0120],
         [0.0239],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0202],
         [0.0196],
         [0.0371],
         [0.0138],
         [0.0174],
         [0.0099],
         [0.0246],
         [0.0038],
         [0.0030],
         [0.0026],
         [0.0108],
         [0.0276]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0467],
         [0.0293],
         [0.0072],
         [0.0133],
         [0.0335],
         [0.0226],
         [0.0258],
         [0.0017],
         [0.0034],
    


Evaluating:  14%|█████████████████████████                                                                                                                                                          | 280/2000 [00:13<01:20, 21.50it/s][A
Evaluating:  14%|█████████████████████████▎                                                                                                                                                         | 283/2000 [00:13<01:20, 21.45it/s][A

tensor([[[0.0618],
         [0.0296],
         [0.1250],
         [0.0268],
         [0.0241],
         [0.0092],
         [0.0910],
         [0.0228],
         [0.0163],
         [0.0145],
         [0.0173],
         [0.1358]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0063],
         [0.0057],
         [0.0240],
         [0.0037],
         [0.0040],
         [0.0028],
         [0.0026],
         [0.0007],
         [0.0020],
         [0.0026],
         [0.0105],
         [0.0087]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0360],
         [0.0458],
         [0.0664],
         [0.0117],
         [0.0379],
         [0.0130],
         [0.0134],
         [0.0275],
         [0.0076],
         [0.0146],
         [0.0528],
         [0.0342]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0277],
         [0.0180],
         [0.0289],
         [0.0043],
         [0.0076],
         [0.0180],
         [0.0327],
         [0.0196],
         [0.0025],
    


Evaluating:  14%|█████████████████████████▌                                                                                                                                                         | 286/2000 [00:13<01:20, 21.41it/s][A

tensor([[[0.0423],
         [0.0248],
         [0.0300],
         [0.0192],
         [0.0208],
         [0.0165],
         [0.0105],
         [0.0240],
         [0.0096],
         [0.0063],
         [0.0170],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.0353],
         [0.1034],
         [0.0101],
         [0.0191],
         [0.0059],
         [0.0114],
         [0.0005],
         [0.0069],
         [0.0012],
         [0.0988],
         [0.0157]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.0330],
         [0.0613],
         [0.0402],
         [0.0304],
         [0.0176],
         [0.0208],
         [0.0017],
         [0.0019],
         [0.0083],
         [0.2910],
         [0.0324]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.0170],
         [0.1085],
         [0.0364],
         [0.0228],
         [0.0028],
         [0.0118],
         [0.0199],
         [0.0062],
    


Evaluating:  14%|█████████████████████████▊                                                                                                                                                         | 289/2000 [00:13<01:19, 21.39it/s][A

tensor([[[0.0665],
         [0.0936],
         [0.2675],
         [0.1623],
         [0.1471],
         [0.0335],
         [0.0215],
         [0.1604],
         [0.0600],
         [0.0145],
         [0.1657],
         [0.1449]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0195],
         [0.0294],
         [0.0154],
         [0.0157],
         [0.0056],
         [0.0178],
         [0.0072],
         [0.0016],
         [0.0084],
         [0.0060],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0860],
         [0.0353],
         [0.0148],
         [0.0148],
         [0.0289],
         [0.0226],
         [0.0130],
         [0.0062],
         [0.0118],
         [0.0198],
         [0.0114],
         [0.0200]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0433],
         [0.0195],
         [0.1221],
         [0.0149],
         [0.0296],
         [0.0095],
         [0.0156],
         [0.0072],
         [0.0035],
    


Evaluating:  15%|██████████████████████████▏                                                                                                                                                        | 292/2000 [00:13<01:19, 21.46it/s][A
Evaluating:  15%|██████████████████████████▍                                                                                                                                                        | 295/2000 [00:13<01:19, 21.47it/s][A

tensor([[[0.0099],
         [0.0606],
         [0.0374],
         [0.0047],
         [0.0080],
         [0.0052],
         [0.0074],
         [0.0037],
         [0.0027],
         [0.0087],
         [0.0469],
         [0.0252]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0118],
         [0.0426],
         [0.0139],
         [0.0086],
         [0.0033],
         [0.0122],
         [0.0022],
         [0.0018],
         [0.0022],
         [0.0107],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0441],
         [0.0327],
         [0.0383],
         [0.0098],
         [0.0343],
         [0.0264],
         [0.0304],
         [0.0095],
         [0.0082],
         [0.0089],
         [0.0513],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0194],
         [0.0131],
         [0.0026],
         [0.0043],
         [0.0022],
         [0.0140],
         [0.0014],
         [0.0020],
    


Evaluating:  15%|██████████████████████████▋                                                                                                                                                        | 298/2000 [00:13<01:19, 21.52it/s][A

tensor([[[0.0476],
         [0.0409],
         [0.0447],
         [0.0154],
         [0.0094],
         [0.0091],
         [0.0222],
         [0.0204],
         [0.0117],
         [0.0081],
         [0.0835],
         [0.0809]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0783],
         [0.0645],
         [0.0544],
         [0.0213],
         [0.0457],
         [0.0122],
         [0.0336],
         [0.0213],
         [0.0095],
         [0.0298],
         [0.0137],
         [0.0287]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0190],
         [0.0749],
         [0.0232],
         [0.0105],
         [0.0371],
         [0.0082],
         [0.0288],
         [0.0061],
         [0.0089],
         [0.0073],
         [0.0275],
         [0.0185]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0403],
         [0.0440],
         [0.0360],
         [0.0080],
         [0.0072],
         [0.0205],
         [0.0233],
         [0.0212],
         [0.0030],
    


Evaluating:  15%|██████████████████████████▉                                                                                                                                                        | 301/2000 [00:14<01:19, 21.46it/s][A

tensor([[[0.0458],
         [0.0427],
         [0.0394],
         [0.0260],
         [0.0288],
         [0.0057],
         [0.0160],
         [0.0160],
         [0.0165],
         [0.0078],
         [0.0088],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0250],
         [0.0204],
         [0.0297],
         [0.0088],
         [0.0089],
         [0.0152],
         [0.0189],
         [0.0174],
         [0.0043],
         [0.0042],
         [0.0109],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0246],
         [0.0088],
         [0.0975],
         [0.0054],
         [0.0078],
         [0.0019],
         [0.0197],
         [0.0070],
         [0.0046],
         [0.0057],
         [0.0139],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0862],
         [0.0274],
         [0.1665],
         [0.0239],
         [0.0363],
         [0.0126],
         [0.0308],
         [0.0056],
         [0.0047],
    


Evaluating:  15%|███████████████████████████▏                                                                                                                                                       | 304/2000 [00:14<01:19, 21.38it/s][A
Evaluating:  15%|███████████████████████████▍                                                                                                                                                       | 307/2000 [00:14<01:19, 21.41it/s][A

tensor([[[0.0311],
         [0.0560],
         [0.1471],
         [0.0212],
         [0.0283],
         [0.0150],
         [0.0112],
         [0.0075],
         [0.0100],
         [0.0040],
         [0.0206],
         [0.0438]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0387],
         [0.0566],
         [0.0592],
         [0.0133],
         [0.0321],
         [0.0218],
         [0.0103],
         [0.0227],
         [0.0120],
         [0.0108],
         [0.0751],
         [0.0427]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0223],
         [0.0206],
         [0.0456],
         [0.0160],
         [0.0434],
         [0.0135],
         [0.0244],
         [0.0138],
         [0.0094],
         [0.0098],
         [0.0101],
         [0.0339]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0365],
         [0.0828],
         [0.0184],
         [0.0086],
         [0.0167],
         [0.0212],
         [0.0255],
         [0.0111],
         [0.0046],
    


Evaluating:  16%|███████████████████████████▋                                                                                                                                                       | 310/2000 [00:14<01:18, 21.43it/s][A

tensor([[[0.0310],
         [0.0306],
         [0.0714],
         [0.0171],
         [0.0264],
         [0.0143],
         [0.0105],
         [0.0228],
         [0.0052],
         [0.0049],
         [0.0599],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0384],
         [0.0346],
         [0.0068],
         [0.0028],
         [0.0060],
         [0.0079],
         [0.0228],
         [0.0086],
         [0.0045],
         [0.0100],
         [0.0043],
         [0.0075]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0217],
         [0.0262],
         [0.0373],
         [0.0096],
         [0.0095],
         [0.0100],
         [0.0099],
         [0.0086],
         [0.0015],
         [0.0017],
         [0.0205],
         [0.0177]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0084],
         [0.0634],
         [0.0061],
         [0.0109],
         [0.0096],
         [0.0030],
         [0.0130],
         [0.0034],
         [0.0014],
    


Evaluating:  16%|████████████████████████████                                                                                                                                                       | 313/2000 [00:14<01:18, 21.45it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0315],
         [0.0456],
         [0.0366],
         [0.0288],
         [0.0146],
         [0.0176],
         [0.0121],
         [0.0194],
         [0.0039],
         [0.0041],
         [0.0129],
         [0.0572]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0191],
         [0.0594],
         [0.0227],
         [0.0117],
         [0.0140],
         [0.0154],
         [0.0083],
         [0.0038],
         [0.0104],
         [0.0296],
         [0.0276]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0670],
         [0.0329],
         [0.0721],
         [0.0152],
         [0.0076],
         [0.0160],
         [0.0399],
         [0.0210],
         [0.0081],
         [0.0134],
         [0.0247],
         [0.03


Evaluating:  16%|████████████████████████████▎                                                                                                                                                      | 316/2000 [00:14<01:18, 21.46it/s][A
Evaluating:  16%|████████████████████████████▌                                                                                                                                                      | 319/2000 [00:14<01:18, 21.38it/s][A


reg attention sum per layer
tensor([[[0.0195],
         [0.0236],
         [0.0233],
         [0.0071],
         [0.0178],
         [0.0098],
         [0.0139],
         [0.0160],
         [0.0081],
         [0.0124],
         [0.0090],
         [0.0383]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0142],
         [0.1437],
         [0.0943],
         [0.0221],
         [0.0181],
         [0.0087],
         [0.0102],
         [0.0027],
         [0.0041],
         [0.0038],
         [0.0353],
         [0.0185]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0424],
         [0.0263],
         [0.1197],
         [0.0170],
         [0.0301],
         [0.0141],
         [0.0144],
         [0.0458],
         [0.0093],
         [0.0161],
         [0.0312],
         [0.0546]]], device='cuda:0')
reg attention sum per layer



Evaluating:  16%|████████████████████████████▊                                                                                                                                                      | 322/2000 [00:15<01:18, 21.43it/s][A

tensor([[[0.0886],
         [0.0076],
         [0.0143],
         [0.0201],
         [0.0501],
         [0.0187],
         [0.0164],
         [0.0183],
         [0.0181],
         [0.0175],
         [0.0061],
         [0.0668]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0093],
         [0.0079],
         [0.0553],
         [0.0066],
         [0.0378],
         [0.0066],
         [0.0077],
         [0.0035],
         [0.0030],
         [0.0063],
         [0.0253],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0791],
         [0.0195],
         [0.0502],
         [0.0256],
         [0.1591],
         [0.0242],
         [0.0227],
         [0.0237],
         [0.0182],
         [0.0299],
         [0.0057],
         [0.0201]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0684],
         [0.0619],
         [0.0963],
         [0.0212],
         [0.1441],
         [0.0179],
         [0.0714],
         [0.0176],
         [0.0443],
    


Evaluating:  16%|█████████████████████████████                                                                                                                                                      | 325/2000 [00:15<01:18, 21.36it/s][A


reg attention sum per layer
tensor([[[0.1220],
         [0.0184],
         [0.0468],
         [0.0162],
         [0.0225],
         [0.0144],
         [0.0477],
         [0.0295],
         [0.0083],
         [0.0688],
         [0.0215],
         [0.0210]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0201],
         [0.0823],
         [0.0134],
         [0.0147],
         [0.0072],
         [0.0092],
         [0.0230],
         [0.0016],
         [0.0019],
         [0.0182],
         [0.0733],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0516],
         [0.0254],
         [0.2090],
         [0.0191],
         [0.0612],
         [0.0415],
         [0.0372],
         [0.0071],
         [0.0248],
         [0.0280],
         [0.1405],
         [0.0431]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0116],
         [0.0147],
         [0.0263],
         [0.0053],
         [0.0098],
         [0.0047],
         [0.0011],
         [0.0


Evaluating:  16%|█████████████████████████████▎                                                                                                                                                     | 328/2000 [00:15<01:18, 21.35it/s][A
Evaluating:  17%|█████████████████████████████▌                                                                                                                                                     | 331/2000 [00:15<01:17, 21.44it/s][A


reg attention sum per layer
tensor([[[0.0649],
         [0.0339],
         [0.0519],
         [0.0409],
         [0.0610],
         [0.0223],
         [0.0485],
         [0.0269],
         [0.0260],
         [0.0226],
         [0.0130],
         [0.0189]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0599],
         [0.0251],
         [0.0071],
         [0.0059],
         [0.0020],
         [0.0093],
         [0.0045],
         [0.0009],
         [0.0023],
         [0.0440],
         [0.0237]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0492],
         [0.0114],
         [0.0169],
         [0.0065],
         [0.0077],
         [0.0091],
         [0.0026],
         [0.0131],
         [0.0004],
         [0.0019],
         [0.0053],
         [0.0053]]], device='cuda:0')
reg attention sum per layer



Evaluating:  17%|█████████████████████████████▉                                                                                                                                                     | 334/2000 [00:15<01:17, 21.45it/s][A

tensor([[[0.0960],
         [0.0814],
         [0.0321],
         [0.0213],
         [0.0291],
         [0.0181],
         [0.0392],
         [0.0225],
         [0.0151],
         [0.0239],
         [0.0202],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1378],
         [0.0738],
         [0.0087],
         [0.0138],
         [0.0314],
         [0.0228],
         [0.0209],
         [0.0083],
         [0.0090],
         [0.0197],
         [0.0281],
         [0.0104]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0064],
         [0.0115],
         [0.0167],
         [0.0045],
         [0.0020],
         [0.0054],
         [0.0057],
         [0.0019],
         [0.0002],
         [0.0005],
         [0.0533],
         [0.0247]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0344],
         [0.0484],
         [0.0736],
         [0.0372],
         [0.0414],
         [0.0161],
         [0.0095],
         [0.0252],
         [0.0104],
    


Evaluating:  17%|██████████████████████████████▏                                                                                                                                                    | 337/2000 [00:15<01:17, 21.32it/s][A


reg attention sum per layer
tensor([[[0.0469],
         [0.0589],
         [0.1210],
         [0.0335],
         [0.1104],
         [0.0299],
         [0.0245],
         [0.0200],
         [0.0163],
         [0.0182],
         [0.0769],
         [0.0917]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0114],
         [0.0290],
         [0.0601],
         [0.0129],
         [0.0075],
         [0.0194],
         [0.0101],
         [0.0023],
         [0.0007],
         [0.0012],
         [0.1159],
         [0.0360]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0189],
         [0.0494],
         [0.0847],
         [0.0088],
         [0.0071],
         [0.0117],
         [0.0020],
         [0.0070],
         [0.0013],
         [0.0021],
         [0.0554],
         [0.0201]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0154],
         [0.0229],
         [0.0113],
         [0.0327],
         [0.0333],
         [0.0056],
         [0.0


Evaluating:  17%|██████████████████████████████▍                                                                                                                                                    | 340/2000 [00:15<01:17, 21.33it/s][A
Evaluating:  17%|██████████████████████████████▋                                                                                                                                                    | 343/2000 [00:16<01:17, 21.33it/s][A


reg attention sum per layer
tensor([[[0.0890],
         [0.0232],
         [0.0356],
         [0.0150],
         [0.0199],
         [0.0109],
         [0.0182],
         [0.0169],
         [0.0054],
         [0.0133],
         [0.0060],
         [0.0545]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0473],
         [0.0294],
         [0.0299],
         [0.0241],
         [0.0154],
         [0.0241],
         [0.0276],
         [0.0091],
         [0.0076],
         [0.0397],
         [0.0293]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0619],
         [0.0410],
         [0.0316],
         [0.0090],
         [0.0119],
         [0.0300],
         [0.0161],
         [0.0111],
         [0.0098],
         [0.0099],
         [0.0200],
         [0.0215]]], device='cuda:0')
reg attention sum per layer



Evaluating:  17%|██████████████████████████████▉                                                                                                                                                    | 346/2000 [00:16<01:17, 21.29it/s][A

tensor([[[0.0181],
         [0.0198],
         [0.0459],
         [0.0154],
         [0.0118],
         [0.0041],
         [0.0105],
         [0.0079],
         [0.0086],
         [0.0059],
         [0.0145],
         [0.0358]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0054],
         [0.0072],
         [0.0184],
         [0.0027],
         [0.0062],
         [0.0042],
         [0.0073],
         [0.0110],
         [0.0026],
         [0.0038],
         [0.0041],
         [0.0032]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0285],
         [0.0135],
         [0.0335],
         [0.0150],
         [0.0259],
         [0.0202],
         [0.0182],
         [0.0199],
         [0.0040],
         [0.0054],
         [0.0086],
         [0.0303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0305],
         [0.0444],
         [0.0088],
         [0.0050],
         [0.0116],
         [0.0156],
         [0.0050],
         [0.0017],
    


Evaluating:  17%|███████████████████████████████▏                                                                                                                                                   | 349/2000 [00:16<01:17, 21.34it/s][A



reg attention sum per layer
tensor([[[0.0121],
         [0.0140],
         [0.0191],
         [0.0099],
         [0.0133],
         [0.0123],
         [0.0162],
         [0.0046],
         [0.0015],
         [0.0039],
         [0.0247],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0255],
         [0.0252],
         [0.0066],
         [0.0174],
         [0.0347],
         [0.0257],
         [0.0071],
         [0.0033],
         [0.0095],
         [0.0266],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0353],
         [0.0369],
         [0.0295],
         [0.0096],
         [0.0514],
         [0.0078],
         [0.0145],
         [0.0151],
         [0.0067],
         [0.0075],
         [0.0159],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0613],
         [0.0608],
         [0.0227],
         [0.0384],
         [0.0241],
         [0.0230],
         [0.0350],
         [0.0

Evaluating:  18%|███████████████████████████████▌                                                                                                                                                   | 352/2000 [00:16<01:16, 21.43it/s][A
Evaluating:  18%|███████████████████████████████▊                                                                                                                                                   | 355/2000 [00:16<01:17, 21.36it/s][A

reg attention sum per layer
tensor([[[0.0376],
         [0.0245],
         [0.0199],
         [0.0267],
         [0.0173],
         [0.0107],
         [0.0984],
         [0.0214],
         [0.0142],
         [0.0399],
         [0.0117],
         [0.0230]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0657],
         [0.1578],
         [0.0372],
         [0.0163],
         [0.0189],
         [0.0076],
         [0.0118],
         [0.0386],
         [0.0080],
         [0.0064],
         [0.0094],
         [0.0292]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1038],
         [0.0477],
         [0.0111],
         [0.0047],
         [0.0025],
         [0.0054],
         [0.0225],
         [0.0063],
         [0.0018],
         [0.0038],
         [0.0018],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0074],
         [0.0368],
         [0.0361],
         [0.0075],
         [0.0203],
         [0.0058],
         [0.0183],
         [0.00


Evaluating:  18%|████████████████████████████████                                                                                                                                                   | 358/2000 [00:16<01:16, 21.35it/s][A



reg attention sum per layer
tensor([[[2.1138e-03],
         [3.5237e-03],
         [2.7444e-04],
         [2.3497e-03],
         [4.2120e-04],
         [1.0470e-03],
         [3.9601e-04],
         [3.1599e-04],
         [8.0219e-05],
         [2.7102e-04],
         [1.3130e-02],
         [8.0229e-03]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0553],
         [0.0455],
         [0.0605],
         [0.0226],
         [0.0222],
         [0.0278],
         [0.0168],
         [0.0212],
         [0.0161],
         [0.0113],
         [0.1203],
         [0.0345]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1820],
         [0.0542],
         [0.0242],
         [0.0194],
         [0.0213],
         [0.0082],
         [0.0278],
         [0.0139],
         [0.0206],
         [0.0195],
         [0.0044],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0362],
         [0.0196],
         [0.0089],
         [0.0041],
         [0.0191],
   

Evaluating:  18%|████████████████████████████████▎                                                                                                                                                  | 361/2000 [00:16<01:16, 21.39it/s][A
Evaluating:  18%|████████████████████████████████▌                                                                                                                                                  | 364/2000 [00:17<01:16, 21.42it/s][A

reg attention sum per layer
tensor([[[0.0054],
         [0.0080],
         [0.0257],
         [0.0065],
         [0.0161],
         [0.0149],
         [0.0051],
         [0.0087],
         [0.0050],
         [0.0033],
         [0.0171],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0667],
         [0.0399],
         [0.0209],
         [0.0215],
         [0.0257],
         [0.0254],
         [0.0283],
         [0.0384],
         [0.0126],
         [0.0136],
         [0.0140],
         [0.0365]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0320],
         [0.0618],
         [0.0343],
         [0.0179],
         [0.0409],
         [0.0348],
         [0.0157],
         [0.0034],
         [0.0039],
         [0.0059],
         [0.0806],
         [0.0647]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0168],
         [0.0380],
         [0.0312],
         [0.0160],
         [0.0311],
         [0.0387],
         [0.0090],
         [0.02


Evaluating:  18%|████████████████████████████████▊                                                                                                                                                  | 367/2000 [00:17<01:16, 21.39it/s][A

tensor([[[0.1029],
         [0.0102],
         [0.0647],
         [0.0303],
         [0.0322],
         [0.0089],
         [0.0316],
         [0.0176],
         [0.0229],
         [0.0195],
         [0.0082],
         [0.0356]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0131],
         [0.1387],
         [0.0181],
         [0.0322],
         [0.0144],
         [0.0061],
         [0.0208],
         [0.0057],
         [0.0049],
         [0.0272],
         [0.0713]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0068],
         [0.0031],
         [0.0035],
         [0.0121],
         [0.0073],
         [0.0075],
         [0.0042],
         [0.0013],
         [0.0044],
         [0.0072],
         [0.0067]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0205],
         [0.0087],
         [0.0077],
         [0.0088],
         [0.0045],
         [0.0037],
         [0.0777],
         [0.0110],
         [0.0085],
    


Evaluating:  18%|█████████████████████████████████                                                                                                                                                  | 370/2000 [00:17<01:16, 21.25it/s][A
Evaluating:  19%|█████████████████████████████████▍                                                                                                                                                 | 373/2000 [00:17<01:17, 21.05it/s][A

tensor([[[0.0634],
         [0.0345],
         [0.0212],
         [0.0130],
         [0.0236],
         [0.0034],
         [0.0142],
         [0.0067],
         [0.0030],
         [0.0057],
         [0.0116],
         [0.0146]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0110],
         [0.0101],
         [0.0080],
         [0.0162],
         [0.0053],
         [0.0119],
         [0.0024],
         [0.0559],
         [0.0066],
         [0.0076],
         [0.0167]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0398],
         [0.0183],
         [0.0358],
         [0.0128],
         [0.0355],
         [0.0098],
         [0.0102],
         [0.0069],
         [0.0094],
         [0.0034],
         [0.0248],
         [0.0411]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0452],
         [0.0114],
         [0.0203],
         [0.0023],
         [0.0127],
         [0.0107],
         [0.0122],
         [0.0074],
         [0.0061],
    


Evaluating:  19%|█████████████████████████████████▋                                                                                                                                                 | 376/2000 [00:17<01:17, 21.00it/s][A

tensor([[[0.0203],
         [0.0668],
         [0.0792],
         [0.0163],
         [0.0139],
         [0.0113],
         [0.0167],
         [0.0368],
         [0.0193],
         [0.0094],
         [0.0358],
         [0.0164]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0358],
         [0.0120],
         [0.0260],
         [0.0062],
         [0.0083],
         [0.0158],
         [0.0109],
         [0.0143],
         [0.0069],
         [0.0144],
         [0.0060],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0135],
         [0.0121],
         [0.0370],
         [0.0082],
         [0.0060],
         [0.0065],
         [0.0474],
         [0.0046],
         [0.0044],
         [0.0056],
         [0.0083],
         [0.0244]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0099],
         [0.0050],
         [0.0284],
         [0.0070],
         [0.0152],
         [0.0169],
         [0.0123],
         [0.0050],
         [0.0034],
    


Evaluating:  19%|█████████████████████████████████▉                                                                                                                                                 | 379/2000 [00:17<01:17, 20.84it/s][A


tensor([[[0.0325],
         [0.0187],
         [0.0666],
         [0.0357],
         [0.0375],
         [0.0070],
         [0.0094],
         [0.0078],
         [0.0034],
         [0.0049],
         [0.0148],
         [0.0132]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0371],
         [0.0227],
         [0.0370],
         [0.0080],
         [0.0159],
         [0.0144],
         [0.0376],
         [0.0158],
         [0.0051],
         [0.0122],
         [0.0066],
         [0.0228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0131],
         [0.0075],
         [0.0131],
         [0.0057],
         [0.0132],
         [0.0129],
         [0.0148],
         [0.0054],
         [0.0036],
         [0.0034],
         [0.0124],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0207],
         [0.0299],
         [0.0078],
         [0.0117],
         [0.0227],
         [0.0096],
         [0.0024],
         [0.0035],
   


Evaluating:  19%|██████████████████████████████████▏                                                                                                                                                | 382/2000 [00:17<01:17, 20.81it/s][A
Evaluating:  19%|██████████████████████████████████▍                                                                                                                                                | 385/2000 [00:18<01:17, 20.88it/s][A


tensor([[[0.0396],
         [0.0295],
         [0.1044],
         [0.0087],
         [0.0386],
         [0.0251],
         [0.0348],
         [0.0133],
         [0.0085],
         [0.0063],
         [0.0251],
         [0.0547]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0281],
         [0.0140],
         [0.0612],
         [0.0120],
         [0.0260],
         [0.0139],
         [0.0052],
         [0.0096],
         [0.0018],
         [0.0053],
         [0.0104],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0334],
         [0.0222],
         [0.0393],
         [0.0090],
         [0.0126],
         [0.0140],
         [0.0218],
         [0.0036],
         [0.0030],
         [0.0033],
         [0.0104],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0087],
         [0.0636],
         [0.0251],
         [0.0715],
         [0.0150],
         [0.0156],
         [0.0190],
         [0.0004],
         [0.0077],
   


Evaluating:  19%|██████████████████████████████████▋                                                                                                                                                | 388/2000 [00:18<01:16, 21.01it/s][A

tensor([[[0.0276],
         [0.0207],
         [0.1924],
         [0.0185],
         [0.0353],
         [0.0084],
         [0.0196],
         [0.0093],
         [0.1329],
         [0.0224],
         [0.0219],
         [0.0792]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0121],
         [0.1258],
         [0.0099],
         [0.0108],
         [0.0065],
         [0.0284],
         [0.0025],
         [0.0101],
         [0.0099],
         [0.0256],
         [0.0367]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.0562],
         [0.0347],
         [0.0138],
         [0.0221],
         [0.0232],
         [0.0280],
         [0.0101],
         [0.0096],
         [0.0106],
         [0.0288],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0218],
         [0.0074],
         [0.0179],
         [0.0134],
         [0.0247],
         [0.0174],
         [0.0108],
         [0.0013],
         [0.0011],
    


Evaluating:  20%|██████████████████████████████████▉                                                                                                                                                | 391/2000 [00:18<01:16, 21.11it/s][A

tensor([[[0.0578],
         [0.0193],
         [0.0591],
         [0.0089],
         [0.0065],
         [0.0159],
         [0.0132],
         [0.0052],
         [0.0017],
         [0.0065],
         [0.0112],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.0111],
         [0.0588],
         [0.0158],
         [0.0164],
         [0.0314],
         [0.0144],
         [0.0020],
         [0.0008],
         [0.0016],
         [0.1288],
         [0.0707]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0228],
         [0.0325],
         [0.0272],
         [0.0136],
         [0.0138],
         [0.0096],
         [0.0348],
         [0.0034],
         [0.0026],
         [0.0040],
         [0.0144],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0147],
         [0.0824],
         [0.0132],
         [0.0506],
         [0.0276],
         [0.0229],
         [0.0064],
         [0.0195],
    


Evaluating:  20%|███████████████████████████████████▎                                                                                                                                               | 394/2000 [00:18<01:15, 21.22it/s][A
Evaluating:  20%|███████████████████████████████████▌                                                                                                                                               | 397/2000 [00:18<01:14, 21.39it/s][A

tensor([[[0.0038],
         [0.0222],
         [0.0171],
         [0.0232],
         [0.0308],
         [0.0363],
         [0.0132],
         [0.0036],
         [0.0074],
         [0.0046],
         [0.0621],
         [0.0196]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0279],
         [0.0246],
         [0.0111],
         [0.0123],
         [0.0282],
         [0.0376],
         [0.0131],
         [0.0087],
         [0.0042],
         [0.0287],
         [0.0230]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.0231],
         [0.0859],
         [0.0152],
         [0.0271],
         [0.0068],
         [0.0112],
         [0.0082],
         [0.0072],
         [0.0041],
         [0.0315],
         [0.0832]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0088],
         [0.0091],
         [0.0107],
         [0.0030],
         [0.0148],
         [0.0101],
         [0.0060],
         [0.0099],
         [0.0040],
    


Evaluating:  20%|███████████████████████████████████▊                                                                                                                                               | 400/2000 [00:18<01:15, 21.33it/s][A

tensor([[[0.0109],
         [0.0348],
         [0.1024],
         [0.0073],
         [0.0114],
         [0.0133],
         [0.0043],
         [0.0099],
         [0.0017],
         [0.0017],
         [0.0163],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0068],
         [0.0145],
         [0.0952],
         [0.0117],
         [0.0325],
         [0.0105],
         [0.0042],
         [0.0286],
         [0.0088],
         [0.0050],
         [0.0165],
         [0.0291]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0925],
         [0.1035],
         [0.0745],
         [0.0389],
         [0.0219],
         [0.0153],
         [0.0433],
         [0.0141],
         [0.0098],
         [0.0146],
         [0.0103],
         [0.0324]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0153],
         [0.0111],
         [0.0276],
         [0.0093],
         [0.0882],
         [0.0235],
         [0.0172],
         [0.0142],
         [0.0106],
    


Evaluating:  20%|████████████████████████████████████                                                                                                                                               | 403/2000 [00:18<01:14, 21.33it/s][A

tensor([[[0.0134],
         [0.0393],
         [0.0692],
         [0.0116],
         [0.0139],
         [0.0147],
         [0.0314],
         [0.0215],
         [0.0095],
         [0.0054],
         [0.0337],
         [0.0376]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0424],
         [0.0464],
         [0.0197],
         [0.0154],
         [0.0162],
         [0.0099],
         [0.0460],
         [0.0299],
         [0.0357],
         [0.0179],
         [0.0054],
         [0.0099]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0392],
         [0.0154],
         [0.0200],
         [0.0076],
         [0.0070],
         [0.0123],
         [0.0129],
         [0.0076],
         [0.0032],
         [0.0026],
         [0.0093],
         [0.0082]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0055],
         [0.0119],
         [0.0718],
         [0.0309],
         [0.0177],
         [0.0081],
         [0.0026],
         [0.0071],
         [0.0021],
    


Evaluating:  20%|████████████████████████████████████▎                                                                                                                                              | 406/2000 [00:18<01:14, 21.42it/s][A
Evaluating:  20%|████████████████████████████████████▌                                                                                                                                              | 409/2000 [00:19<01:14, 21.39it/s][A

reg attention sum per layer
tensor([[[0.0461],
         [0.0540],
         [0.0858],
         [0.0240],
         [0.0387],
         [0.0123],
         [0.0131],
         [0.0062],
         [0.0019],
         [0.0050],
         [0.1489],
         [0.1840]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0070],
         [0.0321],
         [0.1553],
         [0.0141],
         [0.0806],
         [0.0322],
         [0.0612],
         [0.0089],
         [0.0070],
         [0.0029],
         [0.0987],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0459],
         [0.0575],
         [0.0344],
         [0.0197],
         [0.0088],
         [0.0240],
         [0.0142],
         [0.0013],
         [0.0011],
         [0.0046],
         [0.0390],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0166],
         [0.0521],
         [0.0581],
         [0.0146],
         [0.0128],
         [0.0229],
         [0.0324],
         [0.01


Evaluating:  21%|████████████████████████████████████▊                                                                                                                                              | 412/2000 [00:19<01:14, 21.38it/s][A
Evaluating:  21%|█████████████████████████████████████▏                                                                                                                                             | 415/2000 [00:19<01:13, 21.50it/s][A

tensor([[[0.1428],
         [0.0569],
         [0.0561],
         [0.0162],
         [0.0788],
         [0.0325],
         [0.1509],
         [0.0105],
         [0.0397],
         [0.0190],
         [0.0144],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1738],
         [0.0629],
         [0.0308],
         [0.0170],
         [0.0413],
         [0.0199],
         [0.0570],
         [0.0387],
         [0.0076],
         [0.0318],
         [0.0102],
         [0.0280]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0224],
         [0.0250],
         [0.0084],
         [0.0085],
         [0.0393],
         [0.0212],
         [0.0127],
         [0.0023],
         [0.0063],
         [0.0215],
         [0.0267]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0131],
         [0.0387],
         [0.0131],
         [0.0348],
         [0.0216],
         [0.0158],
         [0.0101],
         [0.0070],
    


Evaluating:  21%|█████████████████████████████████████▍                                                                                                                                             | 418/2000 [00:19<01:13, 21.50it/s][A

reg attention sum per layer
tensor([[[0.1272],
         [0.0863],
         [0.0363],
         [0.0206],
         [0.0525],
         [0.0216],
         [0.0671],
         [0.0234],
         [0.0299],
         [0.0245],
         [0.0079],
         [0.0335]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0111],
         [0.0134],
         [0.0058],
         [0.0019],
         [0.0099],
         [0.0025],
         [0.0008],
         [0.0004],
         [0.0012],
         [0.0186],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0161],
         [0.0144],
         [0.0336],
         [0.0109],
         [0.0173],
         [0.0211],
         [0.0034],
         [0.0056],
         [0.0012],
         [0.0047],
         [0.0379],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0703],
         [0.0513],
         [0.0860],
         [0.0441],
         [0.0494],
         [0.0160],
         [0.0340],
         [0.07


Evaluating:  21%|█████████████████████████████████████▋                                                                                                                                             | 421/2000 [00:19<01:13, 21.45it/s][A

tensor([[[0.0204],
         [0.0065],
         [0.0149],
         [0.0037],
         [0.0047],
         [0.0037],
         [0.0114],
         [0.0019],
         [0.0046],
         [0.0072],
         [0.0055],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0477],
         [0.0245],
         [0.0202],
         [0.0131],
         [0.0445],
         [0.0075],
         [0.0115],
         [0.0135],
         [0.0052],
         [0.0124],
         [0.0168],
         [0.0262]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0397],
         [0.0372],
         [0.0366],
         [0.0094],
         [0.0227],
         [0.0083],
         [0.0235],
         [0.0056],
         [0.0179],
         [0.0070],
         [0.0065],
         [0.0221]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0102],
         [0.0107],
         [0.0239],
         [0.0090],
         [0.0099],
         [0.0104],
         [0.0216],
         [0.0026],
         [0.0046],
    


Evaluating:  21%|█████████████████████████████████████▉                                                                                                                                             | 424/2000 [00:19<01:14, 21.28it/s][A
Evaluating:  21%|██████████████████████████████████████▏                                                                                                                                            | 427/2000 [00:19<01:13, 21.34it/s][A

tensor([[[0.0390],
         [0.0979],
         [0.0824],
         [0.0313],
         [0.0243],
         [0.0119],
         [0.0495],
         [0.0327],
         [0.0099],
         [0.0065],
         [0.0474],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0420],
         [0.1483],
         [0.1034],
         [0.0208],
         [0.0165],
         [0.0385],
         [0.0360],
         [0.0069],
         [0.0078],
         [0.0167],
         [0.2800],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0170],
         [0.0194],
         [0.0819],
         [0.0176],
         [0.0301],
         [0.0059],
         [0.0109],
         [0.0112],
         [0.0033],
         [0.0086],
         [0.0244],
         [0.0674]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0279],
         [0.0120],
         [0.0771],
         [0.0137],
         [0.0200],
         [0.0083],
         [0.0062],
         [0.0179],
         [0.0073],
    


Evaluating:  22%|██████████████████████████████████████▍                                                                                                                                            | 430/2000 [00:20<01:13, 21.38it/s][A

reg attention sum per layer
tensor([[[0.0242],
         [0.0323],
         [0.0691],
         [0.0212],
         [0.0371],
         [0.0367],
         [0.0230],
         [0.0326],
         [0.0088],
         [0.0103],
         [0.0678],
         [0.0753]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0827],
         [0.0434],
         [0.0306],
         [0.0285],
         [0.0261],
         [0.0168],
         [0.0188],
         [0.0085],
         [0.0031],
         [0.0085],
         [0.0309],
         [0.0438]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0307],
         [0.0500],
         [0.0342],
         [0.0115],
         [0.0220],
         [0.0209],
         [0.0122],
         [0.0106],
         [0.0031],
         [0.0056],
         [0.0478],
         [0.0342]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.


Evaluating:  22%|██████████████████████████████████████▊                                                                                                                                            | 433/2000 [00:20<01:13, 21.41it/s][A

tensor([[[0.0804],
         [0.1486],
         [0.2347],
         [0.0289],
         [0.0604],
         [0.0387],
         [0.0283],
         [0.1019],
         [0.0405],
         [0.0360],
         [0.1127],
         [0.1276]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.1716],
         [0.1375],
         [0.0508],
         [0.0386],
         [0.0281],
         [0.0169],
         [0.0986],
         [0.0191],
         [0.0099],
         [0.1639],
         [0.0936]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0566],
         [0.0470],
         [0.0695],
         [0.0241],
         [0.0520],
         [0.0429],
         [0.0134],
         [0.1231],
         [0.0094],
         [0.0179],
         [0.0236],
         [0.0475]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0164],
         [0.0109],
         [0.0242],
         [0.0156],
         [0.0356],
         [0.0206],
         [0.0171],
         [0.0152],
         [0.0196],
    


Evaluating:  22%|███████████████████████████████████████                                                                                                                                            | 436/2000 [00:20<01:13, 21.39it/s][A
Evaluating:  22%|███████████████████████████████████████▎                                                                                                                                           | 439/2000 [00:20<01:13, 21.37it/s][A

tensor([[[0.0192],
         [0.0695],
         [0.0757],
         [0.0287],
         [0.0275],
         [0.0103],
         [0.0189],
         [0.0040],
         [0.0086],
         [0.0098],
         [0.0428],
         [0.0757]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0143],
         [0.0297],
         [0.0824],
         [0.0104],
         [0.0524],
         [0.0120],
         [0.0137],
         [0.0199],
         [0.0123],
         [0.0055],
         [0.0456],
         [0.0522]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0842],
         [0.0585],
         [0.0848],
         [0.0234],
         [0.0580],
         [0.0131],
         [0.1063],
         [0.0259],
         [0.0369],
         [0.0256],
         [0.0058],
         [0.0951]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0164],
         [0.0269],
         [0.0148],
         [0.0163],
         [0.0052],
         [0.0216],
         [0.0063],
         [0.0066],
    


Evaluating:  22%|███████████████████████████████████████▌                                                                                                                                           | 442/2000 [00:20<01:13, 21.31it/s][A

tensor([[[0.0594],
         [0.0873],
         [0.0302],
         [0.0091],
         [0.0126],
         [0.0090],
         [0.0182],
         [0.0599],
         [0.0186],
         [0.0153],
         [0.0020],
         [0.0492]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0082],
         [0.0091],
         [0.0253],
         [0.0143],
         [0.0116],
         [0.0061],
         [0.0034],
         [0.0052],
         [0.0031],
         [0.0020],
         [0.0049],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0192],
         [0.0136],
         [0.0265],
         [0.0068],
         [0.0102],
         [0.0070],
         [0.0071],
         [0.0071],
         [0.0028],
         [0.0079],
         [0.0141],
         [0.0108]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0042],
         [0.0074],
         [0.0292],
         [0.0077],
         [0.0250],
         [0.0055],
         [0.0018],
         [0.0040],
         [0.0014],
    


Evaluating:  22%|███████████████████████████████████████▊                                                                                                                                           | 445/2000 [00:20<01:13, 21.27it/s][A
Evaluating:  22%|████████████████████████████████████████                                                                                                                                           | 448/2000 [00:20<01:12, 21.38it/s][A

reg attention sum per layer
tensor([[[0.0170],
         [0.0273],
         [0.0368],
         [0.0092],
         [0.0242],
         [0.0174],
         [0.0045],
         [0.0046],
         [0.0021],
         [0.0038],
         [0.0124],
         [0.0095]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0308],
         [0.0981],
         [0.0404],
         [0.0170],
         [0.0143],
         [0.0239],
         [0.0181],
         [0.0163],
         [0.0043],
         [0.0055],
         [0.0912],
         [0.0784]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0133],
         [0.0075],
         [0.0373],
         [0.0134],
         [0.0205],
         [0.0073],
         [0.0210],
         [0.0084],
         [0.0246],
         [0.0024],
         [0.0131],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0247],
         [0.0238],
         [0.1025],
         [0.0275],
         [0.0423],
         [0.0168],
         [0.0210],
         [0.01


Evaluating:  23%|████████████████████████████████████████▎                                                                                                                                          | 451/2000 [00:21<01:13, 21.10it/s][A

tensor([[[0.1582],
         [0.0336],
         [0.0431],
         [0.0604],
         [0.0517],
         [0.0144],
         [0.0652],
         [0.0939],
         [0.0088],
         [0.0213],
         [0.0081],
         [0.0327]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0547],
         [0.0283],
         [0.0239],
         [0.0101],
         [0.0168],
         [0.0141],
         [0.0123],
         [0.0258],
         [0.0095],
         [0.0317],
         [0.0116],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0116],
         [0.0132],
         [0.0734],
         [0.0082],
         [0.0167],
         [0.0121],
         [0.0084],
         [0.0121],
         [0.0026],
         [0.0091],
         [0.0089],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0767],
         [0.0296],
         [0.0358],
         [0.0164],
         [0.0275],
         [0.0117],
         [0.0483],
         [0.0083],
         [0.0150],
    


Evaluating:  23%|████████████████████████████████████████▋                                                                                                                                          | 454/2000 [00:21<01:12, 21.21it/s][A
Evaluating:  23%|████████████████████████████████████████▉                                                                                                                                          | 457/2000 [00:21<01:12, 21.43it/s][A


reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0270],
         [0.0630],
         [0.0145],
         [0.0056],
         [0.0120],
         [0.0108],
         [0.0378],
         [0.0031],
         [0.0046],
         [0.0246],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0697],
         [0.0533],
         [0.1148],
         [0.0405],
         [0.0476],
         [0.0179],
         [0.0157],
         [0.0675],
         [0.0040],
         [0.0166],
         [0.0906],
         [0.0936]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0439],
         [0.0203],
         [0.0195],
         [0.0030],
         [0.0039],
         [0.0091],
         [0.0029],
         [0.0012],
         [0.0017],
    


Evaluating:  23%|█████████████████████████████████████████▏                                                                                                                                         | 460/2000 [00:21<01:11, 21.40it/s][A

reg attention sum per layer
tensor([[[0.0173],
         [0.0184],
         [0.0200],
         [0.0029],
         [0.0084],
         [0.0091],
         [0.0102],
         [0.0017],
         [0.0020],
         [0.0025],
         [0.0449],
         [0.0124]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0221],
         [0.1325],
         [0.0080],
         [0.0190],
         [0.0049],
         [0.0456],
         [0.0034],
         [0.0030],
         [0.0077],
         [0.0981],
         [0.0665]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0523],
         [0.0855],
         [0.0714],
         [0.0225],
         [0.0524],
         [0.0256],
         [0.0195],
         [0.0076],
         [0.0060],
         [0.0036],
         [0.0566],
         [0.0382]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0283],
         [0.0681],
         [0.2205],
         [0.0281],
         [0.0487],
         [0.0141],
         [0.0230],
         [0.01


Evaluating:  23%|█████████████████████████████████████████▍                                                                                                                                         | 463/2000 [00:21<01:11, 21.38it/s][A

tensor([[[0.0092],
         [0.0125],
         [0.0213],
         [0.0355],
         [0.0131],
         [0.0057],
         [0.0300],
         [0.0016],
         [0.0010],
         [0.0043],
         [0.1289],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1575],
         [0.0602],
         [0.0809],
         [0.0171],
         [0.0228],
         [0.0049],
         [0.1546],
         [0.0091],
         [0.0132],
         [0.0200],
         [0.0087],
         [0.0523]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0847],
         [0.0121],
         [0.0192],
         [0.0080],
         [0.0262],
         [0.0105],
         [0.0222],
         [0.0031],
         [0.0059],
         [0.0180],
         [0.0055],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0060],
         [0.0941],
         [0.0115],
         [0.0160],
         [0.0083],
         [0.0248],
         [0.0025],
         [0.0017],
    


Evaluating:  23%|█████████████████████████████████████████▋                                                                                                                                         | 466/2000 [00:21<01:11, 21.41it/s][A
Evaluating:  23%|█████████████████████████████████████████▉                                                                                                                                         | 469/2000 [00:21<01:11, 21.48it/s][A

tensor([[[0.0536],
         [0.0918],
         [0.1020],
         [0.0300],
         [0.0419],
         [0.0220],
         [0.0207],
         [0.0079],
         [0.0360],
         [0.0112],
         [0.0274],
         [0.0668]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0399],
         [0.0346],
         [0.0489],
         [0.0428],
         [0.0147],
         [0.0047],
         [0.0060],
         [0.0030],
         [0.0013],
         [0.0062],
         [0.1458],
         [0.0392]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0094],
         [0.0270],
         [0.0077],
         [0.0106],
         [0.0032],
         [0.0020],
         [0.0052],
         [0.0035],
         [0.0013],
         [0.0063],
         [0.0192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0263],
         [0.0079],
         [0.0123],
         [0.0070],
         [0.0075],
         [0.0081],
         [0.0257],
         [0.0044],
         [0.0543],
    


Evaluating:  24%|██████████████████████████████████████████▏                                                                                                                                        | 472/2000 [00:22<01:11, 21.39it/s][A

tensor([[[0.0231],
         [0.0247],
         [0.0144],
         [0.0118],
         [0.0096],
         [0.0174],
         [0.0170],
         [0.0045],
         [0.0063],
         [0.0054],
         [0.0076],
         [0.0100]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0039],
         [0.1320],
         [0.0362],
         [0.0059],
         [0.0046],
         [0.0259],
         [0.0033],
         [0.0035],
         [0.0113],
         [0.0603],
         [0.0749]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0214],
         [0.0205],
         [0.0992],
         [0.0288],
         [0.0468],
         [0.0241],
         [0.0420],
         [0.0126],
         [0.0038],
         [0.0081],
         [0.0456],
         [0.1221]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0389],
         [0.0325],
         [0.0197],
         [0.0128],
         [0.0542],
         [0.0082],
         [0.0337],
         [0.0539],
         [0.0237],
    


Evaluating:  24%|██████████████████████████████████████████▌                                                                                                                                        | 475/2000 [00:22<01:11, 21.24it/s][A

tensor([[[0.0424],
         [0.1426],
         [0.0313],
         [0.0137],
         [0.0229],
         [0.0184],
         [0.0204],
         [0.0165],
         [0.0141],
         [0.0159],
         [0.0174],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0331],
         [0.0363],
         [0.0512],
         [0.0126],
         [0.1010],
         [0.0187],
         [0.0148],
         [0.0098],
         [0.0085],
         [0.0092],
         [0.0127],
         [0.0356]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0047],
         [0.0101],
         [0.0129],
         [0.0019],
         [0.0029],
         [0.0039],
         [0.0064],
         [0.0032],
         [0.0004],
         [0.0022],
         [0.0363],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0063],
         [0.0066],
         [0.0175],
         [0.0043],
         [0.0118],
         [0.0184],
         [0.0037],
         [0.0010],
         [0.0014],
    


Evaluating:  24%|██████████████████████████████████████████▊                                                                                                                                        | 478/2000 [00:22<01:11, 21.36it/s][A
Evaluating:  24%|███████████████████████████████████████████                                                                                                                                        | 481/2000 [00:22<01:10, 21.40it/s][A

tensor([[[0.0544],
         [0.1115],
         [0.1154],
         [0.0108],
         [0.0443],
         [0.0227],
         [0.0119],
         [0.0084],
         [0.0025],
         [0.0119],
         [0.1166],
         [0.0771]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0163],
         [0.0084],
         [0.0635],
         [0.0118],
         [0.0206],
         [0.0037],
         [0.0072],
         [0.0296],
         [0.0024],
         [0.0080],
         [0.0105],
         [0.0242]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0151],
         [0.0126],
         [0.0220],
         [0.0110],
         [0.0169],
         [0.0140],
         [0.0173],
         [0.0170],
         [0.0086],
         [0.0093],
         [0.0131],
         [0.0141]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0290],
         [0.0369],
         [0.0932],
         [0.0412],
         [0.0310],
         [0.0124],
         [0.0404],
         [0.0038],
         [0.0032],
    


Evaluating:  24%|███████████████████████████████████████████▎                                                                                                                                       | 484/2000 [00:22<01:10, 21.42it/s][A

tensor([[[0.0388],
         [0.0233],
         [0.0853],
         [0.0104],
         [0.0207],
         [0.0389],
         [0.0421],
         [0.0234],
         [0.0141],
         [0.0152],
         [0.0174],
         [0.0521]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0727],
         [0.0095],
         [0.0433],
         [0.0199],
         [0.0231],
         [0.0160],
         [0.0346],
         [0.0283],
         [0.0064],
         [0.0214],
         [0.0367],
         [0.0591]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0565],
         [0.0528],
         [0.0565],
         [0.0376],
         [0.0219],
         [0.0119],
         [0.0486],
         [0.0155],
         [0.0340],
         [0.0098],
         [0.0170],
         [0.0601]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0126],
         [0.0112],
         [0.0154],
         [0.0333],
         [0.0181],
         [0.0110],
         [0.0077],
         [0.0101],
    


Evaluating:  24%|███████████████████████████████████████████▌                                                                                                                                       | 487/2000 [00:22<01:10, 21.35it/s][A

tensor([[[0.0441],
         [0.0627],
         [0.0745],
         [0.0143],
         [0.0152],
         [0.0146],
         [0.0464],
         [0.0224],
         [0.0175],
         [0.0095],
         [0.0367],
         [0.0666]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0489],
         [0.0168],
         [0.0235],
         [0.0139],
         [0.0090],
         [0.0251],
         [0.0114],
         [0.0037],
         [0.0022],
         [0.0038],
         [0.0285],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0173],
         [0.0461],
         [0.0226],
         [0.0182],
         [0.0124],
         [0.0107],
         [0.0099],
         [0.0245],
         [0.0032],
         [0.0068],
         [0.0126],
         [0.0335]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0407],
         [0.0052],
         [0.0327],
         [0.0117],
         [0.0124],
         [0.0071],
         [0.0257],
         [0.0037],
         [0.0368],
    


Evaluating:  24%|███████████████████████████████████████████▊                                                                                                                                       | 490/2000 [00:22<01:10, 21.35it/s][A
Evaluating:  25%|████████████████████████████████████████████                                                                                                                                       | 493/2000 [00:23<01:10, 21.34it/s][A

tensor([[[0.0322],
         [0.0455],
         [0.0429],
         [0.0136],
         [0.0250],
         [0.0095],
         [0.0224],
         [0.0140],
         [0.0361],
         [0.0127],
         [0.0070],
         [0.0541]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0581],
         [0.0209],
         [0.0848],
         [0.0153],
         [0.0344],
         [0.0138],
         [0.0480],
         [0.0198],
         [0.0152],
         [0.0194],
         [0.0072],
         [0.0362]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1212],
         [0.4025],
         [0.1351],
         [0.0228],
         [0.0242],
         [0.0124],
         [0.0423],
         [0.0179],
         [0.0183],
         [0.0405],
         [0.1004],
         [0.1203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0537],
         [0.0279],
         [0.0282],
         [0.0093],
         [0.0112],
         [0.0129],
         [0.0563],
         [0.0090],
         [0.0168],
    


Evaluating:  25%|████████████████████████████████████████████▍                                                                                                                                      | 496/2000 [00:23<01:10, 21.38it/s][A

tensor([[[0.0543],
         [0.0126],
         [0.0576],
         [0.0206],
         [0.0256],
         [0.0114],
         [0.0408],
         [0.0181],
         [0.0714],
         [0.0125],
         [0.0107],
         [0.0588]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0791],
         [0.0256],
         [0.0159],
         [0.0091],
         [0.0494],
         [0.0275],
         [0.0828],
         [0.0126],
         [0.0418],
         [0.0265],
         [0.0024],
         [0.0290]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0359],
         [0.1136],
         [0.0652],
         [0.0119],
         [0.0107],
         [0.0148],
         [0.0300],
         [0.0282],
         [0.0071],
         [0.0085],
         [0.0289],
         [0.0240]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0205],
         [0.0075],
         [0.0032],
         [0.0063],
         [0.0078],
         [0.0149],
         [0.0189],
         [0.0007],
         [0.0011],
    


Evaluating:  25%|████████████████████████████████████████████▋                                                                                                                                      | 499/2000 [00:23<01:10, 21.37it/s][A

tensor([[[0.0197],
         [0.0452],
         [0.0638],
         [0.0298],
         [0.0189],
         [0.0181],
         [0.0134],
         [0.0121],
         [0.0100],
         [0.0060],
         [0.0464],
         [0.0628]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0071],
         [0.0207],
         [0.0293],
         [0.0130],
         [0.0386],
         [0.0058],
         [0.0099],
         [0.0024],
         [0.0031],
         [0.0036],
         [0.0092],
         [0.0471]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0588],
         [0.0608],
         [0.0295],
         [0.0540],
         [0.0075],
         [0.0379],
         [0.0109],
         [0.0399],
         [0.0252],
         [0.0206],
         [0.0183]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0401],
         [0.0388],
         [0.0968],
         [0.0296],
         [0.0087],
         [0.0199],
         [0.0081],
         [0.0335],
         [0.0039],
    


Evaluating:  25%|████████████████████████████████████████████▉                                                                                                                                      | 502/2000 [00:23<01:09, 21.45it/s][A
Evaluating:  25%|█████████████████████████████████████████████▏                                                                                                                                     | 505/2000 [00:23<01:09, 21.51it/s][A

tensor([[[0.0425],
         [0.0299],
         [0.0319],
         [0.0166],
         [0.0373],
         [0.0132],
         [0.0263],
         [0.0319],
         [0.0259],
         [0.0114],
         [0.0105],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0851],
         [0.0675],
         [0.0456],
         [0.0215],
         [0.0239],
         [0.0094],
         [0.0513],
         [0.0178],
         [0.0077],
         [0.0229],
         [0.0159],
         [0.0313]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.0346],
         [0.0188],
         [0.0060],
         [0.0057],
         [0.0357],
         [0.0167],
         [0.0077],
         [0.0049],
         [0.0036],
         [0.0188],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0114],
         [0.0383],
         [0.1351],
         [0.0177],
         [0.0069],
         [0.0040],
         [0.0101],
         [0.0042],
         [0.0015],
    


Evaluating:  25%|█████████████████████████████████████████████▍                                                                                                                                     | 508/2000 [00:23<01:09, 21.50it/s][A

tensor([[[0.0186],
         [0.0514],
         [0.0220],
         [0.0075],
         [0.0247],
         [0.0067],
         [0.0089],
         [0.0058],
         [0.0138],
         [0.0083],
         [0.0054],
         [0.0303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0219],
         [0.0461],
         [0.0569],
         [0.0101],
         [0.0047],
         [0.0047],
         [0.0270],
         [0.0170],
         [0.0079],
         [0.0137],
         [0.0312],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0519],
         [0.1439],
         [0.0467],
         [0.0141],
         [0.0380],
         [0.0177],
         [0.0146],
         [0.0169],
         [0.0042],
         [0.0045],
         [0.0712],
         [0.03


Evaluating:  26%|█████████████████████████████████████████████▋                                                                                                                                     | 511/2000 [00:23<01:09, 21.40it/s][A

tensor([[[0.0177],
         [0.0217],
         [0.0552],
         [0.0086],
         [0.0258],
         [0.0116],
         [0.0166],
         [0.0073],
         [0.0050],
         [0.0052],
         [0.0367],
         [0.0202]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0318],
         [0.0778],
         [0.0160],
         [0.0426],
         [0.0086],
         [0.0295],
         [0.0070],
         [0.0324],
         [0.0055],
         [0.0318],
         [0.0556]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0057],
         [0.0175],
         [0.0252],
         [0.0038],
         [0.0056],
         [0.0071],
         [0.0176],
         [0.0033],
         [0.0051],
         [0.0058],
         [0.0149],
         [0.0157]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.3488],
         [0.0670],
         [0.0278],
         [0.0441],
         [0.0194],
         [0.0148],
         [0.2523],
         [0.0399],
         [0.0084],
    


Evaluating:  26%|██████████████████████████████████████████████                                                                                                                                     | 514/2000 [00:24<01:09, 21.29it/s][A
Evaluating:  26%|██████████████████████████████████████████████▎                                                                                                                                    | 517/2000 [00:24<01:09, 21.30it/s][A

tensor([[[0.0508],
         [0.0508],
         [0.0339],
         [0.0165],
         [0.0267],
         [0.0216],
         [0.0268],
         [0.0538],
         [0.0108],
         [0.0141],
         [0.0103],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0332],
         [0.0164],
         [0.0250],
         [0.0122],
         [0.0206],
         [0.0036],
         [0.0145],
         [0.0273],
         [0.0055],
         [0.0154],
         [0.0047],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0529],
         [0.0222],
         [0.0261],
         [0.0076],
         [0.0129],
         [0.0057],
         [0.0446],
         [0.0078],
         [0.0731],
         [0.0105],
         [0.0038],
         [0.02


Evaluating:  26%|██████████████████████████████████████████████▌                                                                                                                                    | 520/2000 [00:24<01:09, 21.31it/s][A

tensor([[[0.0086],
         [0.0121],
         [0.0252],
         [0.0097],
         [0.0146],
         [0.0244],
         [0.0061],
         [0.0018],
         [0.0012],
         [0.0021],
         [0.0301],
         [0.0167]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0223],
         [0.0395],
         [0.0527],
         [0.0142],
         [0.0360],
         [0.0115],
         [0.0329],
         [0.1210],
         [0.0081],
         [0.0136],
         [0.0364],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0213],
         [0.0115],
         [0.0081],
         [0.0091],
         [0.0055],
         [0.0145],
         [0.0062],
         [0.0060],
         [0.0128],
         [0.0143],
         [0.00


Evaluating:  26%|██████████████████████████████████████████████▊                                                                                                                                    | 523/2000 [00:24<01:09, 21.32it/s][A

tensor([[[0.0357],
         [0.0256],
         [0.0656],
         [0.0297],
         [0.0350],
         [0.0228],
         [0.0180],
         [0.0357],
         [0.0193],
         [0.0056],
         [0.0171],
         [0.0528]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0206],
         [0.0296],
         [0.0847],
         [0.0066],
         [0.0124],
         [0.0191],
         [0.0134],
         [0.0025],
         [0.0012],
         [0.0028],
         [0.0888],
         [0.0192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0766],
         [0.2120],
         [0.1146],
         [0.0369],
         [0.1455],
         [0.0311],
         [0.0182],
         [0.0324],
         [0.0172],
         [0.0441],
         [0.0614],
         [0.1676]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0112],
         [0.0241],
         [0.0089],
         [0.0403],
         [0.0212],
         [0.0070],
         [0.0073],
         [0.0023],
    


Evaluating:  26%|███████████████████████████████████████████████                                                                                                                                    | 526/2000 [00:24<01:09, 21.32it/s][A
Evaluating:  26%|███████████████████████████████████████████████▎                                                                                                                                   | 529/2000 [00:24<01:08, 21.37it/s][A

tensor([[[0.0474],
         [0.0346],
         [0.1724],
         [0.0431],
         [0.0358],
         [0.0210],
         [0.0197],
         [0.0179],
         [0.0098],
         [0.0135],
         [0.1394],
         [0.0909]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0080],
         [0.0115],
         [0.0323],
         [0.0035],
         [0.0114],
         [0.0048],
         [0.0134],
         [0.0213],
         [0.0047],
         [0.0036],
         [0.0037],
         [0.0154]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0482],
         [0.0357],
         [0.0099],
         [0.0084],
         [0.0139],
         [0.0293],
         [0.0138],
         [0.0020],
         [0.0006],
         [0.0037],
         [0.0224],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0234],
         [0.0067],
         [0.0226],
         [0.0138],
         [0.0201],
         [0.0036],
         [0.0087],
         [0.0087],
         [0.0042],
    


Evaluating:  27%|███████████████████████████████████████████████▌                                                                                                                                   | 532/2000 [00:24<01:09, 21.27it/s][A

tensor([[[0.0281],
         [0.0122],
         [0.0320],
         [0.0112],
         [0.0157],
         [0.0169],
         [0.0484],
         [0.0098],
         [0.0380],
         [0.0064],
         [0.0192],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0368],
         [0.0400],
         [0.0607],
         [0.0280],
         [0.0199],
         [0.0703],
         [0.0215],
         [0.0136],
         [0.0033],
         [0.0084],
         [0.0492],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0348],
         [0.0875],
         [0.0716],
         [0.0274],
         [0.0262],
         [0.0113],
         [0.0116],
         [0.0134],
         [0.0071],
         [0.0125],
         [0.0405],
         [0.0452]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0737],
         [0.0624],
         [0.0684],
         [0.0283],
         [0.0376],
         [0.0184],
         [0.0267],
         [0.0122],
         [0.0634],
    


Evaluating:  27%|███████████████████████████████████████████████▉                                                                                                                                   | 535/2000 [00:25<01:08, 21.24it/s][A

tensor([[[0.0570],
         [0.0452],
         [0.0874],
         [0.0271],
         [0.0330],
         [0.0334],
         [0.0369],
         [0.0303],
         [0.0332],
         [0.0221],
         [0.0338],
         [0.0540]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0684],
         [0.1215],
         [0.0131],
         [0.0123],
         [0.0114],
         [0.0173],
         [0.0126],
         [0.0068],
         [0.0014],
         [0.0048],
         [0.0135],
         [0.0290]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0360],
         [0.0250],
         [0.0905],
         [0.0426],
         [0.0764],
         [0.0051],
         [0.0190],
         [0.0076],
         [0.0018],
         [0.0161],
         [0.0955],
         [0.0547]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0061],
         [0.0298],
         [0.0517],
         [0.0125],
         [0.0148],
         [0.0113],
         [0.0060],
         [0.0035],
         [0.0033],
    


Evaluating:  27%|████████████████████████████████████████████████▏                                                                                                                                  | 538/2000 [00:25<01:08, 21.32it/s][A
Evaluating:  27%|████████████████████████████████████████████████▍                                                                                                                                  | 541/2000 [00:25<01:08, 21.37it/s][A

tensor([[[0.0160],
         [0.0363],
         [0.0101],
         [0.0086],
         [0.0183],
         [0.0076],
         [0.0451],
         [0.0097],
         [0.0152],
         [0.0067],
         [0.0075],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0257],
         [0.1029],
         [0.0314],
         [0.0097],
         [0.0033],
         [0.0039],
         [0.0087],
         [0.0163],
         [0.0017],
         [0.0060],
         [0.0045],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0223],
         [0.0233],
         [0.0057],
         [0.0018],
         [0.0213],
         [0.0255],
         [0.0062],
         [0.0019],
         [0.0040],
         [0.0376],
         [0.0307]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0258],
         [0.0642],
         [0.0993],
         [0.0382],
         [0.0615],
         [0.0089],
         [0.0241],
         [0.0106],
         [0.0205],
    


Evaluating:  27%|████████████████████████████████████████████████▋                                                                                                                                  | 544/2000 [00:25<01:08, 21.36it/s][A

tensor([[[0.0553],
         [0.0410],
         [0.0433],
         [0.0171],
         [0.0392],
         [0.0114],
         [0.0539],
         [0.0430],
         [0.0337],
         [0.0300],
         [0.0048],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0096],
         [0.0324],
         [0.1859],
         [0.0167],
         [0.0206],
         [0.0067],
         [0.0108],
         [0.0120],
         [0.0162],
         [0.0060],
         [0.0514],
         [0.1501]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0521],
         [0.0673],
         [0.0961],
         [0.0289],
         [0.0272],
         [0.0224],
         [0.0370],
         [0.0279],
         [0.0032],
         [0.0045],
         [0.0576],
         [0.0964]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0246],
         [0.0376],
         [0.0692],
         [0.0115],
         [0.0190],
         [0.0622],
         [0.0102],
         [0.0513],
         [0.0115],
    


Evaluating:  27%|████████████████████████████████████████████████▉                                                                                                                                  | 547/2000 [00:25<01:08, 21.30it/s][A

tensor([[[0.0233],
         [0.0177],
         [0.0971],
         [0.0119],
         [0.0189],
         [0.0056],
         [0.0169],
         [0.0161],
         [0.0017],
         [0.0022],
         [0.0095],
         [0.0116]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0188],
         [0.0363],
         [0.0024],
         [0.0069],
         [0.0093],
         [0.0095],
         [0.0059],
         [0.0066],
         [0.0029],
         [0.0096],
         [0.0111]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0189],
         [0.0346],
         [0.0609],
         [0.0072],
         [0.0432],
         [0.0282],
         [0.0153],
         [0.0238],
         [0.0154],
         [0.0068],
         [0.0410],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0643],
         [0.0480],
         [0.0691],
         [0.0282],
         [0.0421],
         [0.0439],
         [0.0360],
         [0.0179],
         [0.0134],
    


Evaluating:  28%|█████████████████████████████████████████████████▏                                                                                                                                 | 550/2000 [00:25<01:07, 21.36it/s][A
Evaluating:  28%|█████████████████████████████████████████████████▍                                                                                                                                 | 553/2000 [00:25<01:07, 21.44it/s][A

tensor([[[0.0303],
         [0.0254],
         [0.0144],
         [0.0141],
         [0.0059],
         [0.0073],
         [0.0661],
         [0.0023],
         [0.0029],
         [0.0050],
         [0.0120],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0282],
         [0.0413],
         [0.0401],
         [0.0277],
         [0.0214],
         [0.0086],
         [0.0196],
         [0.0254],
         [0.0028],
         [0.0019],
         [0.0499],
         [0.0402]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1362],
         [0.1082],
         [0.0430],
         [0.0128],
         [0.0387],
         [0.0574],
         [0.0526],
         [0.0506],
         [0.0150],
         [0.0209],
         [0.0285],
         [0.0237]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0234],
         [0.0160],
         [0.0231],
         [0.0125],
         [0.0142],
         [0.0038],
         [0.0071],
         [0.0082],
         [0.0038],
    


Evaluating:  28%|█████████████████████████████████████████████████▊                                                                                                                                 | 556/2000 [00:26<01:07, 21.46it/s][A

tensor([[[0.0191],
         [0.0278],
         [0.0319],
         [0.0054],
         [0.0052],
         [0.0026],
         [0.0194],
         [0.0010],
         [0.0030],
         [0.0061],
         [0.0194],
         [0.0510]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0225],
         [0.0212],
         [0.0158],
         [0.0044],
         [0.0101],
         [0.0053],
         [0.0015],
         [0.0018],
         [0.0017],
         [0.0187],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0463],
         [0.0258],
         [0.0290],
         [0.0123],
         [0.0302],
         [0.0154],
         [0.0212],
         [0.0317],
         [0.0146],
         [0.0111],
         [0.0120],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0546],
         [0.1172],
         [0.0303],
         [0.0169],
         [0.0455],
         [0.0266],
         [0.0240],
         [0.0031],
    


Evaluating:  28%|██████████████████████████████████████████████████                                                                                                                                 | 559/2000 [00:26<01:07, 21.37it/s][A

tensor([[[0.0688],
         [0.0258],
         [0.0491],
         [0.0120],
         [0.0161],
         [0.0042],
         [0.0118],
         [0.0034],
         [0.0029],
         [0.0048],
         [0.0579],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0068],
         [0.0212],
         [0.0324],
         [0.0053],
         [0.0046],
         [0.0020],
         [0.0024],
         [0.0038],
         [0.0004],
         [0.0015],
         [0.0141],
         [0.0053]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0151],
         [0.0150],
         [0.0937],
         [0.0063],
         [0.0402],
         [0.0148],
         [0.0131],
         [0.0093],
         [0.0222],
         [0.0204],
         [0.0239],
         [0.0505]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [


Evaluating:  28%|██████████████████████████████████████████████████▎                                                                                                                                | 562/2000 [00:26<01:06, 21.54it/s][A
Evaluating:  28%|██████████████████████████████████████████████████▌                                                                                                                                | 565/2000 [00:26<01:06, 21.48it/s][A


reg attention sum per layer
tensor([[[0.0217],
         [0.0459],
         [0.0234],
         [0.0293],
         [0.0111],
         [0.0054],
         [0.0212],
         [0.0044],
         [0.0128],
         [0.0067],
         [0.0404],
         [0.0221]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0388],
         [0.0422],
         [0.0498],
         [0.0154],
         [0.0630],
         [0.0374],
         [0.0143],
         [0.0127],
         [0.0066],
         [0.0101],
         [0.0223],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0640],
         [0.0386],
         [0.0131],
         [0.0160],
         [0.0157],
         [0.0309],
         [0.0055],
         [0.0107],
         [0.0070],
         [0.0183],
         [0.0337]]], device='cuda:0')
reg attention sum per layer



Evaluating:  28%|██████████████████████████████████████████████████▊                                                                                                                                | 568/2000 [00:26<01:06, 21.39it/s][A

tensor([[[0.0842],
         [0.0097],
         [0.0241],
         [0.0078],
         [0.0036],
         [0.0036],
         [0.0307],
         [0.0155],
         [0.0238],
         [0.0151],
         [0.0030],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0252],
         [0.0580],
         [0.0095],
         [0.0110],
         [0.0037],
         [0.0126],
         [0.0091],
         [0.0046],
         [0.0018],
         [0.0334],
         [0.0481]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0484],
         [0.0493],
         [0.0859],
         [0.0225],
         [0.0492],
         [0.0673],
         [0.0152],
         [0.0066],
         [0.0046],
         [0.0102],
         [0.0406],
         [0.0785]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0225],
         [0.0598],
         [0.0212],
         [0.0291],
         [0.0149],
         [0.0042],
         [0.0324],
         [0.0201],
    


Evaluating:  29%|███████████████████████████████████████████████████                                                                                                                                | 571/2000 [00:26<01:07, 21.10it/s][A

tensor([[[0.0076],
         [0.0111],
         [0.0084],
         [0.0038],
         [0.0064],
         [0.0225],
         [0.0021],
         [0.0013],
         [0.0006],
         [0.0043],
         [0.0108],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0416],
         [0.0498],
         [0.0483],
         [0.0099],
         [0.0087],
         [0.0116],
         [0.0096],
         [0.0126],
         [0.0104],
         [0.0069],
         [0.0142],
         [0.0398]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1887],
         [0.0645],
         [0.0180],
         [0.0275],
         [0.0201],
         [0.0124],
         [0.1120],
         [0.0227],
         [0.0129],
         [0.0372],
         [0.0041],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1396],
         [0.0991],
         [0.0364],
         [0.0209],
         [0.0177],
         [0.0086],
         [0.1246],
         [0.0784],
         [0.0227],
    


Evaluating:  29%|███████████████████████████████████████████████████▎                                                                                                                               | 574/2000 [00:26<01:07, 21.17it/s][A
Evaluating:  29%|███████████████████████████████████████████████████▋                                                                                                                               | 577/2000 [00:26<01:07, 21.22it/s][A

tensor([[[0.0272],
         [0.0141],
         [0.0515],
         [0.0083],
         [0.0462],
         [0.0083],
         [0.0039],
         [0.0057],
         [0.0026],
         [0.0118],
         [0.0206],
         [0.0305]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.1283],
         [0.1388],
         [0.0146],
         [0.0193],
         [0.0093],
         [0.0060],
         [0.0356],
         [0.0064],
         [0.0036],
         [0.0533],
         [0.1860]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0529],
         [0.0576],
         [0.0474],
         [0.0275],
         [0.0113],
         [0.0052],
         [0.0293],
         [0.0045],
         [0.0053],
         [0.0041],
         [0.0178],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0193],
         [0.0511],
         [0.0327],
         [0.0263],
         [0.0065],
         [0.0071],
         [0.0115],
         [0.0017],
    


Evaluating:  29%|███████████████████████████████████████████████████▉                                                                                                                               | 580/2000 [00:27<01:06, 21.34it/s][A

tensor([[[0.0071],
         [0.0147],
         [0.0317],
         [0.0038],
         [0.0120],
         [0.0020],
         [0.0099],
         [0.0033],
         [0.0010],
         [0.0018],
         [0.0135],
         [0.0144]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0350],
         [0.0336],
         [0.0107],
         [0.0130],
         [0.0168],
         [0.0246],
         [0.0126],
         [0.0039],
         [0.0045],
         [0.0131],
         [0.0245]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0064],
         [0.0096],
         [0.0143],
         [0.0057],
         [0.0273],
         [0.0038],
         [0.0022],
         [0.0002],
         [0.0006],
         [0.0007],
         [0.0277],
         [0.0061]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0324],
         [0.0189],
         [0.0901],
         [0.0390],
         [0.0298],
         [0.0159],
         [0.0078],
         [0.0218],
         [0.0064],
    


Evaluating:  29%|████████████████████████████████████████████████████▏                                                                                                                              | 583/2000 [00:27<01:06, 21.25it/s][A

tensor([[[0.1133],
         [0.0794],
         [0.1066],
         [0.0491],
         [0.0569],
         [0.0315],
         [0.0330],
         [0.0354],
         [0.0871],
         [0.0256],
         [0.0437],
         [0.1051]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0718],
         [0.0620],
         [0.0856],
         [0.0236],
         [0.0745],
         [0.0334],
         [0.0199],
         [0.0189],
         [0.0086],
         [0.0100],
         [0.0182],
         [0.0276]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.0501],
         [0.0237],
         [0.0042],
         [0.0099],
         [0.0133],
         [0.0091],
         [0.0227],
         [0.0023],
         [0.0026],
         [0.0357],
         [0.0101]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0531],
         [0.0786],
         [0.1412],
         [0.0145],
         [0.0139],
         [0.0072],
         [0.0185],
         [0.0539],
         [0.0140],
    


Evaluating:  29%|████████████████████████████████████████████████████▍                                                                                                                              | 586/2000 [00:27<01:06, 21.23it/s][A
Evaluating:  29%|████████████████████████████████████████████████████▋                                                                                                                              | 589/2000 [00:27<01:06, 21.35it/s][A

tensor([[[0.0683],
         [0.0099],
         [0.0157],
         [0.0055],
         [0.0080],
         [0.0114],
         [0.0104],
         [0.0134],
         [0.0029],
         [0.0052],
         [0.0023],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0220],
         [0.0584],
         [0.0384],
         [0.0095],
         [0.0237],
         [0.0063],
         [0.0081],
         [0.0407],
         [0.0071],
         [0.0024],
         [0.0145],
         [0.0219]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0874],
         [0.0577],
         [0.1510],
         [0.0403],
         [0.0609],
         [0.0110],
         [0.0237],
         [0.0144],
         [0.0075],
         [0.0141],
         [0.0812],
         [0.0781]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0182],
         [0.0419],
         [0.0094],
         [0.0160],
         [0.0175],
         [0.0030],
         [0.0103],
         [0.0010],
    


Evaluating:  30%|████████████████████████████████████████████████████▉                                                                                                                              | 592/2000 [00:27<01:05, 21.35it/s][A

tensor([[[0.0529],
         [0.0826],
         [0.0903],
         [0.0236],
         [0.0235],
         [0.0053],
         [0.0281],
         [0.0059],
         [0.0020],
         [0.0080],
         [0.0894],
         [0.0319]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0449],
         [0.0135],
         [0.1213],
         [0.0182],
         [0.0247],
         [0.0092],
         [0.0442],
         [0.0068],
         [0.0094],
         [0.0091],
         [0.0140],
         [0.1953]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0148],
         [0.0221],
         [0.0403],
         [0.0126],
         [0.0164],
         [0.0130],
         [0.0175],
         [0.0065],
         [0.0038],
         [0.0032],
         [0.1625],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0290],
         [0.1000],
         [0.0150],
         [0.0427],
         [0.0081],
         [0.0123],
         [0.0113],
         [0.0064],
    


Evaluating:  30%|█████████████████████████████████████████████████████▎                                                                                                                             | 595/2000 [00:27<01:05, 21.30it/s][A

tensor([[[0.1507],
         [0.0583],
         [0.0138],
         [0.0179],
         [0.0091],
         [0.0121],
         [0.1222],
         [0.0266],
         [0.0029],
         [0.0320],
         [0.0111],
         [0.0662]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0351],
         [0.0343],
         [0.1333],
         [0.0121],
         [0.0275],
         [0.0113],
         [0.0107],
         [0.0061],
         [0.0044],
         [0.0079],
         [0.0143],
         [0.1150]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1169],
         [0.0296],
         [0.0291],
         [0.0142],
         [0.1496],
         [0.0160],
         [0.0892],
         [0.0080],
         [0.0806],
         [0.0301],
         [0.0132],
         [0.0225]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0323],
         [0.0907],
         [0.0383],
         [0.0165],
         [0.0303],
         [0.0133],
         [0.0291],
         [0.0266],
    


Evaluating:  30%|█████████████████████████████████████████████████████▌                                                                                                                             | 598/2000 [00:27<01:05, 21.35it/s][A
Evaluating:  30%|█████████████████████████████████████████████████████▊                                                                                                                             | 601/2000 [00:28<01:05, 21.44it/s][A


tensor([[[0.1096],
         [0.0305],
         [0.0343],
         [0.0308],
         [0.0630],
         [0.0408],
         [0.0498],
         [0.0344],
         [0.0221],
         [0.0133],
         [0.0184],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0543],
         [0.0656],
         [0.0186],
         [0.0083],
         [0.0081],
         [0.0142],
         [0.0368],
         [0.0030],
         [0.0046],
         [0.0141],
         [0.0282]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0460],
         [0.0295],
         [0.0308],
         [0.0134],
         [0.0305],
         [0.0047],
         [0.0120],
         [0.0083],
         [0.0022],
         [0.0107],
         [0.0145],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0293],
         [0.0177],
         [0.0048],
         [0.0159],
         [0.0220],
         [0.0142],
         [0.0056],
         [0.0051],
   


Evaluating:  30%|██████████████████████████████████████████████████████                                                                                                                             | 604/2000 [00:28<01:05, 21.36it/s][A

tensor([[[0.0132],
         [0.0148],
         [0.0117],
         [0.0035],
         [0.0110],
         [0.0082],
         [0.0233],
         [0.0110],
         [0.0265],
         [0.0107],
         [0.0012],
         [0.0245]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0336],
         [0.0217],
         [0.0413],
         [0.0149],
         [0.0331],
         [0.0172],
         [0.0228],
         [0.0048],
         [0.0148],
         [0.0079],
         [0.0218],
         [0.0445]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0481],
         [0.0146],
         [0.0262],
         [0.0108],
         [0.0111],
         [0.0026],
         [0.0043],
         [0.0046],
         [0.0045],
         [0.0034],
         [0.0162],
         [0.0232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0607],
         [0.0866],
         [0.0767],
         [0.0464],
         [0.0253],
         [0.0234],
         [0.0273],
         [0.0110],
         [0.0038],
    


Evaluating:  30%|██████████████████████████████████████████████████████▎                                                                                                                            | 607/2000 [00:28<01:05, 21.31it/s][A

tensor([[[0.0829],
         [0.0180],
         [0.0124],
         [0.0134],
         [0.0128],
         [0.0151],
         [0.0257],
         [0.0010],
         [0.0023],
         [0.0055],
         [0.0254],
         [0.0145]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0251],
         [0.0265],
         [0.0180],
         [0.0069],
         [0.0060],
         [0.0121],
         [0.0148],
         [0.0040],
         [0.0012],
         [0.0053],
         [0.0468],
         [0.0720]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0325],
         [0.0152],
         [0.0291],
         [0.0085],
         [0.0169],
         [0.0227],
         [0.0093],
         [0.0034],
         [0.0057],
         [0.0062],
         [0.0058],
         [0.0075]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0142],
         [0.0419],
         [0.0132],
         [0.0186],
         [0.0259],
         [0.0194],
         [0.0018],
         [0.0010],
    


Evaluating:  30%|██████████████████████████████████████████████████████▌                                                                                                                            | 610/2000 [00:28<01:05, 21.36it/s][A
Evaluating:  31%|██████████████████████████████████████████████████████▊                                                                                                                            | 613/2000 [00:28<01:04, 21.40it/s][A

tensor([[[0.0371],
         [0.0454],
         [0.0692],
         [0.0082],
         [0.0121],
         [0.0078],
         [0.0153],
         [0.0361],
         [0.0030],
         [0.0085],
         [0.0370],
         [0.0311]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0340],
         [0.1098],
         [0.0275],
         [0.0175],
         [0.0117],
         [0.0163],
         [0.0138],
         [0.0037],
         [0.0046],
         [0.0409],
         [0.1075]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0308],
         [0.0334],
         [0.0341],
         [0.0180],
         [0.0137],
         [0.0094],
         [0.0148],
         [0.0162],
         [0.0073],
         [0.0086],
         [0.0110],
         [0.0408]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0388],
         [0.0591],
         [0.0156],
         [0.0245],
         [0.0419],
         [0.0185],
         [0.0491],
         [0.0060],
         [0.0043],
    


Evaluating:  31%|███████████████████████████████████████████████████████▏                                                                                                                           | 616/2000 [00:28<01:05, 21.29it/s][A

tensor([[[0.0108],
         [0.0230],
         [0.0165],
         [0.0061],
         [0.0138],
         [0.0095],
         [0.0291],
         [0.0033],
         [0.0114],
         [0.0184],
         [0.0051],
         [0.0077]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1103],
         [0.0999],
         [0.0347],
         [0.0167],
         [0.1061],
         [0.0232],
         [0.0168],
         [0.0354],
         [0.0061],
         [0.0377],
         [0.0142],
         [0.0470]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0341],
         [0.0454],
         [0.0630],
         [0.0172],
         [0.0343],
         [0.0136],
         [0.0155],
         [0.0043],
         [0.0053],
         [0.0073],
         [0.0259],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0279],
         [0.0209],
         [0.0080],
         [0.0251],
         [0.0042],
         [0.0163],
         [0.0018],
         [0.0027],
    


Evaluating:  31%|███████████████████████████████████████████████████████▍                                                                                                                           | 619/2000 [00:28<01:04, 21.26it/s][A

tensor([[[0.0082],
         [0.0323],
         [0.0615],
         [0.0069],
         [0.0242],
         [0.0072],
         [0.0074],
         [0.0035],
         [0.0024],
         [0.0052],
         [0.0474],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0435],
         [0.0264],
         [0.0289],
         [0.0088],
         [0.0203],
         [0.0107],
         [0.0377],
         [0.0523],
         [0.0658],
         [0.0243],
         [0.0118],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2241],
         [0.0360],
         [0.0294],
         [0.0541],
         [0.0504],
         [0.0073],
         [0.0554],
         [0.0322],
         [0.0242],
         [0.0808],
         [0.0059],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0370],
         [0.0564],
         [0.0886],
         [0.0138],
         [0.0168],
         [0.0049],
         [0.0368],
         [0.0059],
         [0.0107],
    


Evaluating:  31%|███████████████████████████████████████████████████████▋                                                                                                                           | 622/2000 [00:29<01:04, 21.28it/s][A
Evaluating:  31%|███████████████████████████████████████████████████████▉                                                                                                                           | 625/2000 [00:29<01:04, 21.34it/s][A

tensor([[[0.0104],
         [0.0048],
         [0.0076],
         [0.0153],
         [0.0032],
         [0.0065],
         [0.0112],
         [0.0023],
         [0.0004],
         [0.0012],
         [0.0414],
         [0.0174]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0236],
         [0.0185],
         [0.0141],
         [0.0095],
         [0.0111],
         [0.0042],
         [0.0339],
         [0.0184],
         [0.0314],
         [0.0123],
         [0.0016],
         [0.0251]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0522],
         [0.1531],
         [0.0539],
         [0.0210],
         [0.0314],
         [0.0110],
         [0.0150],
         [0.0111],
         [0.0044],
         [0.0091],
         [0.0582],
         [0.0550]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0356],
         [0.0361],
         [0.0318],
         [0.0169],
         [0.0198],
         [0.0084],
         [0.0171],
         [0.0076],
         [0.0032],
    


Evaluating:  31%|████████████████████████████████████████████████████████▏                                                                                                                          | 628/2000 [00:29<01:04, 21.43it/s][A

tensor([[[0.1829],
         [0.1173],
         [0.0281],
         [0.0257],
         [0.0371],
         [0.0062],
         [0.1339],
         [0.0712],
         [0.1452],
         [0.1362],
         [0.0039],
         [0.0252]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0480],
         [0.1092],
         [0.0618],
         [0.0252],
         [0.0497],
         [0.0227],
         [0.0418],
         [0.0231],
         [0.0276],
         [0.0240],
         [0.0148],
         [0.0937]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0303],
         [0.0374],
         [0.0107],
         [0.0203],
         [0.0121],
         [0.0094],
         [0.0064],
         [0.0032],
         [0.0086],
         [0.0227],
         [0.01


Evaluating:  32%|████████████████████████████████████████████████████████▍                                                                                                                          | 631/2000 [00:29<01:04, 21.36it/s][A

tensor([[[0.0987],
         [0.0796],
         [0.0618],
         [0.0092],
         [0.0223],
         [0.0177],
         [0.0146],
         [0.0031],
         [0.0034],
         [0.0075],
         [0.0134],
         [0.0162]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0649],
         [0.0223],
         [0.0575],
         [0.0184],
         [0.0390],
         [0.0338],
         [0.0121],
         [0.0150],
         [0.0020],
         [0.0135],
         [0.0165],
         [0.0277]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0737],
         [0.0647],
         [0.0257],
         [0.0212],
         [0.0306],
         [0.0655],
         [0.0362],
         [0.0149],
         [0.0271],
         [0.0299],
         [0.0146],
         [0.0348]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0537],
         [0.0778],
         [0.0479],
         [0.0159],
         [0.0132],
         [0.0153],
         [0.0195],
         [0.0077],
         [0.0045],
    


Evaluating:  32%|████████████████████████████████████████████████████████▋                                                                                                                          | 634/2000 [00:29<01:03, 21.35it/s][A
Evaluating:  32%|█████████████████████████████████████████████████████████                                                                                                                          | 637/2000 [00:29<01:04, 21.16it/s][A

tensor([[[0.0992],
         [0.0473],
         [0.0473],
         [0.0329],
         [0.0494],
         [0.0441],
         [0.0242],
         [0.0233],
         [0.0036],
         [0.0168],
         [0.0146],
         [0.0440]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0487],
         [0.0084],
         [0.0887],
         [0.0120],
         [0.0141],
         [0.0240],
         [0.0164],
         [0.0055],
         [0.0108],
         [0.0054],
         [0.0030],
         [0.0226]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.0211],
         [0.0451],
         [0.0096],
         [0.0255],
         [0.0061],
         [0.0292],
         [0.0050],
         [0.0437],
         [0.0148],
         [0.0037],
         [0.0226]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0389],
         [0.0692],
         [0.0156],
         [0.0091],
         [0.0103],
         [0.0203],
         [0.0516],
         [0.0062],
    


Evaluating:  32%|█████████████████████████████████████████████████████████▎                                                                                                                         | 640/2000 [00:29<01:03, 21.30it/s][A

tensor([[[0.0087],
         [0.0081],
         [0.0207],
         [0.0106],
         [0.0078],
         [0.0163],
         [0.0015],
         [0.0041],
         [0.0003],
         [0.0006],
         [0.0216],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0603],
         [0.0285],
         [0.0189],
         [0.0400],
         [0.0364],
         [0.0327],
         [0.0848],
         [0.0079],
         [0.0080],
         [0.0239],
         [0.0419],
         [0.0122]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0410],
         [0.0489],
         [0.1305],
         [0.0489],
         [0.0890],
         [0.0356],
         [0.0256],
         [0.0280],
         [0.0024],
         [0.0085],
         [0.0870],
         [0.2477]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0375],
         [0.2480],
         [0.0664],
         [0.0116],
         [0.0276],
         [0.0115],
         [0.0043],
         [0.0103],
         [0.0057],
    


Evaluating:  32%|█████████████████████████████████████████████████████████▌                                                                                                                         | 643/2000 [00:30<01:03, 21.27it/s][A

tensor([[[0.0253],
         [0.0216],
         [0.0268],
         [0.0040],
         [0.0084],
         [0.0048],
         [0.0064],
         [0.0155],
         [0.0061],
         [0.0043],
         [0.0066],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0529],
         [0.0385],
         [0.0328],
         [0.0192],
         [0.0190],
         [0.0077],
         [0.0176],
         [0.0109],
         [0.0102],
         [0.0283],
         [0.0094],
         [0.0239]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0135],
         [0.0303],
         [0.0383],
         [0.0109],
         [0.0129],
         [0.0150],
         [0.0189],
         [0.0337],
         [0.0037],
         [0.0060],
         [0.0500],
         [0.0131]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0689],
         [0.0209],
         [0.0234],
         [0.0239],
         [0.0176],
         [0.0086],
         [0.0245],
         [0.0028],
    


Evaluating:  32%|█████████████████████████████████████████████████████████▊                                                                                                                         | 646/2000 [00:30<01:03, 21.29it/s][A
Evaluating:  32%|██████████████████████████████████████████████████████████                                                                                                                         | 649/2000 [00:30<01:03, 21.35it/s][A

tensor([[[0.0663],
         [0.0145],
         [0.0110],
         [0.0051],
         [0.0057],
         [0.0021],
         [0.1022],
         [0.0047],
         [0.0050],
         [0.0077],
         [0.0036],
         [0.0050]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0184],
         [0.0509],
         [0.0330],
         [0.0058],
         [0.0132],
         [0.0133],
         [0.0100],
         [0.0113],
         [0.0021],
         [0.0050],
         [0.0385],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0078],
         [0.0194],
         [0.0270],
         [0.0134],
         [0.0057],
         [0.0049],
         [0.0077],
         [0.0030],
         [0.0012],
         [0.0011],
         [0.0111],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0068],
         [0.0309],
         [0.0625],
         [0.0171],
         [0.0185],
         [0.0118],
         [0.0398],
         [0.0018],
         [0.0096],
    


Evaluating:  33%|██████████████████████████████████████████████████████████▎                                                                                                                        | 652/2000 [00:30<01:02, 21.43it/s][A

tensor([[[0.0389],
         [0.0394],
         [0.0991],
         [0.0391],
         [0.0114],
         [0.0031],
         [0.0259],
         [0.0098],
         [0.0034],
         [0.0050],
         [0.0399],
         [0.0446]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0513],
         [0.0365],
         [0.0883],
         [0.0185],
         [0.0303],
         [0.0048],
         [0.0151],
         [0.0044],
         [0.0058],
         [0.0080],
         [0.0434],
         [0.0540]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0210],
         [0.0159],
         [0.0097],
         [0.0098],
         [0.0137],
         [0.0026],
         [0.0353],
         [0.0016],
         [0.0048],
         [0.0110],
         [0.0082]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0635],
         [0.0202],
         [0.0227],
         [0.0116],
         [0.0270],
         [0.0284],
         [0.0478],
         [0.0213],
         [0.0522],
    


Evaluating:  33%|██████████████████████████████████████████████████████████▌                                                                                                                        | 655/2000 [00:30<01:02, 21.36it/s][A

tensor([[[0.0287],
         [0.0753],
         [0.0401],
         [0.0162],
         [0.0238],
         [0.0153],
         [0.0243],
         [0.0265],
         [0.0035],
         [0.0212],
         [0.0105],
         [0.0273]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0314],
         [0.0305],
         [0.0702],
         [0.0121],
         [0.0151],
         [0.0059],
         [0.0192],
         [0.0313],
         [0.0082],
         [0.0114],
         [0.0171],
         [0.0437]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0592],
         [0.0626],
         [0.0493],
         [0.0325],
         [0.0403],
         [0.0093],
         [0.0104],
         [0.0214],
         [0.0026],
         [0.0084],
         [0.0564],
         [0.0460]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1374],
         [0.0287],
         [0.0127],
         [0.0144],
         [0.0046],
         [0.0235],
         [0.0288],
         [0.0548],
         [0.0064],
    


Evaluating:  33%|██████████████████████████████████████████████████████████▉                                                                                                                        | 658/2000 [00:30<01:02, 21.35it/s][A
Evaluating:  33%|███████████████████████████████████████████████████████████▏                                                                                                                       | 661/2000 [00:30<01:02, 21.44it/s][A

tensor([[[0.0607],
         [0.0924],
         [0.0208],
         [0.0152],
         [0.0261],
         [0.0160],
         [0.0172],
         [0.0115],
         [0.0036],
         [0.0132],
         [0.0205],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0123],
         [0.0206],
         [0.0255],
         [0.0360],
         [0.0329],
         [0.0344],
         [0.0357],
         [0.0019],
         [0.0021],
         [0.0023],
         [0.3442],
         [0.1166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0154],
         [0.0594],
         [0.0265],
         [0.0240],
         [0.0192],
         [0.0143],
         [0.0058],
         [0.0101],
         [0.0053],
         [0.0561],
         [0.0326]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0411],
         [0.0494],
         [0.0086],
         [0.0161],
         [0.0348],
         [0.0550],
         [0.0291],
         [0.0154],
         [0.0037],
    


Evaluating:  33%|███████████████████████████████████████████████████████████▍                                                                                                                       | 664/2000 [00:31<01:02, 21.31it/s][A

tensor([[[0.0540],
         [0.0577],
         [0.0775],
         [0.0258],
         [0.0232],
         [0.0179],
         [0.0205],
         [0.0245],
         [0.0090],
         [0.0085],
         [0.0169],
         [0.0513]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0443],
         [0.0276],
         [0.0757],
         [0.0215],
         [0.0550],
         [0.0137],
         [0.0197],
         [0.0346],
         [0.0078],
         [0.0128],
         [0.0240],
         [0.0470]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0285],
         [0.0131],
         [0.0342],
         [0.0068],
         [0.0091],
         [0.0096],
         [0.0358],
         [0.0074],
         [0.0076],
         [0.0048],
         [0.0088],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0171],
         [0.0475],
         [0.0393],
         [0.0143],
         [0.0557],
         [0.0071],
         [0.0225],
         [0.0029],
         [0.0066],
    


Evaluating:  33%|███████████████████████████████████████████████████████████▋                                                                                                                       | 667/2000 [00:31<01:02, 21.28it/s][A

tensor([[[0.0281],
         [0.1048],
         [0.0535],
         [0.0140],
         [0.0261],
         [0.0132],
         [0.0120],
         [0.0197],
         [0.0082],
         [0.0058],
         [0.0191],
         [0.0225]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0290],
         [0.0173],
         [0.0349],
         [0.0090],
         [0.0062],
         [0.0021],
         [0.0346],
         [0.0066],
         [0.0022],
         [0.0054],
         [0.0050],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0656],
         [0.0470],
         [0.0267],
         [0.0118],
         [0.0186],
         [0.0040],
         [0.0309],
         [0.0078],
         [0.0048],
         [0.0082],
         [0.0051],
         [0.0157]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0044],
         [0.0307],
         [0.0147],
         [0.0229],
         [0.0067],
         [0.0032],
         [0.0092],
         [0.0049],
         [0.0111],
    


Evaluating:  34%|███████████████████████████████████████████████████████████▉                                                                                                                       | 670/2000 [00:31<01:02, 21.34it/s][A
Evaluating:  34%|████████████████████████████████████████████████████████████▏                                                                                                                      | 673/2000 [00:31<01:02, 21.34it/s][A

tensor([[[0.0602],
         [0.0347],
         [0.0734],
         [0.0162],
         [0.0222],
         [0.0245],
         [0.0401],
         [0.0094],
         [0.0034],
         [0.0172],
         [0.0201],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0152],
         [0.0212],
         [0.0226],
         [0.0139],
         [0.0061],
         [0.0248],
         [0.0113],
         [0.0106],
         [0.0118],
         [0.0154],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0423],
         [0.0178],
         [0.0417],
         [0.0146],
         [0.0213],
         [0.0137],
         [0.0202],
         [0.0199],
         [0.0081],
         [0.0101],
         [0.0352],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1089],
         [0.0661],
         [0.0810],
         [0.0267],
         [0.0461],
         [0.0100],
         [0.0507],
         [0.0748],
         [0.0214],
    


Evaluating:  34%|████████████████████████████████████████████████████████████▌                                                                                                                      | 676/2000 [00:31<01:02, 21.16it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0166],
         [0.0247],
         [0.0223],
         [0.0127],
         [0.0391],
         [0.0101],
         [0.0132],
         [0.0159],
         [0.0132],
         [0.0126],
         [0.0062],
         [0.0084]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0297],
         [0.0598],
         [0.1975],
         [0.0491],
         [0.0817],
         [0.0387],
         [0.0211],
         [0.0619],
         [0.0447],
         [0.0118],
         [0.1257],
         [0.0628]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0154],
         [0.0241],
         [0.0112],
         [0.0107],
         [0.0123],
         [0.0195],
         [0.0025],
         [0.0008],
         [0.0033],
         [0.1031],
         [0.02


Evaluating:  34%|████████████████████████████████████████████████████████████▊                                                                                                                      | 679/2000 [00:31<01:02, 21.16it/s][A

tensor([[[0.0260],
         [0.0288],
         [0.0424],
         [0.0196],
         [0.0139],
         [0.0154],
         [0.0159],
         [0.0025],
         [0.0033],
         [0.0028],
         [0.0404],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0550],
         [0.0501],
         [0.0474],
         [0.0180],
         [0.0279],
         [0.0672],
         [0.0191],
         [0.0012],
         [0.0040],
         [0.0087],
         [0.0208],
         [0.0213]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0246],
         [0.0497],
         [0.0162],
         [0.0153],
         [0.0495],
         [0.0190],
         [0.0168],
         [0.0056],
         [0.0041],
         [0.0806],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0601],
         [0.0570],
         [0.0679],
         [0.0134],
         [0.0438],
         [0.0266],
         [0.0062],
         [0.0589],
         [0.0048],
    


Evaluating:  34%|█████████████████████████████████████████████████████████████                                                                                                                      | 682/2000 [00:31<01:01, 21.26it/s][A
Evaluating:  34%|█████████████████████████████████████████████████████████████▎                                                                                                                     | 685/2000 [00:32<01:01, 21.28it/s][A

tensor([[[0.0556],
         [0.0344],
         [0.1302],
         [0.0081],
         [0.0166],
         [0.0112],
         [0.0190],
         [0.0011],
         [0.0017],
         [0.0026],
         [0.0528],
         [0.0987]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0377],
         [0.0297],
         [0.0306],
         [0.0081],
         [0.0380],
         [0.0260],
         [0.0189],
         [0.0023],
         [0.0023],
         [0.0093],
         [0.0672],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0297],
         [0.0374],
         [0.0806],
         [0.0180],
         [0.1083],
         [0.0153],
         [0.0064],
         [0.0179],
         [0.0098],
         [0.0062],
         [0.0208],
         [0.0349]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0217],
         [0.0611],
         [0.0772],
         [0.0110],
         [0.0530],
         [0.0218],
         [0.0151],
         [0.0245],
         [0.0065],
    


Evaluating:  34%|█████████████████████████████████████████████████████████████▌                                                                                                                     | 688/2000 [00:32<01:01, 21.30it/s][A

tensor([[[0.0147],
         [0.0218],
         [0.0581],
         [0.0200],
         [0.0494],
         [0.0375],
         [0.0181],
         [0.0167],
         [0.0244],
         [0.0080],
         [0.1281],
         [0.0398]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0149],
         [0.0283],
         [0.0484],
         [0.0146],
         [0.0288],
         [0.0103],
         [0.0163],
         [0.0030],
         [0.0105],
         [0.0083],
         [0.0745],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0349],
         [0.0625],
         [0.1767],
         [0.0218],
         [0.0160],
         [0.0136],
         [0.0454],
         [0.0119],
         [0.0180],
         [0.0108],
         [0.0165],
         [0.0547]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0594],
         [0.0410],
         [0.0839],
         [0.0207],
         [0.0629],
         [0.0232],
         [0.0136],
         [0.0387],
         [0.0051],
    


Evaluating:  35%|█████████████████████████████████████████████████████████████▊                                                                                                                     | 691/2000 [00:32<01:01, 21.35it/s][A

tensor([[[0.0335],
         [0.0239],
         [0.0524],
         [0.0080],
         [0.0104],
         [0.0099],
         [0.0177],
         [0.0055],
         [0.0043],
         [0.0047],
         [0.0363],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0099],
         [0.0187],
         [0.0238],
         [0.0061],
         [0.0054],
         [0.0100],
         [0.0268],
         [0.0013],
         [0.0043],
         [0.0030],
         [0.0257],
         [0.0052]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0120],
         [0.0724],
         [0.0113],
         [0.0069],
         [0.0114],
         [0.0148],
         [0.0056],
         [0.0053],
         [0.0041],
         [0.0144],
         [0.0509]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0075],
         [0.0095],
         [0.0024],
         [0.0029],
         [0.0020],
         [0.0176],
         [0.0006],
         [0.0030],
    


Evaluating:  35%|██████████████████████████████████████████████████████████████                                                                                                                     | 694/2000 [00:32<01:01, 21.26it/s][A
Evaluating:  35%|██████████████████████████████████████████████████████████████▍                                                                                                                    | 697/2000 [00:32<01:01, 21.23it/s][A

tensor([[[0.1369],
         [0.0159],
         [0.0291],
         [0.0302],
         [0.0541],
         [0.0141],
         [0.0210],
         [0.0420],
         [0.0079],
         [0.0306],
         [0.0301],
         [0.0210]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2772],
         [0.1507],
         [0.0965],
         [0.0556],
         [0.1092],
         [0.0153],
         [0.0584],
         [0.0177],
         [0.1069],
         [0.0928],
         [0.0267],
         [0.0841]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0933],
         [0.1254],
         [0.0817],
         [0.0192],
         [0.0347],
         [0.0186],
         [0.0181],
         [0.0913],
         [0.0141],
         [0.0245],
         [0.0415],
         [0.0307]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0095],
         [0.0410],
         [0.0288],
         [0.0090],
         [0.0227],
         [0.0252],
         [0.0037],
         [0.0037],
         [0.0015],
    


Evaluating:  35%|██████████████████████████████████████████████████████████████▋                                                                                                                    | 700/2000 [00:32<01:01, 21.29it/s][A

tensor([[[0.0445],
         [0.0452],
         [0.0442],
         [0.0443],
         [0.0299],
         [0.0087],
         [0.0340],
         [0.0058],
         [0.0050],
         [0.0174],
         [0.0375],
         [0.0229]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1099],
         [0.0712],
         [0.0347],
         [0.0396],
         [0.0289],
         [0.0293],
         [0.0315],
         [0.0080],
         [0.0082],
         [0.0444],
         [0.1568],
         [0.0463]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0277],
         [0.0288],
         [0.0715],
         [0.0138],
         [0.0241],
         [0.0094],
         [0.0123],
         [0.0042],
         [0.0058],
         [0.0046],
         [0.0128],
         [0.0224]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0314],
         [0.0542],
         [0.0113],
         [0.0228],
         [0.0054],
         [0.0110],
         [0.0369],
         [0.0043],
         [0.0023],
    


Evaluating:  35%|██████████████████████████████████████████████████████████████▉                                                                                                                    | 703/2000 [00:32<01:01, 21.21it/s][A



reg attention sum per layer
tensor([[[0.0099],
         [0.0119],
         [0.0238],
         [0.0066],
         [0.0188],
         [0.0116],
         [0.0155],
         [0.0026],
         [0.0048],
         [0.0039],
         [0.0161],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0290],
         [0.0116],
         [0.0053],
         [0.0046],
         [0.0069],
         [0.0055],
         [0.0014],
         [0.0005],
         [0.0018],
         [0.0477],
         [0.0104]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0145],
         [0.0707],
         [0.0047],
         [0.0293],
         [0.0078],
         [0.0257],
         [0.0029],
         [0.0034],
         [0.0077],
         [0.0256],
         [0.0762]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0339],
         [0.0592],
         [0.1126],
         [0.0351],
         [0.0466],
         [0.0130],
         [0.0173],
         [0.0

Evaluating:  35%|███████████████████████████████████████████████████████████████▏                                                                                                                   | 706/2000 [00:33<01:00, 21.27it/s][A
Evaluating:  35%|███████████████████████████████████████████████████████████████▍                                                                                                                   | 709/2000 [00:33<01:00, 21.24it/s][A

reg attention sum per layer
tensor([[[0.2500],
         [0.1728],
         [0.0410],
         [0.0437],
         [0.0130],
         [0.0100],
         [0.0474],
         [0.0456],
         [0.0058],
         [0.0092],
         [0.0288],
         [0.0622]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0542],
         [0.0499],
         [0.0787],
         [0.0353],
         [0.0440],
         [0.0138],
         [0.0135],
         [0.0122],
         [0.0062],
         [0.0184],
         [0.1114],
         [0.0512]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0290],
         [0.0285],
         [0.0880],
         [0.0218],
         [0.0315],
         [0.0042],
         [0.0100],
         [0.0186],
         [0.0114],
         [0.0084],
         [0.0192],
         [0.1003]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0161],
         [0.0182],
         [0.0203],
         [0.0059],
         [0.0350],
         [0.0192],
         [0.0143],
         [0.01


Evaluating:  36%|███████████████████████████████████████████████████████████████▋                                                                                                                   | 712/2000 [00:33<01:00, 21.18it/s][A


reg attention sum per layer
tensor([[[0.1363],
         [0.0947],
         [0.0509],
         [0.0387],
         [0.0436],
         [0.0493],
         [0.0532],
         [0.0093],
         [0.0030],
         [0.0094],
         [0.0382],
         [0.0585]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0963],
         [0.0472],
         [0.0262],
         [0.0173],
         [0.0258],
         [0.0104],
         [0.0611],
         [0.0353],
         [0.0265],
         [0.0281],
         [0.0098],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0163],
         [0.0652],
         [0.0061],
         [0.0203],
         [0.0258],
         [0.0086],
         [0.0096],
         [0.0068],
         [0.0034],
         [0.0507],
         [0.0781]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0407],
         [0.0538],
         [0.0096],
         [0.0196],
         [0.0039],
         [0.0133],
         [0.0


Evaluating:  36%|███████████████████████████████████████████████████████████████▉                                                                                                                   | 715/2000 [00:33<01:00, 21.27it/s][A
Evaluating:  36%|████████████████████████████████████████████████████████████████▎                                                                                                                  | 718/2000 [00:33<00:59, 21.38it/s][A


reg attention sum per layer
tensor([[[0.0038],
         [0.0231],
         [0.0430],
         [0.0101],
         [0.0181],
         [0.0058],
         [0.0075],
         [0.0110],
         [0.0067],
         [0.0030],
         [0.0184],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1021],
         [0.0416],
         [0.0487],
         [0.0286],
         [0.0327],
         [0.0398],
         [0.0503],
         [0.0483],
         [0.0093],
         [0.0265],
         [0.0274],
         [0.0626]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.1262],
         [0.1126],
         [0.0259],
         [0.0343],
         [0.0199],
         [0.0169],
         [0.0133],
         [0.0062],
         [0.0065],
         [0.0615],
         [0.0632]]], device='cuda:0')
reg attention sum per layer



Evaluating:  36%|████████████████████████████████████████████████████████████████▌                                                                                                                  | 721/2000 [00:33<01:00, 21.28it/s][A

tensor([[[0.0570],
         [0.0441],
         [0.0207],
         [0.0205],
         [0.0157],
         [0.0115],
         [0.0170],
         [0.0071],
         [0.0036],
         [0.0152],
         [0.0199],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0429],
         [0.0273],
         [0.0431],
         [0.0253],
         [0.0250],
         [0.0156],
         [0.0174],
         [0.0106],
         [0.0058],
         [0.0161],
         [0.0468],
         [0.0350]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0448],
         [0.0296],
         [0.0224],
         [0.0175],
         [0.0065],
         [0.0121],
         [0.0840],
         [0.0184],
         [0.0110],
         [0.0123],
         [0.0046],
         [0.0200]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [


Evaluating:  36%|████████████████████████████████████████████████████████████████▊                                                                                                                  | 724/2000 [00:33<00:59, 21.29it/s][A



reg attention sum per layer
tensor([[[0.0759],
         [0.0321],
         [0.0850],
         [0.0618],
         [0.0401],
         [0.0131],
         [0.0247],
         [0.0314],
         [0.0028],
         [0.0128],
         [0.0712],
         [0.1211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0691],
         [0.0931],
         [0.0435],
         [0.0228],
         [0.0421],
         [0.0353],
         [0.0255],
         [0.0184],
         [0.0043],
         [0.0080],
         [0.0243],
         [0.0306]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0485],
         [0.0368],
         [0.0750],
         [0.0143],
         [0.0657],
         [0.0135],
         [0.0236],
         [0.0491],
         [0.0469],
         [0.0123],
         [0.0149],
         [0.1019]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0168],
         [0.0601],
         [0.0202],
         [0.0419],
         [0.0082],
         [0.0182],
         [0.0

Evaluating:  36%|█████████████████████████████████████████████████████████████████                                                                                                                  | 727/2000 [00:34<00:59, 21.40it/s][A
Evaluating:  36%|█████████████████████████████████████████████████████████████████▎                                                                                                                 | 730/2000 [00:34<00:59, 21.38it/s][A

reg attention sum per layer
tensor([[[0.0238],
         [0.1679],
         [0.2184],
         [0.0458],
         [0.0231],
         [0.0188],
         [0.0549],
         [0.0057],
         [0.0068],
         [0.0079],
         [0.5102],
         [0.0739]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0266],
         [0.0221],
         [0.0284],
         [0.0137],
         [0.0184],
         [0.0429],
         [0.0121],
         [0.0085],
         [0.0041],
         [0.0093],
         [0.0156],
         [0.0047]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0328],
         [0.0397],
         [0.0732],
         [0.0620],
         [0.0258],
         [0.0233],
         [0.0331],
         [0.0357],
         [0.0077],
         [0.0135],
         [0.0444],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0146],
         [0.0562],
         [0.0328],
         [0.0021],
         [0.0063],
         [0.0245],
         [0.0196],
         [0.01


Evaluating:  37%|█████████████████████████████████████████████████████████████████▌                                                                                                                 | 733/2000 [00:34<00:59, 21.41it/s][A
Evaluating:  37%|█████████████████████████████████████████████████████████████████▊                                                                                                                 | 736/2000 [00:34<00:58, 21.48it/s][A

reg attention sum per layer
tensor([[[0.0306],
         [0.0142],
         [0.0215],
         [0.0215],
         [0.0146],
         [0.0084],
         [0.0223],
         [0.0043],
         [0.0051],
         [0.0063],
         [0.0284],
         [0.0335]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0138],
         [0.0162],
         [0.0096],
         [0.0117],
         [0.0208],
         [0.0085],
         [0.0018],
         [0.0019],
         [0.0052],
         [0.0374],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0152],
         [0.0466],
         [0.0126],
         [0.0324],
         [0.0093],
         [0.0115],
         [0.0034],
         [0.0024],
         [0.0030],
         [0.0259],
         [0.0442]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0815],
         [0.0300],
         [0.0371],
         [0.0127],
         [0.0273],
         [0.0237],
         [0.0301],
         [0.01


Evaluating:  37%|██████████████████████████████████████████████████████████████████▏                                                                                                                | 739/2000 [00:34<00:58, 21.39it/s][A

tensor([[[0.0672],
         [0.0513],
         [0.0333],
         [0.0222],
         [0.0624],
         [0.0106],
         [0.0379],
         [0.0124],
         [0.0086],
         [0.0197],
         [0.0256],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0550],
         [0.0428],
         [0.1064],
         [0.0140],
         [0.0314],
         [0.0035],
         [0.0264],
         [0.0134],
         [0.0271],
         [0.0276],
         [0.0097],
         [0.0327]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0144],
         [0.0644],
         [0.0275],
         [0.0192],
         [0.0066],
         [0.0068],
         [0.0059],
         [0.0009],
         [0.0023],
         [0.0239],
         [0.0310]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0223],
         [0.0383],
         [0.0075],
         [0.0312],
         [0.0213],
         [0.0063],
         [0.0247],
         [0.0019],
    


Evaluating:  37%|██████████████████████████████████████████████████████████████████▍                                                                                                                | 742/2000 [00:34<00:58, 21.37it/s][A
Evaluating:  37%|██████████████████████████████████████████████████████████████████▋                                                                                                                | 745/2000 [00:34<00:58, 21.50it/s][A

reg attention sum per layer
tensor([[[0.0240],
         [0.0247],
         [0.0961],
         [0.0331],
         [0.0293],
         [0.0154],
         [0.0048],
         [0.0088],
         [0.0003],
         [0.0026],
         [0.1330],
         [0.0451]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0133],
         [0.0221],
         [0.0240],
         [0.0075],
         [0.0159],
         [0.0207],
         [0.0182],
         [0.0097],
         [0.0099],
         [0.0080],
         [0.0388],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0243],
         [0.0257],
         [0.0463],
         [0.0067],
         [0.0075],
         [0.0121],
         [0.0554],
         [0.0042],
         [0.0129],
         [0.0052],
         [0.0136],
         [0.0310]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0873],
         [0.1156],
         [0.1362],
         [0.0247],
         [0.0489],
         [0.0203],
         [0.0648],
         [0.01


Evaluating:  37%|██████████████████████████████████████████████████████████████████▉                                                                                                                | 748/2000 [00:35<00:58, 21.40it/s][A

tensor([[[0.0498],
         [0.0341],
         [0.0495],
         [0.0089],
         [0.0487],
         [0.0107],
         [0.0634],
         [0.0045],
         [0.0638],
         [0.0201],
         [0.0121],
         [0.0289]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0125],
         [0.0060],
         [0.0090],
         [0.0041],
         [0.0098],
         [0.0044],
         [0.0066],
         [0.0273],
         [0.0034],
         [0.0075],
         [0.0079],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1199],
         [0.0983],
         [0.0675],
         [0.0527],
         [0.0413],
         [0.0265],
         [0.0978],
         [0.0203],
         [0.0448],
         [0.0255],
         [0.0231],
         [0.0786]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0287],
         [0.0485],
         [0.0213],
         [0.0519],
         [0.0084],
         [0.0121],
         [0.0065],
         [0.0054],
    


Evaluating:  38%|███████████████████████████████████████████████████████████████████▏                                                                                                               | 751/2000 [00:35<00:58, 21.34it/s][A

tensor([[[0.0150],
         [0.0605],
         [0.0740],
         [0.0629],
         [0.0175],
         [0.0029],
         [0.0091],
         [0.0061],
         [0.0049],
         [0.0085],
         [0.0446],
         [0.0313]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0074],
         [0.0049],
         [0.0451],
         [0.0135],
         [0.0145],
         [0.0169],
         [0.0030],
         [0.0035],
         [0.0022],
         [0.0016],
         [0.0192],
         [0.0403]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0572],
         [0.0522],
         [0.0462],
         [0.0382],
         [0.0311],
         [0.0562],
         [0.0169],
         [0.0156],
         [0.0444],
         [0.0125],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0263],
         [0.0054],
         [0.0101],
         [0.0085],
         [0.0103],
         [0.0084],
         [0.0100],
         [0.0048],
         [0.0067],
    


Evaluating:  38%|███████████████████████████████████████████████████████████████████▍                                                                                                               | 754/2000 [00:35<00:58, 21.38it/s][A
Evaluating:  38%|███████████████████████████████████████████████████████████████████▊                                                                                                               | 757/2000 [00:35<00:58, 21.41it/s][A

tensor([[[0.0251],
         [0.1048],
         [0.1192],
         [0.0204],
         [0.0384],
         [0.0206],
         [0.0312],
         [0.0266],
         [0.0048],
         [0.0124],
         [0.0376],
         [0.0404]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0476],
         [0.0464],
         [0.0824],
         [0.0129],
         [0.0179],
         [0.0133],
         [0.0146],
         [0.0152],
         [0.0107],
         [0.0149],
         [0.0113],
         [0.0539]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0046],
         [0.0023],
         [0.0109],
         [0.0013],
         [0.0067],
         [0.0034],
         [0.0032],
         [0.0003],
         [0.0005],
         [0.0034],
         [0.0086],
         [0.0011]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0098],
         [0.0331],
         [0.1239],
         [0.0112],
         [0.0170],
         [0.0052],
         [0.0088],
         [0.0009],
         [0.0009],
    


Evaluating:  38%|████████████████████████████████████████████████████████████████████                                                                                                               | 760/2000 [00:35<00:57, 21.39it/s][A

tensor([[[0.0175],
         [0.0354],
         [0.0730],
         [0.0310],
         [0.0393],
         [0.0114],
         [0.0127],
         [0.0137],
         [0.0032],
         [0.0047],
         [0.0443],
         [0.0822]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0542],
         [0.1273],
         [0.0198],
         [0.0336],
         [0.0101],
         [0.0051],
         [0.0129],
         [0.0028],
         [0.0073],
         [0.0195],
         [0.0384]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0244],
         [0.0528],
         [0.0104],
         [0.0340],
         [0.0063],
         [0.0095],
         [0.0351],
         [0.0060],
         [0.0084],
         [0.0304],
         [0.0228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0279],
         [0.0589],
         [0.0192],
         [0.0307],
         [0.0132],
         [0.0372],
         [0.0334],
         [0.0757],
    


Evaluating:  38%|████████████████████████████████████████████████████████████████████▎                                                                                                              | 763/2000 [00:35<00:58, 21.33it/s][A

tensor([[[0.0288],
         [0.0221],
         [0.0202],
         [0.0051],
         [0.0088],
         [0.0038],
         [0.0160],
         [0.0113],
         [0.0045],
         [0.0038],
         [0.0057],
         [0.0080]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0426],
         [0.0597],
         [0.0237],
         [0.0124],
         [0.0125],
         [0.0189],
         [0.0277],
         [0.0018],
         [0.0032],
         [0.0093],
         [0.0249],
         [0.0394]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0565],
         [0.0240],
         [0.0227],
         [0.0213],
         [0.0141],
         [0.0341],
         [0.0485],
         [0.0239],
         [0.0042],
         [0.0104],
         [0.0057],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0110],
         [0.0207],
         [0.0529],
         [0.0203],
         [0.0097],
         [0.0171],
         [0.0050],
         [0.0073],
         [0.0006],
    


Evaluating:  38%|████████████████████████████████████████████████████████████████████▌                                                                                                              | 766/2000 [00:35<00:57, 21.42it/s][A
Evaluating:  38%|████████████████████████████████████████████████████████████████████▊                                                                                                              | 769/2000 [00:35<00:57, 21.35it/s][A

tensor([[[0.0530],
         [0.0290],
         [0.1214],
         [0.0082],
         [0.0487],
         [0.0252],
         [0.0326],
         [0.0141],
         [0.0080],
         [0.0148],
         [0.0514],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0521],
         [0.0188],
         [0.0539],
         [0.0235],
         [0.0231],
         [0.0128],
         [0.0075],
         [0.0281],
         [0.0148],
         [0.0120],
         [0.0267],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0327],
         [0.0394],
         [0.1171],
         [0.0170],
         [0.0147],
         [0.0510],
         [0.0374],
         [0.0106],
         [0.0056],
         [0.0273],
         [0.3114],
         [0.0600]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0268],
         [0.0640],
         [0.0057],
         [0.0033],
         [0.0067],
         [0.0088],
         [0.0210],
         [0.0031],
    


Evaluating:  39%|█████████████████████████████████████████████████████████████████████                                                                                                              | 772/2000 [00:36<00:57, 21.21it/s][A

tensor([[[0.0242],
         [0.0113],
         [0.0243],
         [0.0033],
         [0.0356],
         [0.0142],
         [0.0245],
         [0.0114],
         [0.0172],
         [0.0080],
         [0.0025],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0230],
         [0.0476],
         [0.0594],
         [0.0103],
         [0.0081],
         [0.0142],
         [0.0101],
         [0.0161],
         [0.0034],
         [0.0075],
         [0.0191],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0198],
         [0.0134],
         [0.0448],
         [0.0070],
         [0.0108],
         [0.0034],
         [0.0328],
         [0.0144],
         [0.0127],
         [0.0067],
         [0.0041],
         [0.0144]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0538],
         [0.0745],
         [0.0098],
         [0.0227],
         [0.0092],
         [0.0100],
         [0.0304],
         [0.0338],
         [0.0056],
    


Evaluating:  39%|█████████████████████████████████████████████████████████████████████▎                                                                                                             | 775/2000 [00:36<00:57, 21.20it/s][A

tensor([[[0.0061],
         [0.0220],
         [0.0833],
         [0.0261],
         [0.0186],
         [0.0130],
         [0.0087],
         [0.0109],
         [0.0100],
         [0.0039],
         [0.0142],
         [0.0301]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0172],
         [0.0149],
         [0.0449],
         [0.0094],
         [0.0158],
         [0.0126],
         [0.0055],
         [0.0025],
         [0.0015],
         [0.0019],
         [0.0272],
         [0.0371]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0133],
         [0.0102],
         [0.0337],
         [0.0063],
         [0.0051],
         [0.0079],
         [0.0057],
         [0.0098],
         [0.0016],
         [0.0022],
         [0.0080],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0049],
         [0.0092],
         [0.0245],
         [0.0051],
         [0.0098],
         [0.0040],
         [0.0112],
         [0.0046],
         [0.0084],
    


Evaluating:  39%|█████████████████████████████████████████████████████████████████████▋                                                                                                             | 778/2000 [00:36<00:57, 21.20it/s][A
Evaluating:  39%|█████████████████████████████████████████████████████████████████████▉                                                                                                             | 781/2000 [00:36<00:57, 21.24it/s][A

tensor([[[0.0662],
         [0.0968],
         [0.0700],
         [0.0425],
         [0.0163],
         [0.0093],
         [0.0184],
         [0.0856],
         [0.0117],
         [0.0217],
         [0.0327],
         [0.1059]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0046],
         [0.0056],
         [0.0041],
         [0.0016],
         [0.0016],
         [0.0036],
         [0.0022],
         [0.0010],
         [0.0005],
         [0.0015],
         [0.0044],
         [0.0023]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0353],
         [0.0296],
         [0.0125],
         [0.0117],
         [0.0154],
         [0.0085],
         [0.0397],
         [0.0079],
         [0.0045],
         [0.0131],
         [0.0072],
         [0.0058]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0422],
         [0.0766],
         [0.0944],
         [0.0117],
         [0.0150],
         [0.0071],
         [0.0259],
         [0.0638],
         [0.0847],
    


Evaluating:  39%|██████████████████████████████████████████████████████████████████████▏                                                                                                            | 784/2000 [00:36<00:57, 21.27it/s][A

reg attention sum per layer
tensor([[[0.0749],
         [0.0341],
         [0.0513],
         [0.0171],
         [0.0149],
         [0.0072],
         [0.1018],
         [0.0040],
         [0.0025],
         [0.0072],
         [0.0312],
         [0.0370]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.0204],
         [0.0655],
         [0.0148],
         [0.0326],
         [0.0097],
         [0.0139],
         [0.0263],
         [0.0066],
         [0.0086],
         [0.0303],
         [0.0295]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0399],
         [0.0757],
         [0.0044],
         [0.0082],
         [0.0068],
         [0.0266],
         [0.0224],
         [0.0089],
         [0.0045],
         [0.0129],
         [0.0227]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0297],
         [0.0233],
         [0.1164],
         [0.0233],
         [0.0198],
         [0.0119],
         [0.0116],
         [0.00


Evaluating:  39%|██████████████████████████████████████████████████████████████████████▍                                                                                                            | 787/2000 [00:36<00:57, 21.21it/s][A

tensor([[[0.0535],
         [0.0379],
         [0.1154],
         [0.0335],
         [0.0415],
         [0.0146],
         [0.0150],
         [0.0236],
         [0.0080],
         [0.0096],
         [0.0368],
         [0.0644]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0547],
         [0.0527],
         [0.0404],
         [0.0128],
         [0.0651],
         [0.0100],
         [0.0920],
         [0.0301],
         [0.0208],
         [0.0309],
         [0.0081],
         [0.0484]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0198],
         [0.0307],
         [0.0058],
         [0.0203],
         [0.0248],
         [0.0104],
         [0.0057],
         [0.0035],
         [0.0049],
         [0.0285],
         [0.0187]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0217],
         [0.0250],
         [0.1045],
         [0.0102],
         [0.0407],
         [0.0367],
         [0.0190],
         [0.0163],
         [0.0203],
    


Evaluating:  40%|██████████████████████████████████████████████████████████████████████▋                                                                                                            | 790/2000 [00:36<00:57, 20.93it/s][A
Evaluating:  40%|██████████████████████████████████████████████████████████████████████▉                                                                                                            | 793/2000 [00:37<00:57, 20.96it/s][A

tensor([[[0.0183],
         [0.0300],
         [0.0605],
         [0.0149],
         [0.0272],
         [0.0095],
         [0.0336],
         [0.0036],
         [0.0035],
         [0.0059],
         [0.0517],
         [0.0265]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0121],
         [0.0082],
         [0.0050],
         [0.0027],
         [0.0125],
         [0.0246],
         [0.0120],
         [0.0021],
         [0.0010],
         [0.0045],
         [0.0348],
         [0.0047]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0284],
         [0.0180],
         [0.0166],
         [0.0111],
         [0.0093],
         [0.0147],
         [0.0274],
         [0.0077],
         [0.0047],
         [0.0078],
         [0.0064],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0207],
         [0.0331],
         [0.1739],
         [0.0328],
         [0.0276],
         [0.0329],
         [0.0066],
         [0.0166],
         [0.0041],
    


Evaluating:  40%|███████████████████████████████████████████████████████████████████████▏                                                                                                           | 796/2000 [00:37<00:58, 20.72it/s][A

tensor([[[0.0045],
         [0.0125],
         [0.0226],
         [0.0035],
         [0.0031],
         [0.0043],
         [0.0055],
         [0.0025],
         [0.0015],
         [0.0002],
         [0.0318],
         [0.0138]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0450],
         [0.0152],
         [0.0234],
         [0.0141],
         [0.0210],
         [0.0109],
         [0.0207],
         [0.0193],
         [0.0073],
         [0.0060],
         [0.0102],
         [0.0441]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0520],
         [0.0335],
         [0.0134],
         [0.0198],
         [0.0139],
         [0.0227],
         [0.0270],
         [0.0043],
         [0.0084],
         [0.0393],
         [0.0313],
         [0.0582]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0497],
         [0.0341],
         [0.0063],
         [0.0104],
         [0.0112],
         [0.0207],
         [0.0118],
         [0.0046],
    


Evaluating:  40%|███████████████████████████████████████████████████████████████████████▌                                                                                                           | 799/2000 [00:37<00:57, 20.82it/s][A

tensor([[[0.0392],
         [0.0260],
         [0.0281],
         [0.0084],
         [0.0113],
         [0.0107],
         [0.0238],
         [0.0285],
         [0.0181],
         [0.0151],
         [0.0042],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0064],
         [0.0634],
         [0.0070],
         [0.0074],
         [0.0060],
         [0.0112],
         [0.0067],
         [0.0027],
         [0.0071],
         [0.0409],
         [0.0535]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0343],
         [0.0484],
         [0.0090],
         [0.0127],
         [0.0046],
         [0.0074],
         [0.0062],
         [0.0061],
         [0.0025],
         [0.0218],
         [0.0336]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0112],
         [0.0509],
         [0.0067],
         [0.0292],
         [0.0163],
         [0.0057],
         [0.0143],
         [0.0045],
    


Evaluating:  40%|███████████████████████████████████████████████████████████████████████▊                                                                                                           | 802/2000 [00:37<00:57, 20.97it/s][A
Evaluating:  40%|████████████████████████████████████████████████████████████████████████                                                                                                           | 805/2000 [00:37<00:56, 21.12it/s][A


reg attention sum per layer
tensor([[[0.0330],
         [0.0224],
         [0.0507],
         [0.0139],
         [0.0202],
         [0.0259],
         [0.0314],
         [0.0199],
         [0.0105],
         [0.0067],
         [0.0169],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0045],
         [0.0286],
         [0.0394],
         [0.0105],
         [0.0089],
         [0.0075],
         [0.0073],
         [0.0079],
         [0.0084],
         [0.0058],
         [0.0231],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0190],
         [0.0431],
         [0.0500],
         [0.0110],
         [0.0071],
         [0.0193],
         [0.0142],
         [0.0045],
         [0.0011],
         [0.0032],
         [0.0357],
         [0.0203]]], device='cuda:0')
reg attention sum per layer



Evaluating:  40%|████████████████████████████████████████████████████████████████████████▎                                                                                                          | 808/2000 [00:37<00:56, 21.18it/s][A

tensor([[[0.0041],
         [0.0023],
         [0.0303],
         [0.0048],
         [0.0051],
         [0.0072],
         [0.0032],
         [0.0032],
         [0.0007],
         [0.0016],
         [0.0222],
         [0.0089]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0055],
         [0.0201],
         [0.0034],
         [0.0082],
         [0.0104],
         [0.0029],
         [0.0057],
         [0.0001],
         [0.0005],
         [0.0041],
         [0.0245]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0164],
         [0.0573],
         [0.0519],
         [0.0085],
         [0.0121],
         [0.0473],
         [0.0388],
         [0.0113],
         [0.0096],
         [0.0068],
         [0.1230],
         [0.0485]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0853],
         [0.0103],
         [0.0340],
         [0.0293],
         [0.0335],
         [0.0119],
         [0.0658],
         [0.0307],
         [0.0199],
    


Evaluating:  41%|████████████████████████████████████████████████████████████████████████▌                                                                                                          | 811/2000 [00:37<00:56, 21.23it/s][A

tensor([[[0.0423],
         [0.0158],
         [0.0163],
         [0.0201],
         [0.0199],
         [0.0093],
         [0.0144],
         [0.0407],
         [0.0062],
         [0.0089],
         [0.0239],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0656],
         [0.0268],
         [0.0382],
         [0.0077],
         [0.0812],
         [0.0511],
         [0.0121],
         [0.0025],
         [0.0037],
         [0.0100],
         [0.0191],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0260],
         [0.0228],
         [0.0485],
         [0.0054],
         [0.0099],
         [0.0030],
         [0.0174],
         [0.0017],
         [0.0007],
         [0.0027],
         [0.0140],
         [0.0250]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0103],
         [0.0278],
         [0.1102],
         [0.0135],
         [0.0249],
         [0.0255],
         [0.0107],
         [0.0135],
         [0.0053],
    


Evaluating:  41%|████████████████████████████████████████████████████████████████████████▊                                                                                                          | 814/2000 [00:38<00:55, 21.22it/s][A
Evaluating:  41%|█████████████████████████████████████████████████████████████████████████                                                                                                          | 817/2000 [00:38<00:55, 21.27it/s][A

tensor([[[0.0050],
         [0.0036],
         [0.0016],
         [0.0020],
         [0.0019],
         [0.0037],
         [0.0031],
         [0.0004],
         [0.0006],
         [0.0007],
         [0.0019],
         [0.0019]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0093],
         [0.0698],
         [0.0186],
         [0.0406],
         [0.0141],
         [0.0098],
         [0.0066],
         [0.0047],
         [0.0053],
         [0.0264],
         [0.0359]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0023],
         [0.0202],
         [0.0598],
         [0.0088],
         [0.0045],
         [0.0161],
         [0.0110],
         [0.0042],
         [0.0020],
         [0.0021],
         [0.0389],
         [0.0130]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0144],
         [0.0179],
         [0.0077],
         [0.0126],
         [0.0280],
         [0.0247],
         [0.0037],
         [0.0024],
    


Evaluating:  41%|█████████████████████████████████████████████████████████████████████████▍                                                                                                         | 820/2000 [00:38<00:55, 21.25it/s][A

tensor([[[0.0219],
         [0.0611],
         [0.2192],
         [0.0447],
         [0.0387],
         [0.0178],
         [0.0067],
         [0.0298],
         [0.0387],
         [0.0053],
         [0.1006],
         [0.1419]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0435],
         [0.0108],
         [0.0080],
         [0.0043],
         [0.0064],
         [0.0081],
         [0.0313],
         [0.0352],
         [0.0133],
         [0.0159],
         [0.0026],
         [0.0085]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0363],
         [0.0751],
         [0.2871],
         [0.0768],
         [0.0455],
         [0.0094],
         [0.0672],
         [0.0040],
         [0.0356],
         [0.0176],
         [0.2208],
         [0.1168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0406],
         [0.0176],
         [0.0418],
         [0.0179],
         [0.0174],
         [0.0179],
         [0.0137],
         [0.0065],
         [0.0068],
    


Evaluating:  41%|█████████████████████████████████████████████████████████████████████████▋                                                                                                         | 823/2000 [00:38<00:55, 21.18it/s][A

tensor([[[0.0028],
         [0.0044],
         [0.0238],
         [0.0051],
         [0.0243],
         [0.0140],
         [0.0029],
         [0.0037],
         [0.0009],
         [0.0007],
         [0.0321],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1499],
         [0.0957],
         [0.0251],
         [0.0407],
         [0.0051],
         [0.0065],
         [0.0476],
         [0.0353],
         [0.0034],
         [0.0187],
         [0.0037],
         [0.0249]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0380],
         [0.0403],
         [0.1104],
         [0.0215],
         [0.0712],
         [0.0170],
         [0.0110],
         [0.0081],
         [0.0025],
         [0.0108],
         [0.0285],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0361],
         [0.0447],
         [0.0563],
         [0.0086],
         [0.0429],
         [0.0103],
         [0.0234],
         [0.0047],
         [0.0197],
    


Evaluating:  41%|█████████████████████████████████████████████████████████████████████████▉                                                                                                         | 826/2000 [00:38<00:55, 21.23it/s][A
Evaluating:  41%|██████████████████████████████████████████████████████████████████████████▏                                                                                                        | 829/2000 [00:38<00:55, 21.23it/s][A

tensor([[[0.0812],
         [0.0393],
         [0.0234],
         [0.0184],
         [0.0088],
         [0.0088],
         [0.0123],
         [0.0112],
         [0.0027],
         [0.0085],
         [0.0173],
         [0.0277]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0090],
         [0.0904],
         [0.0067],
         [0.0162],
         [0.0050],
         [0.0149],
         [0.0069],
         [0.0048],
         [0.0042],
         [0.0139],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0583],
         [0.0420],
         [0.0663],
         [0.0174],
         [0.0654],
         [0.0394],
         [0.0184],
         [0.0173],
         [0.0080],
         [0.0098],
         [0.0609],
         [0.0609]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0652],
         [0.0816],
         [0.0597],
         [0.0098],
         [0.0160],
         [0.0142],
         [0.0229],
         [0.0018],
         [0.0042],
    


Evaluating:  42%|██████████████████████████████████████████████████████████████████████████▍                                                                                                        | 832/2000 [00:38<00:54, 21.26it/s][A

tensor([[[0.0340],
         [0.0164],
         [0.0169],
         [0.0209],
         [0.0542],
         [0.0214],
         [0.0077],
         [0.0187],
         [0.0028],
         [0.0053],
         [0.0125],
         [0.0098]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0218],
         [0.0417],
         [0.0455],
         [0.0128],
         [0.0356],
         [0.0077],
         [0.0292],
         [0.0165],
         [0.0159],
         [0.0189],
         [0.0207],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0121],
         [0.0603],
         [0.0214],
         [0.0097],
         [0.0059],
         [0.0044],
         [0.0024],
         [0.0082],
         [0.0014],
         [0.0023],
         [0.0065],
         [0.0066]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0374],
         [0.0413],
         [0.0580],
         [0.0270],
         [0.0479],
         [0.0100],
         [0.0386],
         [0.0550],
         [0.1250],
    


Evaluating:  42%|██████████████████████████████████████████████████████████████████████████▋                                                                                                        | 835/2000 [00:39<00:54, 21.24it/s][A

tensor([[[0.0074],
         [0.0103],
         [0.0090],
         [0.0077],
         [0.0174],
         [0.0098],
         [0.0036],
         [0.0080],
         [0.0009],
         [0.0050],
         [0.0097],
         [0.0099]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0400],
         [0.0313],
         [0.0570],
         [0.0066],
         [0.0296],
         [0.0053],
         [0.0146],
         [0.0126],
         [0.0055],
         [0.0202],
         [0.0093],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0323],
         [0.0229],
         [0.0626],
         [0.0196],
         [0.0367],
         [0.0055],
         [0.0263],
         [0.0190],
         [0.0170],
         [0.0199],
         [0.0157],
         [0.0409]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0414],
         [0.0513],
         [0.0435],
         [0.0213],
         [0.0184],
         [0.0066],
         [0.0095],
         [0.0189],
         [0.0045],
    


Evaluating:  42%|███████████████████████████████████████████████████████████████████████████                                                                                                        | 838/2000 [00:39<00:54, 21.31it/s][A
Evaluating:  42%|███████████████████████████████████████████████████████████████████████████▎                                                                                                       | 841/2000 [00:39<00:54, 21.36it/s][A

tensor([[[0.0476],
         [0.0475],
         [0.0204],
         [0.0271],
         [0.0373],
         [0.0335],
         [0.0143],
         [0.0441],
         [0.0061],
         [0.0094],
         [0.0259],
         [0.0817]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0337],
         [0.0182],
         [0.0180],
         [0.0073],
         [0.0197],
         [0.0592],
         [0.0092],
         [0.0220],
         [0.0034],
         [0.0199],
         [0.0103],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0528],
         [0.0257],
         [0.0189],
         [0.0116],
         [0.0146],
         [0.0705],
         [0.0043],
         [0.0076],
         [0.0087],
         [0.1022],
         [0.0389]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0353],
         [0.0238],
         [0.0782],
         [0.0196],
         [0.0467],
         [0.0379],
         [0.0151],
         [0.0032],
         [0.0027],
    


Evaluating:  42%|███████████████████████████████████████████████████████████████████████████▌                                                                                                       | 844/2000 [00:39<00:54, 21.26it/s][A

tensor([[[0.0082],
         [0.0283],
         [0.0166],
         [0.0075],
         [0.0307],
         [0.0015],
         [0.0249],
         [0.0033],
         [0.0083],
         [0.0091],
         [0.0098],
         [0.0494]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1161],
         [0.1331],
         [0.0723],
         [0.0289],
         [0.0911],
         [0.0430],
         [0.0337],
         [0.0205],
         [0.0096],
         [0.0287],
         [0.0782],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0613],
         [0.0521],
         [0.0435],
         [0.0262],
         [0.0496],
         [0.0378],
         [0.0263],
         [0.0779],
         [0.0448],
         [0.0243],
         [0.0247],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0076],
         [0.0216],
         [0.0082],
         [0.0184],
         [0.0234],
         [0.0104],
         [0.0060],
         [0.0019],
    


Evaluating:  42%|███████████████████████████████████████████████████████████████████████████▊                                                                                                       | 847/2000 [00:39<00:54, 21.33it/s][A

tensor([[[0.0292],
         [0.0250],
         [0.0225],
         [0.0124],
         [0.0123],
         [0.0191],
         [0.0183],
         [0.0156],
         [0.0066],
         [0.0096],
         [0.0137],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.0085],
         [0.0807],
         [0.0156],
         [0.0250],
         [0.0088],
         [0.0162],
         [0.0167],
         [0.0095],
         [0.0126],
         [0.0184],
         [0.0516]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0087],
         [0.0111],
         [0.0139],
         [0.0137],
         [0.0188],
         [0.0052],
         [0.0028],
         [0.0011],
         [0.0020],
         [0.0081],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0281],
         [0.0265],
         [0.0608],
         [0.0146],
         [0.0184],
         [0.0049],
         [0.0119],
         [0.0121],
         [0.0060],
    


Evaluating:  42%|████████████████████████████████████████████████████████████████████████████                                                                                                       | 850/2000 [00:39<00:53, 21.38it/s][A
Evaluating:  43%|████████████████████████████████████████████████████████████████████████████▎                                                                                                      | 853/2000 [00:39<00:53, 21.50it/s][A

tensor([[[0.0169],
         [0.0624],
         [0.0242],
         [0.0195],
         [0.0052],
         [0.0342],
         [0.0417],
         [0.0042],
         [0.0036],
         [0.0057],
         [0.0890],
         [0.0319]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0162],
         [0.0211],
         [0.0342],
         [0.0115],
         [0.0160],
         [0.0108],
         [0.0176],
         [0.0065],
         [0.0077],
         [0.0077],
         [0.0129],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0182],
         [0.0550],
         [0.0432],
         [0.0149],
         [0.0099],
         [0.0166],
         [0.0034],
         [0.0058],
         [0.0048],
         [0.0509],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0162],
         [0.0220],
         [0.3011],
         [0.0546],
         [0.0636],
         [0.0092],
         [0.0117],
         [0.0040],
         [0.0220],
    


Evaluating:  43%|████████████████████████████████████████████████████████████████████████████▌                                                                                                      | 856/2000 [00:40<00:53, 21.31it/s][A

tensor([[[0.0181],
         [0.0477],
         [0.0540],
         [0.0153],
         [0.0086],
         [0.0068],
         [0.0145],
         [0.0109],
         [0.0038],
         [0.0070],
         [0.0086],
         [0.0250]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0087],
         [0.0214],
         [0.0633],
         [0.0154],
         [0.0148],
         [0.0093],
         [0.0043],
         [0.0069],
         [0.0024],
         [0.0029],
         [0.0181],
         [0.0192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0143],
         [0.0201],
         [0.0122],
         [0.0121],
         [0.0052],
         [0.0228],
         [0.0197],
         [0.0127],
         [0.0019],
         [0.0033],
         [0.0107],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0231],
         [0.1436],
         [0.0191],
         [0.0468],
         [0.0058],
         [0.0054],
         [0.0045],
         [0.0093],
    


Evaluating:  43%|████████████████████████████████████████████████████████████████████████████▉                                                                                                      | 859/2000 [00:40<00:53, 21.27it/s][A

tensor([[[0.0283],
         [0.0988],
         [0.0927],
         [0.0180],
         [0.0168],
         [0.0089],
         [0.0218],
         [0.0668],
         [0.0073],
         [0.0073],
         [0.0693],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0237],
         [0.0862],
         [0.0343],
         [0.0216],
         [0.0041],
         [0.0042],
         [0.0156],
         [0.0188],
         [0.0071],
         [0.0067],
         [0.0359],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0155],
         [0.0328],
         [0.1990],
         [0.0186],
         [0.0242],
         [0.0464],
         [0.0262],
         [0.0089],
         [0.0080],
         [0.0031],
         [0.0569],
         [0.0553]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0210],
         [0.0537],
         [0.0751],
         [0.0101],
         [0.0176],
         [0.0251],
         [0.0165],
         [0.0096],
         [0.0024],
    


Evaluating:  43%|█████████████████████████████████████████████████████████████████████████████▏                                                                                                     | 862/2000 [00:40<00:53, 21.34it/s][A
Evaluating:  43%|█████████████████████████████████████████████████████████████████████████████▍                                                                                                     | 865/2000 [00:40<00:53, 21.38it/s][A

tensor([[[0.0378],
         [0.0477],
         [0.0873],
         [0.0379],
         [0.0180],
         [0.0248],
         [0.0084],
         [0.0329],
         [0.0024],
         [0.0056],
         [0.0665],
         [0.0774]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2745],
         [0.1243],
         [0.0420],
         [0.0431],
         [0.0313],
         [0.0079],
         [0.1052],
         [0.0231],
         [0.0559],
         [0.0543],
         [0.0058],
         [0.0459]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0364],
         [0.0047],
         [0.0221],
         [0.0190],
         [0.0292],
         [0.0127],
         [0.0051],
         [0.0183],
         [0.0040],
         [0.0138],
         [0.0045],
         [0.0054]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0271],
         [0.0292],
         [0.0071],
         [0.0071],
         [0.0100],
         [0.0245],
         [0.0079],
         [0.0064],
    


Evaluating:  43%|█████████████████████████████████████████████████████████████████████████████▋                                                                                                     | 868/2000 [00:40<00:53, 21.32it/s][A

tensor([[[0.0299],
         [0.0350],
         [0.0348],
         [0.0125],
         [0.0151],
         [0.0069],
         [0.0321],
         [0.0043],
         [0.0061],
         [0.0055],
         [0.0346],
         [0.0375]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0191],
         [0.0123],
         [0.0075],
         [0.0022],
         [0.0035],
         [0.0081],
         [0.0068],
         [0.0064],
         [0.0010],
         [0.0021],
         [0.0041],
         [0.0094]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0285],
         [0.0145],
         [0.0245],
         [0.0064],
         [0.0095],
         [0.0217],
         [0.0291],
         [0.0102],
         [0.0057],
         [0.0073],
         [0.0143],
         [0.0158]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0308],
         [0.0164],
         [0.0179],
         [0.0089],
         [0.0234],
         [0.0133],
         [0.0176],
         [0.0163],
         [0.0074],
    


Evaluating:  44%|█████████████████████████████████████████████████████████████████████████████▉                                                                                                     | 871/2000 [00:40<00:53, 21.23it/s][A

tensor([[[0.2043],
         [0.1559],
         [0.0284],
         [0.0222],
         [0.0164],
         [0.0076],
         [0.0598],
         [0.0072],
         [0.0036],
         [0.0248],
         [0.0083],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1216],
         [0.1043],
         [0.0609],
         [0.0232],
         [0.0494],
         [0.0167],
         [0.0134],
         [0.0932],
         [0.0139],
         [0.0147],
         [0.0077],
         [0.0323]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0551],
         [0.0066],
         [0.0336],
         [0.0183],
         [0.0204],
         [0.0205],
         [0.0182],
         [0.0105],
         [0.0101],
         [0.0062],
         [0.0046],
         [0.0518]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0845],
         [0.0241],
         [0.0286],
         [0.0126],
         [0.0172],
         [0.0065],
         [0.0199],
         [0.0072],
         [0.0081],
    


Evaluating:  44%|██████████████████████████████████████████████████████████████████████████████▏                                                                                                    | 874/2000 [00:40<00:53, 21.17it/s][A
Evaluating:  44%|██████████████████████████████████████████████████████████████████████████████▍                                                                                                    | 877/2000 [00:41<00:52, 21.22it/s][A

tensor([[[0.0584],
         [0.0406],
         [0.0515],
         [0.0238],
         [0.0264],
         [0.0149],
         [0.0421],
         [0.0221],
         [0.0272],
         [0.0138],
         [0.0077],
         [0.0479]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0298],
         [0.0107],
         [0.2037],
         [0.0178],
         [0.0490],
         [0.0205],
         [0.0090],
         [0.0078],
         [0.0044],
         [0.0084],
         [0.0313],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.0151],
         [0.0198],
         [0.0077],
         [0.0493],
         [0.0231],
         [0.0149],
         [0.0075],
         [0.0066],
         [0.0172],
         [0.0136],
         [0.0164]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0348],
         [0.0190],
         [0.0125],
         [0.0104],
         [0.0089],
         [0.0200],
         [0.0076],
         [0.0101],
         [0.0012],
    


Evaluating:  44%|██████████████████████████████████████████████████████████████████████████████▊                                                                                                    | 880/2000 [00:41<00:53, 20.94it/s][A


tensor([[[0.0247],
         [0.0500],
         [0.1376],
         [0.0483],
         [0.0376],
         [0.0228],
         [0.0060],
         [0.0140],
         [0.0042],
         [0.0034],
         [0.0531],
         [0.1581]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0082],
         [0.0076],
         [0.0063],
         [0.0047],
         [0.0082],
         [0.0067],
         [0.0036],
         [0.0006],
         [0.0012],
         [0.0068],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0544],
         [0.0084],
         [0.0328],
         [0.0206],
         [0.0391],
         [0.0123],
         [0.0149],
         [0.0339],
         [0.0179],
         [0.0146],
         [0.0064],
         [0.0442]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0281],
         [0.0492],
         [0.0082],
         [0.0128],
         [0.0450],
         [0.0051],
         [0.0151],
         [0.0020],
   


Evaluating:  44%|███████████████████████████████████████████████████████████████████████████████                                                                                                    | 883/2000 [00:41<00:53, 20.97it/s][A

tensor([[[0.0088],
         [0.0263],
         [0.0340],
         [0.0043],
         [0.0084],
         [0.0073],
         [0.0109],
         [0.0083],
         [0.0047],
         [0.0033],
         [0.0366],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0972],
         [0.0409],
         [0.0261],
         [0.0291],
         [0.0302],
         [0.0225],
         [0.0478],
         [0.0194],
         [0.0118],
         [0.0202],
         [0.0144],
         [0.0303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0943],
         [0.0739],
         [0.0410],
         [0.0156],
         [0.0108],
         [0.0087],
         [0.0973],
         [0.0016],
         [0.0207],
         [0.0252],
         [0.0200],
         [0.0064]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0154],
         [0.0273],
         [0.0117],
         [0.0083],
         [0.0049],
         [0.0099],
         [0.0124],
         [0.0013],
    


Evaluating:  44%|███████████████████████████████████████████████████████████████████████████████▎                                                                                                   | 886/2000 [00:41<00:52, 21.12it/s][A
Evaluating:  44%|███████████████████████████████████████████████████████████████████████████████▌                                                                                                   | 889/2000 [00:41<00:52, 21.28it/s][A


reg attention sum per layer
tensor([[[0.0434],
         [0.0879],
         [0.2425],
         [0.0297],
         [0.0227],
         [0.0295],
         [0.0150],
         [0.0099],
         [0.0080],
         [0.0034],
         [0.0720],
         [0.0584]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0447],
         [0.0890],
         [0.0785],
         [0.0454],
         [0.0523],
         [0.0080],
         [0.0134],
         [0.0127],
         [0.0066],
         [0.0162],
         [0.0321],
         [0.0384]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.0058],
         [0.0287],
         [0.0084],
         [0.0567],
         [0.0038],
         [0.0154],
         [0.0020],
         [0.0164],
         [0.0035],
         [0.0150],
         [0.0114]]], device='cuda:0')
reg attention sum per layer



Evaluating:  45%|███████████████████████████████████████████████████████████████████████████████▊                                                                                                   | 892/2000 [00:41<00:52, 21.07it/s][A

tensor([[[0.0412],
         [0.0073],
         [0.0097],
         [0.0052],
         [0.0296],
         [0.0039],
         [0.0249],
         [0.0031],
         [0.0370],
         [0.0075],
         [0.0017],
         [0.0046]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0370],
         [0.0693],
         [0.0888],
         [0.0283],
         [0.0132],
         [0.0332],
         [0.0114],
         [0.0202],
         [0.0019],
         [0.0031],
         [0.0476],
         [0.0557]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0511],
         [0.0799],
         [0.0867],
         [0.0180],
         [0.0568],
         [0.0124],
         [0.0123],
         [0.0493],
         [0.0274],
         [0.0101],
         [0.0242],
         [0.0426]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0098],
         [0.0258],
         [0.0048],
         [0.0370],
         [0.0142],
         [0.0175],
         [0.0059],
         [0.0211],
    


Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████                                                                                                   | 895/2000 [00:41<00:52, 21.15it/s][A

tensor([[[0.0218],
         [0.0148],
         [0.0179],
         [0.0094],
         [0.0068],
         [0.0023],
         [0.0327],
         [0.0137],
         [0.0130],
         [0.0083],
         [0.0021],
         [0.0231]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0163],
         [0.0536],
         [0.0364],
         [0.0043],
         [0.0033],
         [0.0033],
         [0.0123],
         [0.0020],
         [0.0011],
         [0.0012],
         [0.0497],
         [0.0097]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0042],
         [0.0281],
         [0.0681],
         [0.0289],
         [0.0082],
         [0.0202],
         [0.0056],
         [0.0093],
         [0.0016],
         [0.0025],
         [0.1676],
         [0.1308]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0092],
         [0.0061],
         [0.0027],
         [0.0036],
         [0.0038],
         [0.0088],
         [0.0012],
         [0.0016],
    


Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████▎                                                                                                  | 898/2000 [00:42<00:51, 21.29it/s][A
Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████▋                                                                                                  | 901/2000 [00:42<00:51, 21.24it/s][A

tensor([[[0.0281],
         [0.1883],
         [0.1055],
         [0.0484],
         [0.0358],
         [0.0512],
         [0.0308],
         [0.0255],
         [0.0071],
         [0.0176],
         [0.1066],
         [0.0745]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0254],
         [0.1037],
         [0.0673],
         [0.0113],
         [0.0134],
         [0.0081],
         [0.0364],
         [0.0041],
         [0.0075],
         [0.0083],
         [0.0563],
         [0.0267]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1073],
         [0.0569],
         [0.0289],
         [0.0419],
         [0.0192],
         [0.0140],
         [0.0212],
         [0.0140],
         [0.0034],
         [0.0147],
         [0.0131],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0355],
         [0.1082],
         [0.0361],
         [0.0539],
         [0.0150],
         [0.0159],
         [0.0099],
         [0.0278],
    


Evaluating:  45%|████████████████████████████████████████████████████████████████████████████████▉                                                                                                  | 904/2000 [00:42<00:51, 21.31it/s][A

tensor([[[0.0603],
         [0.0401],
         [0.0720],
         [0.0183],
         [0.0080],
         [0.0078],
         [0.0225],
         [0.0247],
         [0.0361],
         [0.0065],
         [0.0130],
         [0.0808]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0129],
         [0.0121],
         [0.0510],
         [0.0058],
         [0.0063],
         [0.0059],
         [0.0084],
         [0.0062],
         [0.0030],
         [0.0025],
         [0.0058],
         [0.0289]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1776],
         [0.0198],
         [0.0413],
         [0.0425],
         [0.0689],
         [0.0112],
         [0.0564],
         [0.0125],
         [0.0897],
         [0.0255],
         [0.0060],
         [0.0506]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0076],
         [0.0199],
         [0.0036],
         [0.0165],
         [0.0044],
         [0.0092],
         [0.0019],
         [0.0036],
    


Evaluating:  45%|█████████████████████████████████████████████████████████████████████████████████▏                                                                                                 | 907/2000 [00:42<00:51, 21.32it/s][A

tensor([[[0.0225],
         [0.0409],
         [0.1229],
         [0.0113],
         [0.0245],
         [0.0221],
         [0.0166],
         [0.0064],
         [0.0089],
         [0.0061],
         [0.0389],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0429],
         [0.0255],
         [0.0367],
         [0.0104],
         [0.0092],
         [0.0197],
         [0.0626],
         [0.0350],
         [0.0138],
         [0.0163],
         [0.0124],
         [0.0236]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0207],
         [0.0372],
         [0.0489],
         [0.0133],
         [0.0169],
         [0.0161],
         [0.0116],
         [0.0148],
         [0.0034],
         [0.0058],
         [0.0315],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0748],
         [0.0862],
         [0.0532],
         [0.0349],
         [0.0093],
         [0.0168],
         [0.0560],
         [0.0098],
         [0.0141],
    


Evaluating:  46%|█████████████████████████████████████████████████████████████████████████████████▍                                                                                                 | 910/2000 [00:42<00:50, 21.41it/s][A
Evaluating:  46%|█████████████████████████████████████████████████████████████████████████████████▋                                                                                                 | 913/2000 [00:42<00:51, 21.16it/s][A


reg attention sum per layer
tensor([[[0.0232],
         [0.0348],
         [0.0192],
         [0.0116],
         [0.0140],
         [0.0065],
         [0.0587],
         [0.0189],
         [0.0072],
         [0.0083],
         [0.0056],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0835],
         [0.1970],
         [0.0317],
         [0.0324],
         [0.0276],
         [0.0151],
         [0.1926],
         [0.0176],
         [0.0313],
         [0.0493],
         [0.0051],
         [0.0246]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0046],
         [0.0269],
         [0.1415],
         [0.0118],
         [0.0580],
         [0.0085],
         [0.0142],
         [0.0059],
         [0.0061],
         [0.0026],
         [0.1094],
         [0.0476]]], device='cuda:0')
reg attention sum per layer



Evaluating:  46%|█████████████████████████████████████████████████████████████████████████████████▉                                                                                                 | 916/2000 [00:42<00:51, 21.21it/s][A

tensor([[[0.0419],
         [0.0278],
         [0.0775],
         [0.0157],
         [0.0247],
         [0.0073],
         [0.0210],
         [0.0054],
         [0.0094],
         [0.0115],
         [0.0206],
         [0.0435]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0062],
         [0.0196],
         [0.0036],
         [0.0142],
         [0.0102],
         [0.0049],
         [0.0059],
         [0.0025],
         [0.0043],
         [0.0254],
         [0.0074]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0272],
         [0.0409],
         [0.0377],
         [0.0119],
         [0.0067],
         [0.0048],
         [0.0319],
         [0.0027],
         [0.0029],
         [0.0043],
         [0.0260],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0275],
         [0.0177],
         [0.0082],
         [0.0160],
         [0.0156],
         [0.0061],
         [0.0227],
         [0.0020],
         [0.0030],
    


Evaluating:  46%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                                | 919/2000 [00:43<00:51, 21.03it/s][A

tensor([[[0.0213],
         [0.0870],
         [0.1303],
         [0.0188],
         [0.0187],
         [0.0168],
         [0.0295],
         [0.0471],
         [0.0385],
         [0.0115],
         [0.0265],
         [0.1412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0509],
         [0.0351],
         [0.1400],
         [0.0264],
         [0.0260],
         [0.0356],
         [0.0089],
         [0.0068],
         [0.0029],
         [0.0056],
         [0.0660],
         [0.0228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0185],
         [0.0514],
         [0.0110],
         [0.0298],
         [0.0132],
         [0.0133],
         [0.0070],
         [0.0096],
         [0.0040],
         [0.0224],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0419],
         [0.0225],
         [0.0081],
         [0.0024],
         [0.0066],
         [0.0094],
         [0.0208],
         [0.0087],
         [0.0136],
    


Evaluating:  46%|██████████████████████████████████████████████████████████████████████████████████▌                                                                                                | 922/2000 [00:43<00:51, 21.12it/s][A
Evaluating:  46%|██████████████████████████████████████████████████████████████████████████████████▊                                                                                                | 925/2000 [00:43<00:50, 21.09it/s][A

tensor([[[0.0866],
         [0.0277],
         [0.0334],
         [0.0177],
         [0.0275],
         [0.0071],
         [0.0216],
         [0.0154],
         [0.0046],
         [0.0245],
         [0.0313],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0363],
         [0.0591],
         [0.0461],
         [0.0175],
         [0.0453],
         [0.0232],
         [0.0989],
         [0.0188],
         [0.0707],
         [0.0354],
         [0.0090],
         [0.0228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0482],
         [0.0629],
         [0.1740],
         [0.1065],
         [0.0661],
         [0.0238],
         [0.0130],
         [0.0086],
         [0.0123],
         [0.0071],
         [0.1592],
         [0.1613]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0780],
         [0.0552],
         [0.0478],
         [0.0438],
         [0.0272],
         [0.0308],
         [0.0295],
         [0.0037],
         [0.0101],
    


Evaluating:  46%|███████████████████████████████████████████████████████████████████████████████████                                                                                                | 928/2000 [00:43<00:50, 21.08it/s][A

reg attention sum per layer
tensor([[[0.1422],
         [0.0904],
         [0.0572],
         [0.0220],
         [0.0379],
         [0.0395],
         [0.0224],
         [0.0182],
         [0.0057],
         [0.0087],
         [0.0268],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0245],
         [0.0569],
         [0.1502],
         [0.0141],
         [0.0337],
         [0.0249],
         [0.0614],
         [0.0454],
         [0.0143],
         [0.0192],
         [0.0699],
         [0.0449]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0489],
         [0.0752],
         [0.0448],
         [0.0140],
         [0.0111],
         [0.0131],
         [0.0285],
         [0.0247],
         [0.0067],
         [0.0106],
         [0.0134],
         [0.0706]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0973],
         [0.0072],
         [0.0173],
         [0.0260],
         [0.0131],
         [0.0053],
         [0.0183],
         [0.00


Evaluating:  47%|███████████████████████████████████████████████████████████████████████████████████▎                                                                                               | 931/2000 [00:43<00:50, 21.11it/s][A

tensor([[[0.0135],
         [0.0113],
         [0.0357],
         [0.0167],
         [0.0132],
         [0.0037],
         [0.0096],
         [0.0011],
         [0.0042],
         [0.0035],
         [0.0183],
         [0.0467]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0285],
         [0.0526],
         [0.0411],
         [0.0168],
         [0.0152],
         [0.0192],
         [0.0212],
         [0.0048],
         [0.0146],
         [0.0102],
         [0.0155],
         [0.0236]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0192],
         [0.0913],
         [0.1098],
         [0.0317],
         [0.0139],
         [0.0145],
         [0.0129],
         [0.0080],
         [0.0022],
         [0.0043],
         [0.1002],
         [0.0491]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0024],
         [0.0234],
         [0.0289],
         [0.0062],
         [0.0040],
         [0.0021],
         [0.0107],
         [0.0050],
         [0.0016],
    


Evaluating:  47%|███████████████████████████████████████████████████████████████████████████████████▌                                                                                               | 934/2000 [00:43<00:50, 21.09it/s][A
Evaluating:  47%|███████████████████████████████████████████████████████████████████████████████████▊                                                                                               | 937/2000 [00:43<00:50, 21.11it/s][A

tensor([[[0.0105],
         [0.0409],
         [0.0889],
         [0.0264],
         [0.0085],
         [0.0043],
         [0.0108],
         [0.0024],
         [0.0036],
         [0.0019],
         [0.1915],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0105],
         [0.0144],
         [0.0461],
         [0.0124],
         [0.0077],
         [0.0057],
         [0.0093],
         [0.0205],
         [0.0032],
         [0.0046],
         [0.0082],
         [0.0233]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0067],
         [0.0061],
         [0.1056],
         [0.0084],
         [0.0084],
         [0.0016],
         [0.0241],
         [0.0042],
         [0.0055],
         [0.0123],
         [0.0187],
         [0.1232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0329],
         [0.0224],
         [0.0276],
         [0.0108],
         [0.0112],
         [0.0124],
         [0.0089],
         [0.0049],
         [0.0008],
    


Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▏                                                                                              | 940/2000 [00:44<00:50, 21.18it/s][A

tensor([[[0.0049],
         [0.0161],
         [0.1291],
         [0.0130],
         [0.0092],
         [0.0054],
         [0.0117],
         [0.0084],
         [0.0035],
         [0.0016],
         [0.0638],
         [0.0772]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0709],
         [0.1025],
         [0.0448],
         [0.0253],
         [0.0506],
         [0.0242],
         [0.0264],
         [0.0165],
         [0.0060],
         [0.0176],
         [0.0794],
         [0.0825]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0263],
         [0.0565],
         [0.1408],
         [0.0210],
         [0.0430],
         [0.0600],
         [0.0379],
         [0.0132],
         [0.0057],
         [0.0120],
         [0.1384],
         [0.0480]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0294],
         [0.0109],
         [0.0355],
         [0.0083],
         [0.0138],
         [0.0071],
         [0.0139],
         [0.0095],
         [0.0074],
    


Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▍                                                                                              | 943/2000 [00:44<00:49, 21.18it/s][A

tensor([[[0.0929],
         [0.1089],
         [0.0649],
         [0.0569],
         [0.0241],
         [0.0139],
         [0.0496],
         [0.0162],
         [0.0181],
         [0.0248],
         [0.0165],
         [0.0295]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0124],
         [0.0157],
         [0.0239],
         [0.0086],
         [0.0225],
         [0.0303],
         [0.0056],
         [0.0082],
         [0.0038],
         [0.0082],
         [0.0233],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0275],
         [0.0260],
         [0.1021],
         [0.0275],
         [0.0211],
         [0.0237],
         [0.0106],
         [0.0170],
         [0.0042],
         [0.0064],
         [0.0382],
         [0.0553]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0133],
         [0.0117],
         [0.0281],
         [0.0056],
         [0.0095],
         [0.0067],
         [0.0154],
         [0.0023],
         [0.0050],
    


Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                              | 946/2000 [00:44<00:49, 21.32it/s][A
Evaluating:  47%|████████████████████████████████████████████████████████████████████████████████████▉                                                                                              | 949/2000 [00:44<00:49, 21.41it/s][A

tensor([[[0.0590],
         [0.0322],
         [0.0244],
         [0.0147],
         [0.0119],
         [0.0108],
         [0.0221],
         [0.0028],
         [0.0016],
         [0.0044],
         [0.0195],
         [0.0836]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0360],
         [0.0407],
         [0.0165],
         [0.0210],
         [0.0067],
         [0.0084],
         [0.0176],
         [0.0100],
         [0.0095],
         [0.0265],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0150],
         [0.0049],
         [0.0563],
         [0.0066],
         [0.0324],
         [0.0057],
         [0.0048],
         [0.0051],
         [0.0067],
         [0.0037],
         [0.0094],
         [0.0290]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0531],
         [0.0546],
         [0.1297],
         [0.0292],
         [0.0157],
         [0.0104],
         [0.0290],
         [0.0201],
         [0.0111],
    


Evaluating:  48%|█████████████████████████████████████████████████████████████████████████████████████▏                                                                                             | 952/2000 [00:44<00:49, 21.34it/s][A

tensor([[[0.0468],
         [0.0211],
         [0.0429],
         [0.0084],
         [0.0256],
         [0.0209],
         [0.0454],
         [0.0069],
         [0.0058],
         [0.0256],
         [0.0326],
         [0.0116]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1052],
         [0.0146],
         [0.1287],
         [0.0082],
         [0.0028],
         [0.0419],
         [0.0378],
         [0.0052],
         [0.0016],
         [0.0113],
         [0.0008],
         [0.0186]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0343],
         [0.0323],
         [0.0732],
         [0.0278],
         [0.0648],
         [0.0230],
         [0.0340],
         [0.0152],
         [0.0076],
         [0.0136],
         [0.3078],
         [0.0517]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0516],
         [0.0444],
         [0.0485],
         [0.0178],
         [0.0458],
         [0.0462],
         [0.0126],
         [0.0136],
         [0.0247],
    


Evaluating:  48%|█████████████████████████████████████████████████████████████████████████████████████▍                                                                                             | 955/2000 [00:44<00:48, 21.39it/s][A


reg attention sum per layer
tensor([[[0.0681],
         [0.0475],
         [0.0484],
         [0.0167],
         [0.0311],
         [0.0264],
         [0.0183],
         [0.0312],
         [0.0039],
         [0.0257],
         [0.0230],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0251],
         [0.0339],
         [0.0141],
         [0.0089],
         [0.0160],
         [0.0195],
         [0.0025],
         [0.0029],
         [0.0036],
         [0.0269],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0265],
         [0.0339],
         [0.0522],
         [0.0202],
         [0.0237],
         [0.0158],
         [0.0287],
         [0.0109],
         [0.0065],
         [0.0104],
         [0.0130],
         [0.0408]]], device='cuda:0')
reg attention sum per layer



Evaluating:  48%|█████████████████████████████████████████████████████████████████████████████████████▋                                                                                             | 958/2000 [00:44<00:48, 21.28it/s][A
Evaluating:  48%|██████████████████████████████████████████████████████████████████████████████████████                                                                                             | 961/2000 [00:45<00:51, 20.36it/s][A

tensor([[[0.1066],
         [0.0861],
         [0.1627],
         [0.0844],
         [0.0889],
         [0.0389],
         [0.0130],
         [0.0174],
         [0.0143],
         [0.0164],
         [0.1362],
         [0.1010]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0223],
         [0.0310],
         [0.0409],
         [0.0197],
         [0.0352],
         [0.0031],
         [0.0378],
         [0.0157],
         [0.0168],
         [0.0147],
         [0.0123],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0057],
         [0.0555],
         [0.0130],
         [0.0249],
         [0.0092],
         [0.0059],
         [0.0145],
         [0.0013],
         [0.0101],
         [0.0151],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0520],
         [0.0822],
         [0.0721],
         [0.0197],
         [0.0144],
         [0.0095],
         [0.0147],
         [0.0041],
         [0.0018],
    


Evaluating:  48%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                            | 964/2000 [00:45<00:50, 20.35it/s][A

reg attention sum per layer
tensor([[[0.0228],
         [0.0232],
         [0.0362],
         [0.0131],
         [0.0135],
         [0.0149],
         [0.0160],
         [0.0137],
         [0.0046],
         [0.0052],
         [0.0084],
         [0.0106]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0391],
         [0.0684],
         [0.0372],
         [0.0107],
         [0.0160],
         [0.0186],
         [0.0219],
         [0.0155],
         [0.0049],
         [0.0067],
         [0.0161],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0178],
         [0.0077],
         [0.0084],
         [0.0103],
         [0.0225],
         [0.0129],
         [0.0078],
         [0.0044],
         [0.0126],
         [0.0138],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0244],
         [0.0226],
         [0.0473],
         [0.0069],
         [0.0215],
         [0.0143],
         [0.0131],
         [0.00


Evaluating:  48%|██████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 967/2000 [00:45<00:50, 20.47it/s][A

tensor([[[0.0431],
         [0.0484],
         [0.1060],
         [0.0122],
         [0.0343],
         [0.0051],
         [0.0373],
         [0.0041],
         [0.0115],
         [0.0057],
         [0.0172],
         [0.0336]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1225],
         [0.0363],
         [0.0153],
         [0.0106],
         [0.0169],
         [0.0123],
         [0.0515],
         [0.0283],
         [0.0060],
         [0.0092],
         [0.0056],
         [0.0131]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0761],
         [0.0245],
         [0.0109],
         [0.0122],
         [0.0138],
         [0.0479],
         [0.0448],
         [0.0039],
         [0.0019],
         [0.0102],
         [0.0172],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0197],
         [0.0189],
         [0.0316],
         [0.0146],
         [0.0116],
         [0.0090],
         [0.0089],
         [0.0085],
         [0.0022],
    


Evaluating:  48%|██████████████████████████████████████████████████████████████████████████████████████▊                                                                                            | 970/2000 [00:45<00:49, 20.72it/s][A
Evaluating:  49%|███████████████████████████████████████████████████████████████████████████████████████                                                                                            | 973/2000 [00:45<00:49, 20.81it/s][A

tensor([[[0.0389],
         [0.0143],
         [0.0217],
         [0.0142],
         [0.0102],
         [0.0067],
         [0.0162],
         [0.0147],
         [0.0047],
         [0.0108],
         [0.0108],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0595],
         [0.0540],
         [0.0308],
         [0.0086],
         [0.0089],
         [0.0099],
         [0.0481],
         [0.0240],
         [0.0126],
         [0.0116],
         [0.0138],
         [0.0363]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0117],
         [0.0291],
         [0.0069],
         [0.0033],
         [0.0034],
         [0.0046],
         [0.0270],
         [0.0014],
         [0.0024],
         [0.0060],
         [0.0366],
         [0.0129]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0153],
         [0.0280],
         [0.0530],
         [0.0134],
         [0.0112],
         [0.0103],
         [0.0079],
         [0.0068],
         [0.0019],
    


Evaluating:  49%|███████████████████████████████████████████████████████████████████████████████████████▎                                                                                           | 976/2000 [00:45<00:49, 20.79it/s][A

tensor([[[0.0133],
         [0.0120],
         [0.0394],
         [0.0162],
         [0.0236],
         [0.0158],
         [0.0252],
         [0.0051],
         [0.0028],
         [0.0104],
         [0.0792],
         [0.0507]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0707],
         [0.0958],
         [0.0654],
         [0.0270],
         [0.0206],
         [0.0085],
         [0.0323],
         [0.0164],
         [0.0044],
         [0.0139],
         [0.0223],
         [0.0951]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0545],
         [0.0026],
         [0.0582],
         [0.0139],
         [0.0208],
         [0.0048],
         [0.0090],
         [0.0070],
         [0.0043],
         [0.0034],
         [0.0092],
         [0.0232]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0241],
         [0.0322],
         [0.0749],
         [0.0181],
         [0.0217],
         [0.0104],
         [0.0182],
         [0.0306],
         [0.0065],
    


Evaluating:  49%|███████████████████████████████████████████████████████████████████████████████████████▌                                                                                           | 979/2000 [00:45<00:49, 20.82it/s][A

tensor([[[0.0272],
         [0.0144],
         [0.0175],
         [0.0155],
         [0.0215],
         [0.0246],
         [0.0187],
         [0.0055],
         [0.0049],
         [0.0076],
         [0.0126],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1006],
         [0.1077],
         [0.0400],
         [0.0105],
         [0.0101],
         [0.0061],
         [0.0298],
         [0.0145],
         [0.0171],
         [0.0094],
         [0.0117],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0523],
         [0.0337],
         [0.0999],
         [0.0236],
         [0.0352],
         [0.0046],
         [0.0169],
         [0.0095],
         [0.0171],
         [0.0218],
         [0.0165],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0726],
         [0.1011],
         [0.0187],
         [0.0104],
         [0.0126],
         [0.0097],
         [0.0062],
         [0.0038],
    


Evaluating:  49%|███████████████████████████████████████████████████████████████████████████████████████▉                                                                                           | 982/2000 [00:46<00:48, 21.02it/s][A
Evaluating:  49%|████████████████████████████████████████████████████████████████████████████████████████▏                                                                                          | 985/2000 [00:46<00:47, 21.24it/s][A

tensor([[[0.0166],
         [0.0280],
         [0.1865],
         [0.0188],
         [0.0232],
         [0.0166],
         [0.0074],
         [0.0145],
         [0.0061],
         [0.0036],
         [0.0360],
         [0.0586]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0309],
         [0.1307],
         [0.0439],
         [0.0260],
         [0.0134],
         [0.0195],
         [0.0066],
         [0.0117],
         [0.0100],
         [0.0416],
         [0.0422]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.0445],
         [0.0369],
         [0.0066],
         [0.0213],
         [0.0266],
         [0.0211],
         [0.0137],
         [0.0068],
         [0.0088],
         [0.0180],
         [0.0353]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0234],
         [0.0826],
         [0.0892],
         [0.0088],
         [0.0296],
         [0.0074],
         [0.0160],
         [0.0075],
         [0.0086],
    


Evaluating:  49%|████████████████████████████████████████████████████████████████████████████████████████▍                                                                                          | 988/2000 [00:46<00:47, 21.25it/s][A

tensor([[[0.0241],
         [0.0522],
         [0.1366],
         [0.0206],
         [0.0294],
         [0.0305],
         [0.0319],
         [0.0059],
         [0.0100],
         [0.0097],
         [0.0465],
         [0.0341]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0731],
         [0.0614],
         [0.1157],
         [0.0301],
         [0.1022],
         [0.0368],
         [0.0346],
         [0.0918],
         [0.0750],
         [0.0334],
         [0.0387],
         [0.0312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0177],
         [0.0541],
         [0.0463],
         [0.0113],
         [0.0319],
         [0.0132],
         [0.0229],
         [0.0258],
         [0.0095],
         [0.0128],
         [0.0188],
         [0.04


Evaluating:  50%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                                          | 991/2000 [00:46<00:47, 21.32it/s][A

tensor([[[0.0296],
         [0.0424],
         [0.0758],
         [0.0195],
         [0.0269],
         [0.0204],
         [0.0123],
         [0.0122],
         [0.0080],
         [0.0052],
         [0.0408],
         [0.0375]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0329],
         [0.0412],
         [0.0323],
         [0.0080],
         [0.0204],
         [0.0084],
         [0.0363],
         [0.0059],
         [0.0060],
         [0.0142],
         [0.0317],
         [0.0108]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0311],
         [0.0174],
         [0.0120],
         [0.0083],
         [0.0215],
         [0.0260],
         [0.0219],
         [0.0115],
         [0.0228],
         [0.0290],
         [0.0137],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0336],
         [0.0553],
         [0.0126],
         [0.0179],
         [0.0128],
         [0.0113],
         [0.0049],
         [0.0015],
    


Evaluating:  50%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                                          | 994/2000 [00:46<00:47, 21.37it/s][A
Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                         | 997/2000 [00:46<00:49, 20.44it/s][A

tensor([[[0.0093],
         [0.0143],
         [0.0660],
         [0.0083],
         [0.0148],
         [0.0031],
         [0.0093],
         [0.0065],
         [0.0037],
         [0.0021],
         [0.0178],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2911],
         [0.2535],
         [0.0700],
         [0.0362],
         [0.0377],
         [0.0202],
         [0.2439],
         [0.0302],
         [0.0351],
         [0.0416],
         [0.0393],
         [0.0615]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0806],
         [0.0136],
         [0.0446],
         [0.0095],
         [0.0250],
         [0.0084],
         [0.0533],
         [0.0548],
         [0.0473],
         [0.0281],
         [0.0032],
         [0.0442]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0354],
         [0.0623],
         [0.0694],
         [0.0299],
         [0.0296],
         [0.0078],
         [0.0089],
         [0.0115],
         [0.0048],
    


Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████                                                                                         | 1000/2000 [00:46<00:50, 19.76it/s][A

reg attention sum per layer
tensor([[[0.0081],
         [0.0261],
         [0.0260],
         [0.0047],
         [0.0070],
         [0.0040],
         [0.0376],
         [0.0034],
         [0.0029],
         [0.0039],
         [0.0290],
         [0.0083]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0313],
         [0.0259],
         [0.0455],
         [0.0165],
         [0.0227],
         [0.0176],
         [0.0239],
         [0.0085],
         [0.0032],
         [0.0065],
         [0.0095],
         [0.0158]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0473],
         [0.0932],
         [0.0451],
         [0.0166],
         [0.0238],
         [0.0171],
         [0.0263],
         [0.0720],
         [0.0112],
         [0.0085],
         [0.0659],
         [0.0451]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0155],
         [0.0344],
         [0.0154],
         [0.0129],
         [0.0066],
         [0.0040],
         [0.01


Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                                        | 1002/2000 [00:47<00:51, 19.19it/s][A
Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                                        | 1004/2000 [00:47<00:51, 19.28it/s][A

reg attention sum per layer
tensor([[[0.0018],
         [0.0047],
         [0.0344],
         [0.0036],
         [0.0139],
         [0.0059],
         [0.0049],
         [0.0073],
         [0.0065],
         [0.0013],
         [0.0126],
         [0.0089]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0244],
         [0.0884],
         [0.0312],
         [0.0065],
         [0.0263],
         [0.0434],
         [0.0184],
         [0.0331],
         [0.0026],
         [0.0101],
         [0.0464],
         [0.0223]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0153],
         [0.0735],
         [0.0785],
         [0.0149],
         [0.0119],
         [0.0083],
         [0.0105],
         [0.0128],
         [0.0037],
         [0.0048],
         [0.0297],
         [0.0719]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0345],
         [0.0354],
         [0.0355],
         [0.0193],
         [0.0094],
         [0.0107],
         [0.0161],
         [0.00


Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                                        | 1007/2000 [00:47<00:50, 19.66it/s][A
Evaluating:  50%|█████████████████████████████████████████████████████████████████████████████████████████▊                                                                                        | 1009/2000 [00:47<00:51, 19.15it/s][A


tensor([[[0.0205],
         [0.0226],
         [0.0670],
         [0.0277],
         [0.0240],
         [0.0061],
         [0.0064],
         [0.0207],
         [0.0168],
         [0.0050],
         [0.0424],
         [0.0728]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0739],
         [0.0248],
         [0.0367],
         [0.0138],
         [0.0180],
         [0.0193],
         [0.0112],
         [0.0117],
         [0.0076],
         [0.0088],
         [0.0059],
         [0.0259]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0329],
         [0.0089],
         [0.0473],
         [0.0130],
         [0.0190],
         [0.0179],
         [0.0311],
         [0.0073],
         [0.0173],
         [0.0101],
         [0.0075],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0172],
         [0.0460],
         [0.0080],
         [0.0204],
         [0.0113],
         [0.0106],
         [0.0314],
         [0.0088],
   


Evaluating:  51%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                                        | 1011/2000 [00:47<00:52, 18.92it/s][A
Evaluating:  51%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                                       | 1013/2000 [00:47<00:51, 19.08it/s][A

reg attention sum per layer
tensor([[[0.0157],
         [0.0304],
         [0.0322],
         [0.0111],
         [0.0254],
         [0.0333],
         [0.0178],
         [0.0172],
         [0.0063],
         [0.0089],
         [0.0275],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0882],
         [0.0389],
         [0.0719],
         [0.0201],
         [0.0238],
         [0.0342],
         [0.0190],
         [0.0204],
         [0.0138],
         [0.0096],
         [0.0180],
         [0.0304]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0321],
         [0.0250],
         [0.0118],
         [0.0138],
         [0.0156],
         [0.0154],
         [0.0339],
         [0.0070],
         [0.0027],
         [0.0121],
         [0.0104],
         [0.0187]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0210],
         [0.0931],
         [0.1551],
         [0.0319],
         [0.0237],
         [0.0067],
         [0.0076],
         [0.00


Evaluating:  51%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                                       | 1016/2000 [00:47<00:50, 19.67it/s][A

reg attention sum per layer
tensor([[[0.0334],
         [0.0644],
         [0.0909],
         [0.0224],
         [0.0482],
         [0.0053],
         [0.0258],
         [0.0064],
         [0.0253],
         [0.0081],
         [0.0192],
         [0.0620]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0259],
         [0.0225],
         [0.0191],
         [0.0490],
         [0.0220],
         [0.0167],
         [0.0154],
         [0.0129],
         [0.0069],
         [0.0159],
         [0.0620],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0276],
         [0.0224],
         [0.0201],
         [0.0114],
         [0.0116],
         [0.0074],
         [0.0157],
         [0.0039],
         [0.0048],
         [0.0061],
         [0.0095],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0458],
         [0.0515],
         [0.0595],
         [0.0395],
         [0.0432],
         [0.0191],
         [0.0360],
         [0.10


Evaluating:  51%|██████████████████████████████████████████████████████████████████████████████████████████▋                                                                                       | 1019/2000 [00:47<00:48, 20.06it/s][A

tensor([[[0.0221],
         [0.0236],
         [0.0628],
         [0.0111],
         [0.0143],
         [0.0026],
         [0.0183],
         [0.0051],
         [0.0688],
         [0.0130],
         [0.0076],
         [0.0927]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0144],
         [0.0156],
         [0.0078],
         [0.0086],
         [0.0047],
         [0.0087],
         [0.0123],
         [0.0031],
         [0.0046],
         [0.0091],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0062],
         [0.0188],
         [0.0658],
         [0.0092],
         [0.0467],
         [0.0197],
         [0.0043],
         [0.0080],
         [0.0063],
         [0.0024],
         [0.0221],
         [0.0187]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0063],
         [0.0118],
         [0.0098],
         [0.0061],
         [0.0230],
         [0.0183],
         [0.0035],
         [0.0054],
         [0.0004],
    


Evaluating:  51%|██████████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 1022/2000 [00:48<00:48, 20.06it/s][A
Evaluating:  51%|███████████████████████████████████████████████████████████████████████████████████████████▏                                                                                      | 1025/2000 [00:48<00:49, 19.51it/s][A

tensor([[[0.0389],
         [0.0193],
         [0.1068],
         [0.0168],
         [0.0123],
         [0.0150],
         [0.0215],
         [0.0064],
         [0.0023],
         [0.0033],
         [0.0369],
         [0.0519]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0797],
         [0.0052],
         [0.0179],
         [0.0109],
         [0.0599],
         [0.0116],
         [0.0130],
         [0.0064],
         [0.0019],
         [0.0051],
         [0.0070],
         [0.0107]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0561],
         [0.0262],
         [0.0541],
         [0.0198],
         [0.0389],
         [0.0151],
         [0.0467],
         [0.0052],
         [0.0118],
         [0.0075],
         [0.0296],
         [0.0406]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0819],
         [0.0235],
         [0.0199],
         [0.0466],
         [0.0202],
         [0.0244],
         [0.0025],
         [0.0121],
    


Evaluating:  51%|███████████████████████████████████████████████████████████████████████████████████████████▍                                                                                      | 1027/2000 [00:48<00:50, 19.38it/s][A

reg attention sum per layer
tensor([[[0.0211],
         [0.0244],
         [0.0248],
         [0.0042],
         [0.0342],
         [0.0029],
         [0.0077],
         [0.0124],
         [0.0019],
         [0.0061],
         [0.0181],
         [0.0091]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0227],
         [0.0625],
         [0.0214],
         [0.0167],
         [0.0105],
         [0.0179],
         [0.0163],
         [0.0346],
         [0.0116],
         [0.0072],
         [0.0637]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0528],
         [0.0560],
         [0.0413],
         [0.0274],
         [0.0809],
         [0.1145],
         [0.0205],
         [0.0058],
         [0.0121],
         [0.0109],
         [0.0175],
         [0.0351]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.1259],
         [0.0740],
         [0.0231],
         [0.0219],
         [0.0105],
         [0.0112],
         [0.00


Evaluating:  52%|███████████████████████████████████████████████████████████████████████████████████████████▋                                                                                      | 1030/2000 [00:48<00:49, 19.58it/s][A
Evaluating:  52%|███████████████████████████████████████████████████████████████████████████████████████████▊                                                                                      | 1032/2000 [00:48<00:49, 19.43it/s][A

reg attention sum per layer
tensor([[[0.0850],
         [0.0320],
         [0.3615],
         [0.0361],
         [0.0307],
         [0.0173],
         [0.0135],
         [0.0413],
         [0.0059],
         [0.0071],
         [0.1094],
         [0.1126]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0143],
         [0.0316],
         [0.0275],
         [0.0102],
         [0.0514],
         [0.0152],
         [0.0316],
         [0.0089],
         [0.0044],
         [0.0095],
         [0.0235],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0332],
         [0.0310],
         [0.0142],
         [0.0159],
         [0.0233],
         [0.0233],
         [0.0281],
         [0.0254],
         [0.0100],
         [0.0185],
         [0.0093],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0423],
         [0.1090],
         [0.1837],
         [0.0390],
         [0.0407],
         [0.1073],
         [0.0174],
         [0.01


Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████                                                                                      | 1035/2000 [00:48<00:48, 19.77it/s][A

tensor([[[0.0168],
         [0.0489],
         [0.0402],
         [0.0214],
         [0.0103],
         [0.0070],
         [0.0099],
         [0.0037],
         [0.0015],
         [0.0052],
         [0.0501],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0533],
         [0.1261],
         [0.0362],
         [0.0146],
         [0.0245],
         [0.0176],
         [0.0309],
         [0.0202],
         [0.0106],
         [0.0139],
         [0.0618],
         [0.0361]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0061],
         [0.0120],
         [0.0351],
         [0.0073],
         [0.0130],
         [0.0093],
         [0.0042],
         [0.0012],
         [0.0007],
         [0.0016],
         [0.0238],
         [0.0336]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0311],
         [0.0172],
         [0.0406],
         [0.0208],
         [0.0106],
         [0.0162],
         [0.0244],
         [0.0243],
         [0.0076],
    


Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                     | 1038/2000 [00:48<00:47, 20.38it/s][A
Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                     | 1041/2000 [00:49<00:46, 20.78it/s][A

tensor([[[0.0142],
         [0.0168],
         [0.0446],
         [0.0066],
         [0.0254],
         [0.0101],
         [0.0086],
         [0.0102],
         [0.0083],
         [0.0072],
         [0.0146],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0566],
         [0.0402],
         [0.0102],
         [0.0174],
         [0.0197],
         [0.0317],
         [0.0435],
         [0.0059],
         [0.0080],
         [0.0372],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0128],
         [0.0541],
         [0.0053],
         [0.0081],
         [0.0027],
         [0.0056],
         [0.0050],
         [0.0041],
         [0.0044],
         [0.0211],
         [0.0224]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0337],
         [0.0292],
         [0.0509],
         [0.0086],
         [0.0226],
         [0.0107],
         [0.0083],
         [0.0072],
         [0.0134],
    


Evaluating:  52%|████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                     | 1044/2000 [00:49<00:45, 20.95it/s][A


reg attention sum per layer
tensor([[[0.0103],
         [0.0111],
         [0.0436],
         [0.0079],
         [0.0095],
         [0.0045],
         [0.0059],
         [0.0032],
         [0.0047],
         [0.0034],
         [0.0065],
         [0.0242]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0361],
         [0.0173],
         [0.0770],
         [0.0090],
         [0.0301],
         [0.0148],
         [0.0172],
         [0.0083],
         [0.0292],
         [0.0150],
         [0.0465],
         [0.0335]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0671],
         [0.0673],
         [0.0126],
         [0.0202],
         [0.0196],
         [0.0140],
         [0.0387],
         [0.0273],
         [0.0106],
         [0.0484],
         [0.0043],
         [0.0079]]], device='cuda:0')
reg attention sum per layer



Evaluating:  52%|█████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                    | 1047/2000 [00:49<00:46, 20.63it/s][A

tensor([[[0.0400],
         [0.0184],
         [0.0192],
         [0.0091],
         [0.0123],
         [0.0040],
         [0.0217],
         [0.0048],
         [0.0036],
         [0.0096],
         [0.0092],
         [0.0093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0086],
         [0.0073],
         [0.0233],
         [0.0116],
         [0.0336],
         [0.0199],
         [0.0086],
         [0.0023],
         [0.0047],
         [0.0032],
         [0.1163],
         [0.0252]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0624],
         [0.0390],
         [0.0358],
         [0.0223],
         [0.0289],
         [0.0089],
         [0.0387],
         [0.0082],
         [0.0035],
         [0.0065],
         [0.0171],
         [0.0410]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0199],
         [0.0241],
         [0.1952],
         [0.0101],
         [0.1024],
         [0.0304],
         [0.0137],
         [0.0153],
         [0.0265],
    


Evaluating:  52%|█████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                    | 1050/2000 [00:49<00:46, 20.45it/s][A
Evaluating:  53%|█████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                    | 1053/2000 [00:49<00:45, 20.88it/s][A

tensor([[[0.1476],
         [0.0289],
         [0.0271],
         [0.0162],
         [0.0283],
         [0.0069],
         [0.0486],
         [0.0658],
         [0.0119],
         [0.0238],
         [0.0137],
         [0.0081]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0219],
         [0.0111],
         [0.0148],
         [0.0105],
         [0.0280],
         [0.0185],
         [0.0071],
         [0.0018],
         [0.0100],
         [0.0960],
         [0.0162]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0248],
         [0.0266],
         [0.0753],
         [0.0530],
         [0.0767],
         [0.0199],
         [0.0123],
         [0.0752],
         [0.0052],
         [0.0087],
         [0.0903],
         [0.0362]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0059],
         [0.0054],
         [0.0470],
         [0.0055],
         [0.0218],
         [0.0052],
         [0.0040],
         [0.0030],
         [0.0013],
    


Evaluating:  53%|█████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                    | 1056/2000 [00:49<00:45, 20.80it/s][A


reg attention sum per layer
tensor([[[0.0160],
         [0.0494],
         [0.0409],
         [0.0065],
         [0.0122],
         [0.0087],
         [0.0128],
         [0.0033],
         [0.0011],
         [0.0039],
         [0.0694],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0237],
         [0.0256],
         [0.0500],
         [0.0094],
         [0.0158],
         [0.0408],
         [0.0165],
         [0.0460],
         [0.0045],
         [0.0099],
         [0.0104],
         [0.0331]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0071],
         [0.0035],
         [0.0081],
         [0.0030],
         [0.0119],
         [0.0174],
         [0.0230],
         [0.0011],
         [0.0016],
         [0.0027],
         [0.0249],
         [0.0052]]], device='cuda:0')
reg attention sum per layer



Evaluating:  53%|██████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                   | 1059/2000 [00:49<00:44, 21.12it/s][A

tensor([[[0.0806],
         [0.0623],
         [0.0258],
         [0.0188],
         [0.0432],
         [0.0225],
         [0.0651],
         [0.0382],
         [0.0725],
         [0.0243],
         [0.0354],
         [0.1140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.0043],
         [0.0147],
         [0.0024],
         [0.0024],
         [0.0017],
         [0.0068],
         [0.0048],
         [0.0022],
         [0.0015],
         [0.0009],
         [0.0048]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0067],
         [0.0052],
         [0.0478],
         [0.0104],
         [0.0122],
         [0.0075],
         [0.0024],
         [0.0015],
         [0.0014],
         [0.0024],
         [0.0065],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0373],
         [0.0185],
         [0.0105],
         [0.0134],
         [0.0109],
         [0.0202],
         [0.0088],
         [0.0065],
    


Evaluating:  53%|██████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                   | 1062/2000 [00:50<00:44, 21.23it/s][A
Evaluating:  53%|██████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                   | 1065/2000 [00:50<00:43, 21.35it/s][A

reg attention sum per layer
tensor([[[0.0639],
         [0.0176],
         [0.0299],
         [0.0292],
         [0.0277],
         [0.0173],
         [0.0506],
         [0.0048],
         [0.0144],
         [0.0128],
         [0.0160],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0260],
         [0.0305],
         [0.0530],
         [0.0136],
         [0.0091],
         [0.0068],
         [0.0094],
         [0.0158],
         [0.0155],
         [0.0048],
         [0.0126],
         [0.0377]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0517],
         [0.0294],
         [0.0427],
         [0.0167],
         [0.0261],
         [0.0078],
         [0.0456],
         [0.0197],
         [0.0254],
         [0.0122],
         [0.0169],
         [0.0313]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0555],
         [0.0396],
         [0.0314],
         [0.0087],
         [0.0265],
         [0.0071],
         [0.0350],
         [0.01


Evaluating:  53%|███████████████████████████████████████████████████████████████████████████████████████████████                                                                                   | 1068/2000 [00:50<00:43, 21.39it/s][A


tensor([[[0.0187],
         [0.0672],
         [0.0173],
         [0.0085],
         [0.0210],
         [0.0056],
         [0.0219],
         [0.0008],
         [0.0016],
         [0.0130],
         [0.1362],
         [0.0142]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0512],
         [0.0583],
         [0.0828],
         [0.0109],
         [0.0223],
         [0.0068],
         [0.0556],
         [0.0057],
         [0.0058],
         [0.0029],
         [0.0211],
         [0.0271]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0234],
         [0.0367],
         [0.0759],
         [0.0219],
         [0.0679],
         [0.0186],
         [0.0193],
         [0.0174],
         [0.0037],
         [0.0105],
         [0.1085],
         [0.0471]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0477],
         [0.0969],
         [0.0554],
         [0.0221],
         [0.0476],
         [0.0587],
         [0.0338],
         [0.0167],
         [0.0114],
   


Evaluating:  54%|███████████████████████████████████████████████████████████████████████████████████████████████▎                                                                                  | 1071/2000 [00:50<00:43, 21.52it/s][A
Evaluating:  54%|███████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                  | 1074/2000 [00:50<00:42, 21.60it/s][A

tensor([[[0.0154],
         [0.0158],
         [0.0108],
         [0.0031],
         [0.0038],
         [0.0127],
         [0.0224],
         [0.0009],
         [0.0011],
         [0.0058],
         [0.0121],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0267],
         [0.0272],
         [0.0126],
         [0.0090],
         [0.0080],
         [0.0038],
         [0.0033],
         [0.0023],
         [0.0052],
         [0.0094],
         [0.0230]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0460],
         [0.0256],
         [0.0119],
         [0.0309],
         [0.0561],
         [0.0165],
         [0.0323],
         [0.0090],
         [0.0079],
         [0.0854],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0190],
         [0.0294],
         [0.0318],
         [0.0107],
         [0.0106],
         [0.0164],
         [0.0149],
         [0.0033],
         [0.0051],
    


Evaluating:  54%|███████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                  | 1077/2000 [00:50<00:42, 21.61it/s][A


tensor([[[0.0146],
         [0.0271],
         [0.0455],
         [0.0124],
         [0.0215],
         [0.0098],
         [0.0217],
         [0.0110],
         [0.0040],
         [0.0071],
         [0.0327],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0706],
         [0.0861],
         [0.0180],
         [0.0086],
         [0.0080],
         [0.0224],
         [0.0541],
         [0.0237],
         [0.0128],
         [0.0152],
         [0.0044],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0065],
         [0.1722],
         [0.0161],
         [0.0366],
         [0.0077],
         [0.0150],
         [0.0142],
         [0.0058],
         [0.0144],
         [0.0171],
         [0.0416]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0257],
         [0.0126],
         [0.0645],
         [0.0161],
         [0.1090],
         [0.0061],
         [0.0130],
         [0.0228],
         [0.0047],
   


Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████                                                                                  | 1080/2000 [00:50<00:42, 21.62it/s][A


reg attention sum per layer
tensor([[[0.0825],
         [0.0120],
         [0.0354],
         [0.0061],
         [0.0069],
         [0.0047],
         [0.0636],
         [0.0024],
         [0.0194],
         [0.0120],
         [0.0023],
         [0.0252]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0195],
         [0.0701],
         [0.0143],
         [0.0290],
         [0.0379],
         [0.0124],
         [0.0200],
         [0.0031],
         [0.0023],
         [0.0393],
         [0.0543]]], device='cuda:0')
reg attention sum per layer



Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                 | 1083/2000 [00:51<00:43, 21.13it/s][A
Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                 | 1086/2000 [00:51<00:44, 20.37it/s][A

tensor([[[0.1277],
         [0.0494],
         [0.0703],
         [0.0259],
         [0.0259],
         [0.0354],
         [0.0237],
         [0.0226],
         [0.0115],
         [0.0170],
         [0.0111],
         [0.0393]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1087],
         [0.0806],
         [0.1351],
         [0.0252],
         [0.0137],
         [0.0094],
         [0.0211],
         [0.0102],
         [0.0017],
         [0.0093],
         [0.1127],
         [0.1595]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0148],
         [0.0162],
         [0.0117],
         [0.0122],
         [0.0017],
         [0.0045],
         [0.0081],
         [0.0050],
         [0.0006],
         [0.0063],
         [0.0251],
         [0.0230]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0342],
         [0.0456],
         [0.0188],
         [0.0355],
         [0.0118],
         [0.0050],
         [0.0025],
         [0.0033],
    


Evaluating:  54%|████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                 | 1089/2000 [00:51<00:45, 19.87it/s][A

reg attention sum per layer
tensor([[[0.0221],
         [0.0362],
         [0.0156],
         [0.0067],
         [0.0086],
         [0.0203],
         [0.0061],
         [0.0075],
         [0.0015],
         [0.0056],
         [0.0929],
         [0.0184]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0210],
         [0.0124],
         [0.0087],
         [0.0075],
         [0.0107],
         [0.0273],
         [0.0123],
         [0.0058],
         [0.0011],
         [0.0067],
         [0.0267],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0070],
         [0.0161],
         [0.0076],
         [0.0064],
         [0.0026],
         [0.0035],
         [0.0011],
         [0.0008],
         [0.0063],
     


Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                                | 1092/2000 [00:51<00:45, 20.13it/s][A

tensor([[[0.0186],
         [0.0406],
         [0.0703],
         [0.0137],
         [0.0182],
         [0.0061],
         [0.0080],
         [0.0214],
         [0.0095],
         [0.0053],
         [0.0262],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0728],
         [0.0167],
         [0.0597],
         [0.0178],
         [0.0284],
         [0.0017],
         [0.0722],
         [0.0120],
         [0.0398],
         [0.0236],
         [0.0079],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0451],
         [0.0180],
         [0.0478],
         [0.0049],
         [0.0066],
         [0.0110],
         [0.0380],
         [0.0096],
         [0.0027],
         [0.0074],
         [0.0158],
         [0.0117]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0376],
         [0.0733],
         [0.1157],
         [0.0449],
         [0.0154],
         [0.0432],
         [0.0181],
         [0.0079],
         [0.0030],
    


Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                | 1095/2000 [00:51<00:46, 19.59it/s][A
Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                                | 1097/2000 [00:51<00:46, 19.44it/s][A

reg attention sum per layer
tensor([[[0.0132],
         [0.0049],
         [0.0185],
         [0.0102],
         [0.0039],
         [0.0096],
         [0.0114],
         [0.0017],
         [0.0006],
         [0.0048],
         [0.0514],
         [0.0351]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0136],
         [0.0106],
         [0.0224],
         [0.0236],
         [0.0129],
         [0.0058],
         [0.0128],
         [0.0121],
         [0.0022],
         [0.0021],
         [0.0015],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0100],
         [0.0072],
         [0.0037],
         [0.0142],
         [0.0031],
         [0.0119],
         [0.0034],
         [0.0021],
         [0.0048],
         [0.0064],
         [0.0057]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0323],
         [0.0318],
         [0.0182],
         [0.0093],
         [0.0119],
         [0.0131],
         [0.0108],
         [0.03


Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                                | 1099/2000 [00:51<00:47, 18.79it/s][A
Evaluating:  55%|█████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                                | 1101/2000 [00:51<00:48, 18.47it/s][A

reg attention sum per layer
tensor([[[0.0568],
         [0.0477],
         [0.1469],
         [0.0543],
         [0.1142],
         [0.0205],
         [0.0165],
         [0.0263],
         [0.0138],
         [0.0128],
         [0.0377],
         [0.0608]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1661],
         [0.0217],
         [0.0753],
         [0.0252],
         [0.0237],
         [0.0175],
         [0.0509],
         [0.0275],
         [0.0104],
         [0.0503],
         [0.0251],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0241],
         [0.0159],
         [0.0689],
         [0.0357],
         [0.0366],
         [0.0222],
         [0.0098],
         [0.0031],
         [0.0012],
         [0.0050],
         [0.0934],
         [0.0440]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0327],
         [0.0122],
         [0.0117],
         [0.0060],
         [0.0072],
         [0.0014],
         [0.0123],
         [0.00


Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                               | 1103/2000 [00:52<00:49, 18.24it/s][A
Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                               | 1105/2000 [00:52<00:49, 18.24it/s][A

reg attention sum per layer
tensor([[[0.0363],
         [0.0408],
         [0.0431],
         [0.0111],
         [0.0215],
         [0.0059],
         [0.0456],
         [0.0070],
         [0.0137],
         [0.0093],
         [0.0148],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0975],
         [0.1345],
         [0.0554],
         [0.0429],
         [0.0232],
         [0.0199],
         [0.0419],
         [0.0074],
         [0.0592],
         [0.0253],
         [0.0179],
         [0.0411]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0247],
         [0.0110],
         [0.0453],
         [0.0144],
         [0.0178],
         [0.0190],
         [0.0054],
         [0.0027],
         [0.0013],
         [0.0044],
         [0.0158],
         [0.0247]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0619],
         [0.0181],
         [0.0391],
         [0.0168],
         [0.0376],
         [0.0196],
         [0.0163],
         [0.01


Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                               | 1107/2000 [00:52<00:48, 18.34it/s][A
Evaluating:  55%|██████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                               | 1109/2000 [00:52<00:48, 18.56it/s][A

reg attention sum per layer
tensor([[[0.2263],
         [0.1961],
         [0.0539],
         [0.0129],
         [0.0433],
         [0.0118],
         [0.1011],
         [0.0080],
         [0.0293],
         [0.0815],
         [0.0299],
         [0.0479]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0210],
         [0.0421],
         [0.0559],
         [0.0158],
         [0.0294],
         [0.0114],
         [0.0077],
         [0.0056],
         [0.0020],
         [0.0024],
         [0.0416],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0101],
         [0.0516],
         [0.0094],
         [0.0069],
         [0.0024],
         [0.0182],
         [0.0085],
         [0.0140],
         [0.0025],
         [0.0119],
         [0.0310]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1088],
         [0.1102],
         [0.0497],
         [0.0185],
         [0.0249],
         [0.0064],
         [0.0356],
         [0.02


Evaluating:  56%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                               | 1111/2000 [00:52<00:48, 18.46it/s][A
Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                                               | 1113/2000 [00:52<00:47, 18.49it/s][A

reg attention sum per layer
tensor([[[0.0554],
         [0.0219],
         [0.0608],
         [0.0295],
         [0.0243],
         [0.0203],
         [0.0075],
         [0.0069],
         [0.0008],
         [0.0030],
         [0.0291],
         [0.0623]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0291],
         [0.0553],
         [0.1984],
         [0.0193],
         [0.0362],
         [0.0047],
         [0.0196],
         [0.0445],
         [0.0349],
         [0.0068],
         [0.0277],
         [0.0507]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0543],
         [0.0165],
         [0.0542],
         [0.0224],
         [0.0454],
         [0.0097],
         [0.0092],
         [0.0082],
         [0.0129],
         [0.0068],
         [0.0260],
         [0.0504]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0190],
         [0.1323],
         [0.0133],
         [0.0110],
         [0.0066],
         [0.0132],
         [0.00


Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                              | 1115/2000 [00:52<00:46, 18.88it/s][A
Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                              | 1117/2000 [00:52<00:46, 19.05it/s][A

reg attention sum per layer
tensor([[[0.0150],
         [0.0175],
         [0.0148],
         [0.0041],
         [0.0271],
         [0.0302],
         [0.0171],
         [0.0189],
         [0.0059],
         [0.0053],
         [0.0540],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0094],
         [0.0183],
         [0.0270],
         [0.0040],
         [0.0076],
         [0.0028],
         [0.0113],
         [0.0030],
         [0.0141],
         [0.0081],
         [0.0050],
         [0.0051]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0426],
         [0.0417],
         [0.0430],
         [0.0160],
         [0.0450],
         [0.0198],
         [0.0228],
         [0.0325],
         [0.0126],
         [0.0258],
         [0.0149],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1267],
         [0.0313],
         [0.0328],
         [0.0083],
         [0.0313],
         [0.0137],
         [0.0568],
         [0.01


Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                              | 1119/2000 [00:52<00:46, 18.91it/s][A
Evaluating:  56%|███████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                              | 1121/2000 [00:53<00:46, 18.96it/s][A

reg attention sum per layer
tensor([[[0.0108],
         [0.0175],
         [0.1180],
         [0.0148],
         [0.0472],
         [0.0164],
         [0.0103],
         [0.0036],
         [0.0035],
         [0.0059],
         [0.1034],
         [0.0495]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0371],
         [0.0349],
         [0.0351],
         [0.0106],
         [0.0055],
         [0.0048],
         [0.0144],
         [0.0126],
         [0.0017],
         [0.0060],
         [0.0267],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0375],
         [0.0176],
         [0.0161],
         [0.0071],
         [0.0189],
         [0.0030],
         [0.0120],
         [0.0080],
         [0.0049],
         [0.0074],
         [0.0093],
         [0.0089]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0375],
         [0.0762],
         [0.0515],
         [0.0086],
         [0.0074],
         [0.0256],
         [0.0174],
         [0.00


Evaluating:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████                                                                              | 1124/2000 [00:53<00:44, 19.67it/s][A

tensor([[[0.0201],
         [0.0088],
         [0.0127],
         [0.0036],
         [0.0118],
         [0.0155],
         [0.0100],
         [0.0058],
         [0.0072],
         [0.0060],
         [0.0112],
         [0.0126]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0284],
         [0.0211],
         [0.0210],
         [0.0156],
         [0.0389],
         [0.0259],
         [0.0174],
         [0.0111],
         [0.0287],
         [0.0109],
         [0.0126],
         [0.0127]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0675],
         [0.0286],
         [0.0445],
         [0.0116],
         [0.0164],
         [0.0090],
         [0.0138],
         [0.0095],
         [0.0069],
         [0.0146],
         [0.0137],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0855],
         [0.0504],
         [0.0951],
         [0.0314],
         [0.0644],
         [0.0226],
         [0.0209],
         [0.0510],
         [0.0079],
    


Evaluating:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 1127/2000 [00:53<00:42, 20.35it/s][A
Evaluating:  56%|████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                             | 1130/2000 [00:53<00:42, 20.26it/s][A

reg attention sum per layer
tensor([[[0.0072],
         [0.0127],
         [0.0590],
         [0.0119],
         [0.0092],
         [0.0044],
         [0.0080],
         [0.0039],
         [0.0048],
         [0.0023],
         [0.0509],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0132],
         [0.0137],
         [0.0289],
         [0.0038],
         [0.0078],
         [0.0031],
         [0.0149],
         [0.0025],
         [0.0044],
         [0.0045],
         [0.0114],
         [0.0055]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0934],
         [0.0645],
         [0.0457],
         [0.0374],
         [0.0217],
         [0.0060],
         [0.0129],
         [0.0191],
         [0.0048],
         [0.0151],
         [0.0170],
         [0.0416]]], device='cuda:0')
reg attention sum per layer



Evaluating:  57%|████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                             | 1133/2000 [00:53<00:42, 20.24it/s][A

tensor([[[0.4691],
         [0.0373],
         [0.0159],
         [0.0227],
         [0.0805],
         [0.0206],
         [0.0695],
         [0.0233],
         [0.0218],
         [0.0696],
         [0.0025],
         [0.0108]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0654],
         [0.1293],
         [0.0491],
         [0.0433],
         [0.0209],
         [0.0148],
         [0.0361],
         [0.0162],
         [0.0055],
         [0.0086],
         [0.0115],
         [0.0586]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0253],
         [0.0317],
         [0.1593],
         [0.0386],
         [0.0264],
         [0.0186],
         [0.0316],
         [0.0027],
         [0.0085],
         [0.0121],
         [0.0299],
         [0.0757]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0503],
         [0.0394],
         [0.0200],
         [0.0099],
         [0.0054],
         [0.0087],
         [0.0339],
         [0.0063],
         [0.0034],
    


Evaluating:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████                                                                             | 1136/2000 [00:53<00:42, 20.39it/s][A

tensor([[[0.0038],
         [0.0039],
         [0.0082],
         [0.0010],
         [0.0040],
         [0.0034],
         [0.0045],
         [0.0009],
         [0.0005],
         [0.0008],
         [0.0048],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0833],
         [0.0751],
         [0.1486],
         [0.0708],
         [0.0644],
         [0.0356],
         [0.0331],
         [0.0135],
         [0.0067],
         [0.0060],
         [0.0607],
         [0.0503]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0154],
         [0.0127],
         [0.0091],
         [0.0149],
         [0.0070],
         [0.0126],
         [0.0089],
         [0.0057],
         [0.0023],
         [0.0086],
         [0.0223],
         [0.02


Evaluating:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                            | 1139/2000 [00:53<00:41, 20.75it/s][A
Evaluating:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                            | 1142/2000 [00:54<00:40, 21.10it/s][A

tensor([[[0.0463],
         [0.0429],
         [0.0313],
         [0.0146],
         [0.0321],
         [0.0389],
         [0.0197],
         [0.0387],
         [0.0117],
         [0.0152],
         [0.0517],
         [0.0372]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0293],
         [0.0325],
         [0.0751],
         [0.0331],
         [0.2025],
         [0.0262],
         [0.0138],
         [0.0213],
         [0.0083],
         [0.0188],
         [0.1228],
         [0.0570]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0756],
         [0.0179],
         [0.0184],
         [0.0256],
         [0.0363],
         [0.0094],
         [0.0792],
         [0.0473],
         [0.0260],
         [0.0202],
         [0.0052],
         [0.0100]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0354],
         [0.0378],
         [0.0419],
         [0.0300],
         [0.0293],
         [0.0046],
         [0.0156],
         [0.0068],
         [0.0022],
    


Evaluating:  57%|█████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                            | 1145/2000 [00:54<00:40, 21.35it/s][A


reg attention sum per layer
tensor([[[0.0355],
         [0.0261],
         [0.0368],
         [0.0119],
         [0.0067],
         [0.0054],
         [0.0195],
         [0.0121],
         [0.0175],
         [0.0099],
         [0.0191],
         [0.1373]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0754],
         [0.0244],
         [0.0304],
         [0.0051],
         [0.0154],
         [0.0157],
         [0.0197],
         [0.0067],
         [0.0042],
         [0.0090],
         [0.0089],
         [0.0242]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0359],
         [0.0404],
         [0.0154],
         [0.0103],
         [0.0213],
         [0.0108],
         [0.0036],
         [0.0075],
         [0.0070],
         [0.0237],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0109],
         [0.0136],
         [0.0195],
         [0.0183],
         [0.0187],
         [0.0064],
         [0.0265],
         [0.0


Evaluating:  57%|██████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                           | 1148/2000 [00:54<00:39, 21.48it/s][A
Evaluating:  58%|██████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                           | 1151/2000 [00:54<00:39, 21.39it/s][A

reg attention sum per layer
tensor([[[0.0279],
         [0.0425],
         [0.0226],
         [0.0146],
         [0.0308],
         [0.0060],
         [0.0207],
         [0.0099],
         [0.0118],
         [0.0157],
         [0.0074],
         [0.0131]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1146],
         [0.0830],
         [0.0051],
         [0.0218],
         [0.0077],
         [0.0052],
         [0.0255],
         [0.0055],
         [0.0020],
         [0.0123],
         [0.0101],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0019],
         [0.0023],
         [0.0026],
         [0.0016],
         [0.0039],
         [0.0023],
         [0.0017],
         [0.0024],
         [0.0005],
         [0.0006],
         [0.0037],
         [0.0024]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0144],
         [0.0148],
         [0.0070],
         [0.0044],
         [0.0070],
         [0.0009],
         [0.00


Evaluating:  58%|██████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                           | 1154/2000 [00:54<00:39, 21.51it/s][A

tensor([[[0.0709],
         [0.2065],
         [0.0384],
         [0.0113],
         [0.0150],
         [0.0176],
         [0.0833],
         [0.0408],
         [0.0161],
         [0.0126],
         [0.0102],
         [0.0256]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0390],
         [0.0106],
         [0.0088],
         [0.0080],
         [0.0125],
         [0.0103],
         [0.0253],
         [0.0084],
         [0.0086],
         [0.0057],
         [0.0028],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0086],
         [0.0258],
         [0.0467],
         [0.0028],
         [0.0037],
         [0.0027],
         [0.0194],
         [0.0053],
         [0.0052],
         [0.0066],
         [0.0293],
         [0.0107]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0814],
         [0.0300],
         [0.0126],
         [0.0320],
         [0.0134],
         [0.0055],
         [0.0481],
         [0.0469],
         [0.0130],
    


Evaluating:  58%|██████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                           | 1157/2000 [00:54<00:39, 21.32it/s][A

tensor([[[0.0026],
         [0.0017],
         [0.0280],
         [0.0014],
         [0.0046],
         [0.0089],
         [0.0075],
         [0.0012],
         [0.0012],
         [0.0014],
         [0.0127],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0490],
         [0.0133],
         [0.0821],
         [0.0172],
         [0.0244],
         [0.0060],
         [0.0068],
         [0.0072],
         [0.0072],
         [0.0065],
         [0.0171],
         [0.0467]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0113],
         [0.0098],
         [0.0134],
         [0.0077],
         [0.0043],
         [0.0143],
         [0.0125],
         [0.0037],
         [0.0011],
         [0.0019],
         [0.1180],
         [0.0183]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0198],
         [0.0453],
         [0.0273],
         [0.0228],
         [0.0217],
         [0.0335],
         [0.0075],
         [0.0024],
    


Evaluating:  58%|███████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                          | 1160/2000 [00:54<00:39, 21.19it/s][A
Evaluating:  58%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                          | 1163/2000 [00:54<00:39, 21.37it/s][A

tensor([[[0.0192],
         [0.0672],
         [0.0962],
         [0.0132],
         [0.0128],
         [0.0089],
         [0.0590],
         [0.0615],
         [0.0093],
         [0.0099],
         [0.0212],
         [0.0487]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0694],
         [0.0669],
         [0.0050],
         [0.0136],
         [0.0023],
         [0.0059],
         [0.0188],
         [0.0045],
         [0.0007],
         [0.0048],
         [0.0112],
         [0.0094]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0221],
         [0.0349],
         [0.0377],
         [0.0223],
         [0.0373],
         [0.0155],
         [0.0149],
         [0.0039],
         [0.0018],
         [0.0100],
         [0.0388],
         [0.0517]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0170],
         [0.1010],
         [0.0141],
         [0.0114],
         [0.0066],
         [0.0115],
         [0.0073],
         [0.0024],
    


Evaluating:  58%|███████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                          | 1166/2000 [00:55<00:38, 21.45it/s][A

tensor([[[0.0457],
         [0.0171],
         [0.1076],
         [0.0344],
         [0.0918],
         [0.0567],
         [0.0247],
         [0.0757],
         [0.0155],
         [0.0210],
         [0.0364],
         [0.0284]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0356],
         [0.0320],
         [0.0317],
         [0.0082],
         [0.0095],
         [0.0070],
         [0.0089],
         [0.0181],
         [0.0035],
         [0.0095],
         [0.0083],
         [0.0267]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0025],
         [0.0100],
         [0.0073],
         [0.0021],
         [0.0047],
         [0.0094],
         [0.0021],
         [0.0056],
         [0.0004],
         [0.0010],
         [0.0108],
         [0.0130]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0271],
         [0.1085],
         [0.0263],
         [0.0096],
         [0.0076],
         [0.0074],
         [0.0084],
         [0.0146],
         [0.0039],
    


Evaluating:  58%|████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                          | 1169/2000 [00:55<00:39, 20.79it/s][A


tensor([[[0.0405],
         [0.0390],
         [0.0311],
         [0.0077],
         [0.0084],
         [0.0051],
         [0.0308],
         [0.0048],
         [0.0044],
         [0.0048],
         [0.0138],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0815],
         [0.0379],
         [0.0224],
         [0.0286],
         [0.0242],
         [0.0062],
         [0.0182],
         [0.0165],
         [0.0068],
         [0.0152],
         [0.0177],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0321],
         [0.0233],
         [0.0176],
         [0.0213],
         [0.0178],
         [0.0173],
         [0.0098],
         [0.0659],
         [0.0030],
         [0.0037],
         [0.0721],
         [0.0324]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0014],
         [0.0043],
         [0.0058],
         [0.0010],
         [0.0043],
         [0.0045],
         [0.0024],
         [0.0008],
         [0.0044],
   


Evaluating:  59%|████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                         | 1172/2000 [00:55<00:39, 20.95it/s][A
Evaluating:  59%|████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                         | 1175/2000 [00:55<00:38, 21.29it/s][A

tensor([[[0.1540],
         [0.0290],
         [0.0408],
         [0.0094],
         [0.0084],
         [0.0142],
         [0.0234],
         [0.0109],
         [0.0039],
         [0.0182],
         [0.0043],
         [0.0129]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0288],
         [0.0249],
         [0.0131],
         [0.0222],
         [0.0097],
         [0.0065],
         [0.0239],
         [0.0053],
         [0.0014],
         [0.0061],
         [0.1032],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0328],
         [0.1472],
         [0.0183],
         [0.0474],
         [0.0065],
         [0.0210],
         [0.0097],
         [0.0269],
         [0.0096],
         [0.0213],
         [0.0427]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0422],
         [0.0274],
         [0.0619],
         [0.0168],
         [0.0216],
         [0.0102],
         [0.0257],
         [0.0115],
         [0.0105],
    


Evaluating:  59%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                         | 1178/2000 [00:55<00:38, 21.35it/s][A


reg attention sum per layer
tensor([[[0.1450],
         [0.0548],
         [0.0798],
         [0.0365],
         [0.1058],
         [0.0170],
         [0.0680],
         [0.0569],
         [0.0661],
         [0.0725],
         [0.0168],
         [0.0780]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0579],
         [0.0202],
         [0.1381],
         [0.0198],
         [0.0831],
         [0.0226],
         [0.0321],
         [0.0253],
         [0.0208],
         [0.0207],
         [0.0199],
         [0.0776]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0314],
         [0.0459],
         [0.0673],
         [0.0101],
         [0.0386],
         [0.0214],
         [0.0177],
         [0.0089],
         [0.0029],
         [0.0029],
         [0.0317],
         [0.0251]]], device='cuda:0')
reg attention sum per layer



Evaluating:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                         | 1181/2000 [00:55<00:39, 20.56it/s][A

tensor([[[0.0113],
         [0.0172],
         [0.0406],
         [0.0092],
         [0.0363],
         [0.0040],
         [0.0133],
         [0.0045],
         [0.0040],
         [0.0076],
         [0.0515],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0129],
         [0.0367],
         [0.5005],
         [0.0874],
         [0.1020],
         [0.0082],
         [0.0113],
         [0.0068],
         [0.0196],
         [0.0059],
         [0.1381],
         [0.3629]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0174],
         [0.0291],
         [0.0308],
         [0.0105],
         [0.0116],
         [0.0151],
         [0.0207],
         [0.0038],
         [0.0022],
         [0.0185],
         [0.0215],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0281],
         [0.0219],
         [0.0892],
         [0.0078],
         [0.0278],
         [0.0123],
         [0.0100],
         [0.0066],
         [0.0058],
    


Evaluating:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                        | 1184/2000 [00:56<00:40, 20.26it/s][A
Evaluating:  59%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                        | 1187/2000 [00:56<00:39, 20.70it/s][A

tensor([[[0.0630],
         [0.0818],
         [0.0692],
         [0.0181],
         [0.0352],
         [0.0090],
         [0.0752],
         [0.0244],
         [0.0094],
         [0.0244],
         [0.0143],
         [0.0278]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0422],
         [0.0379],
         [0.0254],
         [0.0261],
         [0.0321],
         [0.0243],
         [0.0374],
         [0.0112],
         [0.0102],
         [0.0137],
         [0.0302],
         [0.0312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0540],
         [0.0284],
         [0.0233],
         [0.0253],
         [0.0564],
         [0.0119],
         [0.0268],
         [0.0159],
         [0.0101],
         [0.0139],
         [0.0286],
         [0.0435]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1315],
         [0.0325],
         [0.0227],
         [0.0255],
         [0.0212],
         [0.0842],
         [0.0426],
         [0.0168],
         [0.0031],
    


Evaluating:  60%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                        | 1190/2000 [00:56<00:38, 21.01it/s][A

tensor([[[0.0287],
         [0.0596],
         [0.0670],
         [0.0077],
         [0.0169],
         [0.0167],
         [0.0290],
         [0.0650],
         [0.0175],
         [0.0106],
         [0.0187],
         [0.0523]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0279],
         [0.0257],
         [0.0227],
         [0.0387],
         [0.0219],
         [0.0231],
         [0.0154],
         [0.0222],
         [0.0086],
         [0.0110],
         [0.0102],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0533],
         [0.0507],
         [0.0231],
         [0.0312],
         [0.0196],
         [0.0299],
         [0.0167],
         [0.0082],
         [0.0147],
         [0.0871],
         [0.0456]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0127],
         [0.0269],
         [0.0217],
         [0.0090],
         [0.0079],
         [0.0103],
         [0.0075],
         [0.0109],
         [0.0050],
    


Evaluating:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                       | 1193/2000 [00:56<00:37, 21.33it/s][A

tensor([[[0.0150],
         [0.0278],
         [0.0390],
         [0.0138],
         [0.0137],
         [0.0232],
         [0.0216],
         [0.0097],
         [0.0031],
         [0.0030],
         [0.0600],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0217],
         [0.0143],
         [0.1575],
         [0.0299],
         [0.0097],
         [0.0224],
         [0.0172],
         [0.0048],
         [0.0022],
         [0.0106],
         [0.0794],
         [0.0765]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0357],
         [0.0548],
         [0.0577],
         [0.0256],
         [0.0184],
         [0.0072],
         [0.0076],
         [0.0110],
         [0.0027],
         [0.0026],
         [0.0477],
         [0.0734]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0283],
         [0.0609],
         [0.0055],
         [0.0045],
         [0.0169],
         [0.0175],
         [0.0041],
         [0.0010],
    


Evaluating:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                       | 1196/2000 [00:56<00:37, 21.33it/s][A
Evaluating:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                       | 1199/2000 [00:56<00:37, 21.42it/s][A


reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0390],
         [0.0782],
         [0.0181],
         [0.0133],
         [0.0224],
         [0.0252],
         [0.0178],
         [0.0049],
         [0.0052],
         [0.0932],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0370],
         [0.0256],
         [0.0687],
         [0.0289],
         [0.0313],
         [0.0106],
         [0.0110],
         [0.0130],
         [0.0075],
         [0.0070],
         [0.0514],
         [0.0451]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0175],
         [0.0128],
         [0.1194],
         [0.0263],
         [0.0300],
         [0.0091],
         [0.0022],
         [0.0152],
         [0.0111],
         [0.0038],
    


Evaluating:  60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                       | 1202/2000 [00:56<00:36, 21.58it/s][A


reg attention sum per layer
tensor([[[0.0368],
         [0.0164],
         [0.0108],
         [0.0065],
         [0.0106],
         [0.0098],
         [0.0119],
         [0.0213],
         [0.0026],
         [0.0053],
         [0.0046],
         [0.0181]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0340],
         [0.0560],
         [0.0311],
         [0.0078],
         [0.0079],
         [0.0286],
         [0.0162],
         [0.0097],
         [0.0013],
         [0.0068],
         [0.0080],
         [0.0378]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0181],
         [0.0355],
         [0.0980],
         [0.0137],
         [0.0729],
         [0.0266],
         [0.0290],
         [0.0235],
         [0.0177],
         [0.0125],
         [0.0759],
         [0.0458]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0042],
         [0.0049],
         [0.0025],
         [0.0006],
         [0.0010],
         [0.0005],
         [0.0139],
         [0.0


Evaluating:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                      | 1205/2000 [00:56<00:36, 21.65it/s][A
Evaluating:  60%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                      | 1208/2000 [00:57<00:36, 21.64it/s][A

reg attention sum per layer
tensor([[[0.0585],
         [0.1336],
         [0.0498],
         [0.0182],
         [0.0301],
         [0.0087],
         [0.0481],
         [0.0509],
         [0.0200],
         [0.0177],
         [0.0268],
         [0.0366]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0172],
         [0.0212],
         [0.0128],
         [0.0175],
         [0.0022],
         [0.0055],
         [0.0084],
         [0.0049],
         [0.0086],
         [0.0069],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0328],
         [0.0474],
         [0.0535],
         [0.0308],
         [0.0342],
         [0.0174],
         [0.0140],
         [0.0499],
         [0.0092],
         [0.0094],
         [0.0532],
         [0.0362]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0321],
         [0.0838],
         [0.0452],
         [0.0205],
         [0.0185],
         [0.0149],
         [0.0137],
         [0.00


Evaluating:  61%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                      | 1211/2000 [00:57<00:36, 21.60it/s][A

tensor([[[0.0268],
         [0.0156],
         [0.0048],
         [0.0078],
         [0.0031],
         [0.0130],
         [0.0048],
         [0.0016],
         [0.0010],
         [0.0038],
         [0.0067],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0143],
         [0.0389],
         [0.1245],
         [0.0288],
         [0.0351],
         [0.0143],
         [0.0042],
         [0.0176],
         [0.0038],
         [0.0063],
         [0.0389],
         [0.1425]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.0109],
         [0.0153],
         [0.0047],
         [0.0111],
         [0.0114],
         [0.0149],
         [0.0033],
         [0.0025],
         [0.0075],
         [0.0116],
         [0.0225]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0676],
         [0.0285],
         [0.0335],
         [0.0152],
         [0.0148],
         [0.0071],
         [0.0329],
         [0.0398],
         [0.0069],
    


Evaluating:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                      | 1214/2000 [00:57<00:36, 21.66it/s][A


tensor([[[0.0161],
         [0.0164],
         [0.1043],
         [0.0214],
         [0.0473],
         [0.0174],
         [0.0166],
         [0.0033],
         [0.0075],
         [0.0070],
         [0.0264],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0176],
         [0.0245],
         [0.0125],
         [0.0113],
         [0.0236],
         [0.0054],
         [0.0274],
         [0.0220],
         [0.0080],
         [0.0063],
         [0.0170],
         [0.0054]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1012],
         [0.0317],
         [0.0221],
         [0.0114],
         [0.0104],
         [0.0136],
         [0.0304],
         [0.0149],
         [0.0024],
         [0.0125],
         [0.0149],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0299],
         [0.0455],
         [0.0085],
         [0.0158],
         [0.0211],
         [0.0165],
         [0.0027],
         [0.0036],
    

Evaluating:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                     | 1217/2000 [00:57<00:35, 21.79it/s][A
Evaluating:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                     | 1220/2000 [00:57<00:35, 21.79it/s][A

reg attention sum per layer
tensor([[[0.0099],
         [0.0165],
         [0.0169],
         [0.0034],
         [0.0074],
         [0.0065],
         [0.0022],
         [0.0037],
         [0.0004],
         [0.0014],
         [0.0213],
         [0.0119]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0117],
         [0.0104],
         [0.0288],
         [0.0111],
         [0.0071],
         [0.0165],
         [0.0146],
         [0.0042],
         [0.0005],
         [0.0028],
         [0.1078],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1582],
         [0.0439],
         [0.0544],
         [0.0451],
         [0.0404],
         [0.0124],
         [0.0342],
         [0.0276],
         [0.0303],
         [0.0228],
         [0.0073],
         [0.0640]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0336],
         [0.1502],
         [0.0515],
         [0.0080],
         [0.0442],
         [0.0093],
         [0.0935],
         [0.02


Evaluating:  61%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 1223/2000 [00:57<00:36, 21.42it/s][A

tensor([[[0.1242],
         [0.0800],
         [0.0340],
         [0.0337],
         [0.0550],
         [0.0170],
         [0.0399],
         [0.1176],
         [0.0211],
         [0.0325],
         [0.0173],
         [0.0322]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0485],
         [0.0507],
         [0.0257],
         [0.0157],
         [0.0538],
         [0.0434],
         [0.0369],
         [0.0185],
         [0.0061],
         [0.0097],
         [0.0326],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0639],
         [0.0438],
         [0.0629],
         [0.0260],
         [0.0271],
         [0.0119],
         [0.0251],
         [0.0340],
         [0.0369],
         [0.0088],
         [0.0115],
         [0.0686]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0357],
         [0.0378],
         [0.0105],
         [0.0283],
         [0.0107],
         [0.0062],
         [0.0027],
         [0.0010],
    


Evaluating:  61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                     | 1226/2000 [00:57<00:36, 21.08it/s][A
Evaluating:  61%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 1229/2000 [00:58<00:36, 20.89it/s][A

reg attention sum per layer
tensor([[[0.0066],
         [0.0184],
         [0.0126],
         [0.0020],
         [0.0023],
         [0.0032],
         [0.0042],
         [0.0012],
         [0.0004],
         [0.0005],
         [0.0069],
         [0.0093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0637],
         [0.0457],
         [0.0186],
         [0.0372],
         [0.0103],
         [0.0444],
         [0.0101],
         [0.0103],
         [0.0146],
         [0.0056],
         [0.0427]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0578],
         [0.0648],
         [0.1181],
         [0.0104],
         [0.0266],
         [0.0048],
         [0.0173],
         [0.0038],
         [0.0053],
         [0.0037],
         [0.0351],
         [0.0802]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0307],
         [0.0294],
         [0.0108],
         [0.0159],
         [0.0368],
         [0.0275],
         [0.0081],
         [0.00


Evaluating:  62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 1232/2000 [00:58<00:36, 20.85it/s][A

tensor([[[0.0232],
         [0.0595],
         [0.0508],
         [0.0302],
         [0.0424],
         [0.0540],
         [0.0524],
         [0.0107],
         [0.0099],
         [0.0129],
         [0.1313],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0368],
         [0.0202],
         [0.0198],
         [0.0156],
         [0.0119],
         [0.0097],
         [0.0185],
         [0.0645],
         [0.0092],
         [0.0135],
         [0.0060],
         [0.0378]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0508],
         [0.0342],
         [0.0137],
         [0.0095],
         [0.0100],
         [0.0118],
         [0.0281],
         [0.0011],
         [0.0012],
         [0.0088],
         [0.0246],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1506],
         [0.0492],
         [0.0458],
         [0.0307],
         [0.0137],
         [0.0114],
         [0.0266],
         [0.0142],
         [0.0024],
    


Evaluating:  62%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 1235/2000 [00:58<00:36, 20.82it/s][A

tensor([[[0.0197],
         [0.0399],
         [0.0207],
         [0.0073],
         [0.0086],
         [0.0036],
         [0.0153],
         [0.0044],
         [0.0064],
         [0.0037],
         [0.0146],
         [0.0250]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1071],
         [0.1814],
         [0.0350],
         [0.0305],
         [0.0157],
         [0.0168],
         [0.1244],
         [0.1092],
         [0.0087],
         [0.0347],
         [0.0074],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0077],
         [0.0160],
         [0.0110],
         [0.0067],
         [0.0026],
         [0.0115],
         [0.0059],
         [0.0049],
         [0.0006],
         [0.0013],
         [0.0154],
         [0.0292]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0116],
         [0.0165],
         [0.0015],
         [0.0168],
         [0.0092],
         [0.0128],
         [0.0013],
         [0.0036],
    


Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                                   | 1238/2000 [00:58<00:36, 20.97it/s][A
Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                   | 1241/2000 [00:58<00:36, 21.05it/s][A

tensor([[[0.0323],
         [0.0298],
         [0.1567],
         [0.0478],
         [0.0640],
         [0.0287],
         [0.0048],
         [0.0053],
         [0.0030],
         [0.0041],
         [0.2053],
         [0.1218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0352],
         [0.0455],
         [0.0814],
         [0.0852],
         [0.0925],
         [0.0191],
         [0.0343],
         [0.0129],
         [0.0151],
         [0.0059],
         [0.0653],
         [0.0686]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0136],
         [0.0313],
         [0.0534],
         [0.0061],
         [0.0106],
         [0.0068],
         [0.0109],
         [0.0063],
         [0.0071],
         [0.0050],
         [0.0102],
         [0.02


Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                   | 1244/2000 [00:58<00:35, 21.18it/s][A

tensor([[[0.0051],
         [0.0076],
         [0.0395],
         [0.0049],
         [0.0098],
         [0.0056],
         [0.0180],
         [0.0053],
         [0.0062],
         [0.0061],
         [0.0209],
         [0.0417]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0183],
         [0.0116],
         [0.0629],
         [0.0087],
         [0.0272],
         [0.0087],
         [0.0088],
         [0.0037],
         [0.0153],
         [0.0102],
         [0.0135],
         [0.0171]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0451],
         [0.0426],
         [0.0565],
         [0.0136],
         [0.0441],
         [0.0081],
         [0.0080],
         [0.0586],
         [0.0038],
         [0.0131],
         [0.0253],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0593],
         [0.0544],
         [0.0419],
         [0.0118],
         [0.0332],
         [0.0071],
         [0.0197],
         [0.0081],
         [0.0073],
    


Evaluating:  62%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                   | 1247/2000 [00:58<00:35, 21.36it/s][A

tensor([[[0.0097],
         [0.0816],
         [0.0056],
         [0.0037],
         [0.0009],
         [0.0043],
         [0.0256],
         [0.0006],
         [0.0018],
         [0.0044],
         [0.0500],
         [0.0061]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0325],
         [0.0973],
         [0.0221],
         [0.0156],
         [0.0127],
         [0.0173],
         [0.0197],
         [0.0184],
         [0.0049],
         [0.0323],
         [0.0506]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0748],
         [0.0407],
         [0.2082],
         [0.0607],
         [0.1218],
         [0.0723],
         [0.0794],
         [0.0077],
         [0.0036],
         [0.0066],
         [0.5351],
         [0.2393]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0210],
         [0.0189],
         [0.0491],
         [0.0181],
         [0.0211],
         [0.0047],
         [0.0136],
         [0.0044],
         [0.0031],
    


Evaluating:  62%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                  | 1250/2000 [00:59<00:35, 20.91it/s][A
Evaluating:  63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 1253/2000 [00:59<00:35, 20.90it/s][A

tensor([[[0.1273],
         [0.0714],
         [0.0397],
         [0.0158],
         [0.0572],
         [0.0426],
         [0.0262],
         [0.0365],
         [0.0130],
         [0.0467],
         [0.0201],
         [0.0354]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0024],
         [0.0053],
         [0.0110],
         [0.0012],
         [0.0017],
         [0.0044],
         [0.0045],
         [0.0012],
         [0.0012],
         [0.0012],
         [0.0029],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0182],
         [0.0203],
         [0.0073],
         [0.0090],
         [0.0019],
         [0.0435],
         [0.0125],
         [0.0035],
         [0.0077],
         [0.0128],
         [0.0263]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0354],
         [0.0677],
         [0.0643],
         [0.0154],
         [0.0379],
         [0.0285],
         [0.0123],
         [0.0312],
         [0.0026],
    


Evaluating:  63%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                  | 1256/2000 [00:59<00:35, 20.98it/s][A

tensor([[[0.0619],
         [0.0316],
         [0.0340],
         [0.0228],
         [0.0446],
         [0.0099],
         [0.0436],
         [0.0353],
         [0.0269],
         [0.0257],
         [0.0122],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0118],
         [0.0152],
         [0.0532],
         [0.0106],
         [0.0152],
         [0.0157],
         [0.0219],
         [0.0090],
         [0.0178],
         [0.0141],
         [0.0263],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0326],
         [0.0646],
         [0.0303],
         [0.0417],
         [0.0433],
         [0.0495],
         [0.0076],
         [0.0344],
         [0.0043],
         [0.0199],
         [0.0531],
         [0.0480]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0290],
         [0.0623],
         [0.0544],
         [0.0148],
         [0.0137],
         [0.0070],
         [0.0054],
         [0.0006],
    


Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                  | 1259/2000 [00:59<00:34, 21.22it/s][A
Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 1262/2000 [00:59<00:34, 21.44it/s][A


reg attention sum per layer
tensor([[[0.0122],
         [0.0106],
         [0.0415],
         [0.0073],
         [0.0063],
         [0.0049],
         [0.0166],
         [0.0057],
         [0.0066],
         [0.0094],
         [0.0134],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1014],
         [0.1203],
         [0.0632],
         [0.0202],
         [0.0123],
         [0.0139],
         [0.0867],
         [0.0322],
         [0.0200],
         [0.0158],
         [0.0113],
         [0.0315]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0076],
         [0.0168],
         [0.0198],
         [0.0043],
         [0.0056],
         [0.0024],
         [0.0214],
         [0.0026],
         [0.0054],
         [0.0085],
         [0.0093],
         [0.0169]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.0427],
         [0.1503],
         [0.0306],
         [0.0571],
         [0.0153],
         [0.0425],
         [0.0


Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                 | 1265/2000 [00:59<00:34, 21.50it/s][A

reg attention sum per layer
tensor([[[0.0548],
         [0.0691],
         [0.1292],
         [0.0266],
         [0.0569],
         [0.0630],
         [0.1116],
         [0.0461],
         [0.0212],
         [0.0229],
         [0.0899],
         [0.0951]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.1330],
         [0.0593],
         [0.0157],
         [0.0141],
         [0.0049],
         [0.0257],
         [0.0090],
         [0.0094],
         [0.0073],
         [0.0500],
         [0.3711]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0168],
         [0.0491],
         [0.0095],
         [0.0199],
         [0.0350],
         [0.0212],
         [0.0032],
         [0.0041],
         [0.0048],
         [0.0351],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0237],
         [0.0351],
         [0.0068],
         [0.0069],
         [0.0100],
         [0.0021],
         [0.00


Evaluating:  63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                 | 1268/2000 [00:59<00:33, 21.59it/s][A

tensor([[[0.0360],
         [0.0163],
         [0.0337],
         [0.0301],
         [0.0460],
         [0.0269],
         [0.0910],
         [0.0063],
         [0.0115],
         [0.0180],
         [0.1404],
         [0.0771]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0425],
         [0.0851],
         [0.0366],
         [0.0081],
         [0.0149],
         [0.0042],
         [0.0106],
         [0.0205],
         [0.0018],
         [0.0203],
         [0.0120],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0288],
         [0.0685],
         [0.0506],
         [0.0187],
         [0.0384],
         [0.0246],
         [0.0250],
         [0.0195],
         [0.0230],
         [0.0259],
         [0.0155],
         [0.0247]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0163],
         [0.0733],
         [0.0472],
         [0.0060],
         [0.0480],
         [0.0172],
         [0.0128],
         [0.0213],
         [0.0094],
    


Evaluating:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                 | 1271/2000 [01:00<00:33, 21.56it/s][A
Evaluating:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 1274/2000 [01:00<00:33, 21.63it/s][A

tensor([[[0.0270],
         [0.0237],
         [0.0265],
         [0.0105],
         [0.0162],
         [0.0620],
         [0.0329],
         [0.0122],
         [0.0050],
         [0.0083],
         [0.0238],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0203],
         [0.0273],
         [0.0930],
         [0.0159],
         [0.0283],
         [0.0278],
         [0.0117],
         [0.0384],
         [0.0052],
         [0.0071],
         [0.0330],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0176],
         [0.0420],
         [0.0828],
         [0.0290],
         [0.0333],
         [0.0102],
         [0.0325],
         [0.0127],
         [0.0223],
         [0.0072],
         [0.0151],
         [0.1024]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0803],
         [0.0346],
         [0.0133],
         [0.0169],
         [0.0041],
         [0.0119],
         [0.0513],
         [0.0124],
         [0.0051],
    


Evaluating:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                                | 1277/2000 [01:00<00:33, 21.68it/s][A


reg attention sum per layer
tensor([[[0.0119],
         [0.0206],
         [0.0716],
         [0.0370],
         [0.0613],
         [0.0065],
         [0.0179],
         [0.0107],
         [0.0259],
         [0.0121],
         [0.0507],
         [0.1369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0250],
         [0.0331],
         [0.0247],
         [0.0055],
         [0.0220],
         [0.0143],
         [0.0179],
         [0.0075],
         [0.0018],
         [0.0034],
         [0.0430],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0647],
         [0.1671],
         [0.0466],
         [0.0229],
         [0.0222],
         [0.0117],
         [0.0343],
         [0.0163],
         [0.0328],
         [0.0287],
         [0.0091],
         [0.0764]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0351],
         [0.0117],
         [0.0571],
         [0.0151],
         [0.0393],
         [0.0338],
         [0.0184],
         [0.0


Evaluating:  64%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                | 1280/2000 [01:00<00:33, 21.57it/s][A
Evaluating:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                               | 1283/2000 [01:00<00:33, 21.55it/s][A

reg attention sum per layer
tensor([[[0.0161],
         [0.0186],
         [0.0178],
         [0.0153],
         [0.0063],
         [0.0026],
         [0.0147],
         [0.0011],
         [0.0007],
         [0.0014],
         [0.0149],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0431],
         [0.0698],
         [0.0398],
         [0.0149],
         [0.0319],
         [0.0158],
         [0.0173],
         [0.0306],
         [0.0158],
         [0.0092],
         [0.0128],
         [0.0716]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0059],
         [0.0249],
         [0.0181],
         [0.0053],
         [0.0086],
         [0.0062],
         [0.0158],
         [0.0006],
         [0.0025],
         [0.0031],
         [0.0257],
         [0.0062]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0377],
         [0.0267],
         [0.0083],
         [0.0151],
         [0.0093],
         [0.0071],
         [0.05


Evaluating:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                               | 1286/2000 [01:00<00:33, 21.53it/s][A

tensor([[[0.0197],
         [0.0296],
         [0.0160],
         [0.0053],
         [0.0240],
         [0.0052],
         [0.0075],
         [0.0052],
         [0.0104],
         [0.0049],
         [0.0122],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0717],
         [0.0345],
         [0.0106],
         [0.0243],
         [0.0071],
         [0.0161],
         [0.0069],
         [0.0049],
         [0.0112],
         [0.0324],
         [0.0213]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0665],
         [0.0495],
         [0.0340],
         [0.0173],
         [0.0219],
         [0.0252],
         [0.0326],
         [0.0138],
         [0.0189],
         [0.0250],
         [0.0199],
         [0.0219]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0130],
         [0.0163],
         [0.0237],
         [0.0080],
         [0.0174],
         [0.0044],
         [0.0328],
         [0.0098],
         [0.0118],
    


Evaluating:  64%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                               | 1289/2000 [01:00<00:33, 21.42it/s][A

tensor([[[0.0405],
         [0.0418],
         [0.0513],
         [0.0192],
         [0.0340],
         [0.0158],
         [0.0372],
         [0.0209],
         [0.0073],
         [0.0172],
         [0.0111],
         [0.0894]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0199],
         [0.0601],
         [0.0692],
         [0.0134],
         [0.0096],
         [0.0106],
         [0.0090],
         [0.0103],
         [0.0051],
         [0.0026],
         [0.0100],
         [0.0440]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0656],
         [0.0167],
         [0.0305],
         [0.0079],
         [0.0415],
         [0.0060],
         [0.0320],
         [0.0182],
         [0.0148],
         [0.0375],
         [0.0084],
         [0.0246]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0271],
         [0.0301],
         [0.0059],
         [0.0100],
         [0.0017],
         [0.0104],
         [0.0136],
         [0.0038],
    


Evaluating:  65%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                               | 1292/2000 [01:01<00:33, 21.35it/s][A
Evaluating:  65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                              | 1295/2000 [01:01<00:32, 21.39it/s][A

tensor([[[0.0094],
         [0.0040],
         [0.0252],
         [0.0081],
         [0.0084],
         [0.0036],
         [0.0093],
         [0.0040],
         [0.0074],
         [0.0015],
         [0.0031],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2649],
         [0.1179],
         [0.0525],
         [0.0443],
         [0.0161],
         [0.0184],
         [0.0385],
         [0.0217],
         [0.0072],
         [0.0130],
         [0.0388],
         [0.0700]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0563],
         [0.0876],
         [0.0960],
         [0.0222],
         [0.0565],
         [0.0723],
         [0.0131],
         [0.0289],
         [0.0151],
         [0.0123],
         [0.0548],
         [0.0389]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0553],
         [0.0733],
         [0.0611],
         [0.0192],
         [0.0318],
         [0.0117],
         [0.0359],
         [0.0271],
         [0.0053],
    


Evaluating:  65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                              | 1298/2000 [01:01<00:32, 21.51it/s][A

tensor([[[0.0166],
         [0.0251],
         [0.0670],
         [0.0126],
         [0.0044],
         [0.0050],
         [0.0073],
         [0.0043],
         [0.0030],
         [0.0036],
         [0.0146],
         [0.0346]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0113],
         [0.0376],
         [0.0218],
         [0.0137],
         [0.0041],
         [0.0059],
         [0.0071],
         [0.0013],
         [0.0023],
         [0.0016],
         [0.0237],
         [0.0164]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0130],
         [0.0413],
         [0.0470],
         [0.0152],
         [0.0412],
         [0.0212],
         [0.0191],
         [0.0045],
         [0.0029],
         [0.0039],
         [0.2169],
         [0.0435]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0345],
         [0.0429],
         [0.0349],
         [0.0176],
         [0.0266],
         [0.0052],
         [0.0219],
         [0.0174],
         [0.0070],
    


Evaluating:  65%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 1301/2000 [01:01<00:32, 21.64it/s][A

tensor([[[0.0075],
         [0.0246],
         [0.0332],
         [0.0210],
         [0.0331],
         [0.0039],
         [0.0124],
         [0.0060],
         [0.0305],
         [0.0094],
         [0.0127],
         [0.1297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0380],
         [0.0405],
         [0.0602],
         [0.0149],
         [0.0245],
         [0.0096],
         [0.0097],
         [0.0183],
         [0.0122],
         [0.0092],
         [0.0115],
         [0.0273]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0175],
         [0.0095],
         [0.0068],
         [0.0024],
         [0.0010],
         [0.0038],
         [0.0042],
         [0.0075],
         [0.0006],
         [0.0010],
         [0.0162],
         [0.0087]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0599],
         [0.0374],
         [0.0218],
         [0.0150],
         [0.0375],
         [0.0244],
         [0.0578],
         [0.0055],
         [0.0422],
    


Evaluating:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                              | 1304/2000 [01:01<00:32, 21.74it/s][A
Evaluating:  65%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                             | 1307/2000 [01:01<00:31, 21.80it/s][A


reg attention sum per layer
tensor([[[0.0030],
         [0.0130],
         [0.0623],
         [0.0019],
         [0.0065],
         [0.0043],
         [0.0075],
         [0.0013],
         [0.0009],
         [0.0011],
         [0.0146],
         [0.0352]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0380],
         [0.0449],
         [0.1465],
         [0.0265],
         [0.0763],
         [0.0026],
         [0.0387],
         [0.0642],
         [0.0532],
         [0.0248],
         [0.0315],
         [0.1126]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0317],
         [0.0259],
         [0.0223],
         [0.0056],
         [0.0181],
         [0.0295],
         [0.0235],
         [0.0106],
         [0.0129],
         [0.0228],
         [0.0170],
         [0.0093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.0209],
         [0.0416],
         [0.0100],
         [0.0119],
         [0.0124],
         [0.0071],
         [0.0


Evaluating:  66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                             | 1310/2000 [01:01<00:31, 21.80it/s][A

reg attention sum per layer
tensor([[[0.0369],
         [0.0084],
         [0.0277],
         [0.0400],
         [0.0258],
         [0.0145],
         [0.0092],
         [0.0156],
         [0.0022],
         [0.0130],
         [0.0079],
         [0.0087]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0077],
         [0.0125],
         [0.0152],
         [0.0094],
         [0.0065],
         [0.0083],
         [0.0062],
         [0.0032],
         [0.0020],
         [0.0025],
         [0.0563],
         [0.0147]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.1119],
         [0.0902],
         [0.0461],
         [0.0395],
         [0.0528],
         [0.0432],
         [0.0175],
         [0.0030],
         [0.0042],
         [0.5595],
         [0.0871]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0464],
         [0.0980],
         [0.0526],
         [0.0439],
         [0.0331],
         [0.0131],
         [0.0220],
         [0.02


Evaluating:  66%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                             | 1313/2000 [01:02<00:31, 21.66it/s][A
Evaluating:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                             | 1316/2000 [01:02<00:31, 21.61it/s][A

reg attention sum per layer
tensor([[[0.0469],
         [0.0130],
         [0.0212],
         [0.0158],
         [0.0108],
         [0.0278],
         [0.0329],
         [0.0129],
         [0.0089],
         [0.0148],
         [0.0070],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0535],
         [0.0543],
         [0.0401],
         [0.0176],
         [0.0208],
         [0.0033],
         [0.0219],
         [0.0018],
         [0.0121],
         [0.0093],
         [0.0332],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0252],
         [0.0499],
         [0.0401],
         [0.0173],
         [0.0316],
         [0.0123],
         [0.0272],
         [0.0237],
         [0.0125],
         [0.0102],
         [0.0129],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1149],
         [0.0829],
         [0.0148],
         [0.0122],
         [0.0136],
         [0.0065],
         [0.0311],
         [0.02


Evaluating:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                            | 1319/2000 [01:02<00:31, 21.52it/s][A

tensor([[[0.0265],
         [0.0388],
         [0.0406],
         [0.0219],
         [0.0099],
         [0.0140],
         [0.0150],
         [0.0031],
         [0.0033],
         [0.0038],
         [0.0575],
         [0.0275]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0478],
         [0.0729],
         [0.0423],
         [0.0150],
         [0.0129],
         [0.0154],
         [0.0829],
         [0.0155],
         [0.0070],
         [0.0161],
         [0.0073],
         [0.0266]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1016],
         [0.0357],
         [0.0346],
         [0.0177],
         [0.0860],
         [0.0138],
         [0.0169],
         [0.0165],
         [0.0367],
         [0.0195],
         [0.0115],
         [0.0995]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2371],
         [0.0173],
         [0.0336],
         [0.0369],
         [0.0125],
         [0.0224],
         [0.0279],
         [0.0725],
         [0.0059],
    


Evaluating:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 1322/2000 [01:02<00:31, 21.60it/s][A


tensor([[[0.0843],
         [0.1447],
         [0.1301],
         [0.0484],
         [0.0248],
         [0.0561],
         [0.0272],
         [0.0126],
         [0.0043],
         [0.0089],
         [0.2978],
         [0.0914]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0352],
         [0.0377],
         [0.0292],
         [0.0171],
         [0.0097],
         [0.0367],
         [0.0123],
         [0.0123],
         [0.0024],
         [0.0115],
         [0.0420],
         [0.0238]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0377],
         [0.0495],
         [0.0314],
         [0.0121],
         [0.0071],
         [0.0068],
         [0.0180],
         [0.0036],
         [0.0015],
         [0.0025],
         [0.0084],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0080],
         [0.0180],
         [0.0974],
         [0.0149],
         [0.0231],
         [0.0080],
         [0.0157],
         [0.0080],
         [0.0056],
   


Evaluating:  66%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                            | 1325/2000 [01:02<00:31, 21.66it/s][A
Evaluating:  66%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                           | 1328/2000 [01:02<00:31, 21.63it/s][A

tensor([[[0.0254],
         [0.0141],
         [0.0287],
         [0.0115],
         [0.0669],
         [0.0055],
         [0.0172],
         [0.0028],
         [0.0023],
         [0.0152],
         [0.0426],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0257],
         [0.0367],
         [0.0599],
         [0.0099],
         [0.0198],
         [0.0174],
         [0.0065],
         [0.0054],
         [0.0021],
         [0.0072],
         [0.0441],
         [0.0447]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0344],
         [0.0388],
         [0.0233],
         [0.0191],
         [0.0652],
         [0.0280],
         [0.0253],
         [0.0053],
         [0.0050],
         [0.0233],
         [0.0334],
         [0.0138]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0179],
         [0.0189],
         [0.0551],
         [0.0097],
         [0.0217],
         [0.0136],
         [0.0084],
         [0.0150],
         [0.0036],
    


Evaluating:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                           | 1331/2000 [01:02<00:30, 21.63it/s][A

tensor([[[0.0292],
         [0.0472],
         [0.0344],
         [0.0113],
         [0.0355],
         [0.0261],
         [0.0235],
         [0.0126],
         [0.0107],
         [0.0103],
         [0.0199],
         [0.0114]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0109],
         [0.0056],
         [0.0101],
         [0.0052],
         [0.0028],
         [0.0010],
         [0.0058],
         [0.0041],
         [0.0028],
         [0.0087],
         [0.0017],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0361],
         [0.0230],
         [0.0316],
         [0.0151],
         [0.0269],
         [0.0467],
         [0.0242],
         [0.0038],
         [0.0081],
         [0.0093],
         [0.0252],
         [0.0206]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0381],
         [0.0294],
         [0.0702],
         [0.0125],
         [0.0188],
         [0.0076],
         [0.0443],
         [0.0293],
         [0.0285],
    


Evaluating:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                           | 1334/2000 [01:03<00:30, 21.64it/s][A
Evaluating:  67%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 1337/2000 [01:03<00:30, 21.78it/s][A

reg attention sum per layer
tensor([[[0.0264],
         [0.0122],
         [0.0177],
         [0.0151],
         [0.0465],
         [0.0241],
         [0.0244],
         [0.0122],
         [0.0082],
         [0.0129],
         [0.0185],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0578],
         [0.0316],
         [0.0418],
         [0.0304],
         [0.0291],
         [0.0392],
         [0.0609],
         [0.0028],
         [0.0066],
         [0.0147],
         [0.1559],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0178],
         [0.0205],
         [0.0041],
         [0.0095],
         [0.0074],
         [0.0207],
         [0.0055],
         [0.0088],
         [0.0041],
         [0.0059],
         [0.0095]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0223],
         [0.0897],
         [0.0589],
         [0.0125],
         [0.0148],
         [0.0447],
         [0.0579],
         [0.03


Evaluating:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                          | 1340/2000 [01:03<00:30, 21.74it/s][A


tensor([[[0.0235],
         [0.0211],
         [0.0369],
         [0.0198],
         [0.0611],
         [0.0211],
         [0.0113],
         [0.0027],
         [0.0030],
         [0.0104],
         [0.0596],
         [0.0160]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0619],
         [0.0163],
         [0.0313],
         [0.0165],
         [0.0367],
         [0.0134],
         [0.0245],
         [0.0104],
         [0.0186],
         [0.0186],
         [0.0066],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.0147],
         [0.0453],
         [0.0110],
         [0.0286],
         [0.0073],
         [0.0044],
         [0.0098],
         [0.0015],
         [0.0075],
         [0.0411],
         [0.0257]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0141],
         [0.0274],
         [0.0745],
         [0.0245],
         [0.0168],
         [0.0182],
         [0.0127],
         [0.0020],
         [0.0026],
   


Evaluating:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 1343/2000 [01:03<00:30, 21.57it/s][A


tensor([[[0.0394],
         [0.0130],
         [0.0435],
         [0.0102],
         [0.0417],
         [0.0038],
         [0.0256],
         [0.0045],
         [0.0129],
         [0.0354],
         [0.0037],
         [0.0216]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0264],
         [0.0033],
         [0.0046],
         [0.0042],
         [0.0030],
         [0.0016],
         [0.0100],
         [0.0035],
         [0.0039],
         [0.0065],
         [0.0014],
         [0.0034]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0096],
         [0.0256],
         [0.0147],
         [0.0106],
         [0.0090],
         [0.0138],
         [0.0407],
         [0.0055],
         [0.0054],
         [0.0058],
         [0.0452],
         [0.0281]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0494],
         [0.0367],
         [0.0210],
         [0.0191],
         [0.0207],
         [0.0273],
         [0.0435],
         [0.0289],
         [0.0555],
   


Evaluating:  67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                          | 1346/2000 [01:03<00:31, 21.05it/s][A
Evaluating:  67%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                          | 1349/2000 [01:03<00:30, 21.22it/s][A


tensor([[[0.0317],
         [0.0113],
         [0.0891],
         [0.0515],
         [0.0401],
         [0.0117],
         [0.0261],
         [0.0195],
         [0.0056],
         [0.0063],
         [0.0264],
         [0.1076]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0055],
         [0.0030],
         [0.0064],
         [0.0017],
         [0.0178],
         [0.0120],
         [0.0067],
         [0.0009],
         [0.0020],
         [0.0035],
         [0.0043],
         [0.0022]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0213],
         [0.0299],
         [0.0848],
         [0.0046],
         [0.0205],
         [0.0026],
         [0.0109],
         [0.0060],
         [0.0039],
         [0.0033],
         [0.0191],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0823],
         [0.0566],
         [0.1071],
         [0.0338],
         [0.0260],
         [0.0387],
         [0.0636],
         [0.0101],
         [0.0280],
   


Evaluating:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                         | 1352/2000 [01:03<00:30, 21.30it/s][A

tensor([[[0.0522],
         [0.0099],
         [0.0174],
         [0.0063],
         [0.0078],
         [0.0065],
         [0.0113],
         [0.0005],
         [0.0014],
         [0.0019],
         [0.0064],
         [0.0092]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0434],
         [0.0206],
         [0.0377],
         [0.0124],
         [0.0150],
         [0.0050],
         [0.0575],
         [0.0118],
         [0.0224],
         [0.0213],
         [0.0073],
         [0.0553]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0265],
         [0.0592],
         [0.0345],
         [0.0172],
         [0.0379],
         [0.0235],
         [0.0192],
         [0.0187],
         [0.0045],
         [0.0109],
         [0.0201],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1643],
         [0.0326],
         [0.0367],
         [0.0216],
         [0.0349],
         [0.0344],
         [0.0458],
         [0.0246],
         [0.0036],
    


Evaluating:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                         | 1355/2000 [01:03<00:30, 21.35it/s][A
Evaluating:  68%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                         | 1358/2000 [01:04<00:29, 21.53it/s][A

reg attention sum per layer
tensor([[[0.0158],
         [0.0112],
         [0.0278],
         [0.0089],
         [0.0166],
         [0.0064],
         [0.0217],
         [0.0078],
         [0.0016],
         [0.0078],
         [0.0309],
         [0.0073]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0317],
         [0.0360],
         [0.0420],
         [0.0171],
         [0.0339],
         [0.0120],
         [0.0121],
         [0.0070],
         [0.0121],
         [0.0094],
         [0.0226],
         [0.0229]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1001],
         [0.0547],
         [0.0232],
         [0.0264],
         [0.0293],
         [0.0136],
         [0.0362],
         [0.0531],
         [0.0286],
         [0.0551],
         [0.0065],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0947],
         [0.0711],
         [0.1208],
         [0.0300],
         [0.0579],
         [0.0126],
         [0.0489],
         [0.05


Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                        | 1361/2000 [01:04<00:29, 21.61it/s][A

tensor([[[0.0037],
         [0.0261],
         [0.0556],
         [0.0437],
         [0.0154],
         [0.0110],
         [0.0063],
         [0.0018],
         [0.0020],
         [0.0013],
         [0.1354],
         [0.0583]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0533],
         [0.0525],
         [0.0302],
         [0.0190],
         [0.0255],
         [0.0106],
         [0.0182],
         [0.0379],
         [0.0065],
         [0.0114],
         [0.0317],
         [0.0164]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0550],
         [0.0149],
         [0.0943],
         [0.0122],
         [0.0050],
         [0.0138],
         [0.0204],
         [0.0045],
         [0.0030],
         [0.0027],
         [0.0142],
         [0.0766]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0085],
         [0.0114],
         [0.0156],
         [0.0106],
         [0.0051],
         [0.0107],
         [0.0032],
         [0.0020],
         [0.0012],
    


Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                        | 1364/2000 [01:04<00:29, 21.67it/s][A

tensor([[[0.0423],
         [0.0792],
         [0.0441],
         [0.0239],
         [0.0170],
         [0.0100],
         [0.0259],
         [0.0368],
         [0.0072],
         [0.0049],
         [0.0153],
         [0.0317]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0384],
         [0.0162],
         [0.0835],
         [0.0072],
         [0.0331],
         [0.0117],
         [0.0239],
         [0.0202],
         [0.0477],
         [0.0057],
         [0.0055],
         [0.0622]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0179],
         [0.0099],
         [0.0544],
         [0.0066],
         [0.0164],
         [0.0083],
         [0.0189],
         [0.0174],
         [0.0069],
         [0.0049],
         [0.0194],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0505],
         [0.0311],
         [0.0320],
         [0.0146],
         [0.0105],
         [0.0058],
         [0.0237],
         [0.0135],
         [0.0065],
    


Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                        | 1367/2000 [01:04<00:29, 21.80it/s][A
Evaluating:  68%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                        | 1370/2000 [01:04<00:28, 21.80it/s][A

reg attention sum per layer
tensor([[[0.0056],
         [0.0162],
         [0.0102],
         [0.0163],
         [0.0014],
         [0.0078],
         [0.0095],
         [0.0061],
         [0.0008],
         [0.0010],
         [0.0068],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0401],
         [0.0857],
         [0.0080],
         [0.0115],
         [0.0133],
         [0.0043],
         [0.0052],
         [0.0047],
         [0.0027],
         [0.0310],
         [0.0488]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0117],
         [0.0561],
         [0.0041],
         [0.0170],
         [0.0142],
         [0.0082],
         [0.0285],
         [0.0084],
         [0.0053],
         [0.0124],
         [0.0305]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0157],
         [0.0220],
         [0.0371],
         [0.0361],
         [0.0532],
         [0.0322],
         [0.0290],
         [0.00


Evaluating:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                       | 1373/2000 [01:04<00:28, 21.89it/s][A

tensor([[[0.0901],
         [0.0467],
         [0.0343],
         [0.0305],
         [0.0215],
         [0.0072],
         [0.0339],
         [0.0437],
         [0.0072],
         [0.0138],
         [0.0044],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0693],
         [0.0891],
         [0.0495],
         [0.0168],
         [0.0287],
         [0.0085],
         [0.0347],
         [0.0069],
         [0.0244],
         [0.0153],
         [0.0084],
         [0.0350]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0176],
         [0.0046],
         [0.0110],
         [0.0012],
         [0.0020],
         [0.0016],
         [0.0068],
         [0.0031],
         [0.0019],
         [0.0036],
         [0.0028],
         [0.0073]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1062],
         [0.2205],
         [0.0571],
         [0.0262],
         [0.0512],
         [0.0322],
         [0.0360],
         [0.0442],
         [0.0162],
    


Evaluating:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                       | 1376/2000 [01:04<00:28, 21.77it/s][A
Evaluating:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                       | 1379/2000 [01:05<00:28, 21.68it/s][A

tensor([[[0.0430],
         [0.0248],
         [0.1330],
         [0.0421],
         [0.0289],
         [0.0020],
         [0.0295],
         [0.0155],
         [0.0018],
         [0.0159],
         [0.0789],
         [0.0904]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0368],
         [0.0194],
         [0.0252],
         [0.0116],
         [0.0184],
         [0.0105],
         [0.0384],
         [0.0122],
         [0.0131],
         [0.0124],
         [0.0118],
         [0.0174]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0383],
         [0.0298],
         [0.0085],
         [0.0058],
         [0.0074],
         [0.0065],
         [0.0143],
         [0.0032],
         [0.0078],
         [0.0297],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0264],
         [0.0372],
         [0.0252],
         [0.0108],
         [0.0252],
         [0.0098],
         [0.0248],
         [0.0086],
         [0.0054],
    


Evaluating:  69%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                       | 1382/2000 [01:05<00:28, 21.39it/s][A


tensor([[[0.0408],
         [0.1366],
         [0.4045],
         [0.0451],
         [0.0118],
         [0.0158],
         [0.0277],
         [0.0099],
         [0.0084],
         [0.0128],
         [0.0656],
         [0.2957]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0512],
         [0.0285],
         [0.1006],
         [0.0234],
         [0.0270],
         [0.0047],
         [0.0537],
         [0.0045],
         [0.0631],
         [0.0109],
         [0.0159],
         [0.0276]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0358],
         [0.0215],
         [0.0135],
         [0.0041],
         [0.0062],
         [0.0087],
         [0.0211],
         [0.0053],
         [0.0060],
         [0.0054],
         [0.0054],
         [0.0090]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0224],
         [0.0174],
         [0.0572],
         [0.0183],
         [0.0884],
         [0.0332],
         [0.0144],
         [0.0126],
         [0.0116],
   


Evaluating:  69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                      | 1385/2000 [01:05<00:28, 21.51it/s][A

tensor([[[0.0219],
         [0.0120],
         [0.0569],
         [0.0379],
         [0.0434],
         [0.0130],
         [0.0049],
         [0.0084],
         [0.0037],
         [0.0056],
         [0.0104],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0309],
         [0.0426],
         [0.0172],
         [0.0182],
         [0.0112],
         [0.0184],
         [0.0320],
         [0.0031],
         [0.0096],
         [0.0646],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0243],
         [0.0422],
         [0.0729],
         [0.0449],
         [0.0150],
         [0.0082],
         [0.0141],
         [0.0344],
         [0.0057],
         [0.0040],
         [0.0807],
         [0.0376]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0179],
         [0.0243],
         [0.0039],
         [0.0063],
         [0.0025],
         [0.0054],
         [0.0139],
         [0.0031],
    


Evaluating:  69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                      | 1388/2000 [01:05<00:28, 21.55it/s][A
Evaluating:  70%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                      | 1391/2000 [01:05<00:28, 21.53it/s][A


reg attention sum per layer
tensor([[[0.1445],
         [0.0719],
         [0.0224],
         [0.0174],
         [0.0118],
         [0.0118],
         [0.0238],
         [0.0049],
         [0.0036],
         [0.0039],
         [0.0135],
         [0.0237]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0242],
         [0.0542],
         [0.0547],
         [0.0125],
         [0.0354],
         [0.0136],
         [0.0169],
         [0.0141],
         [0.0048],
         [0.0079],
         [0.0151],
         [0.0284]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0025],
         [0.0138],
         [0.0243],
         [0.0049],
         [0.0029],
         [0.0013],
         [0.0019],
         [0.0009],
         [0.0003],
         [0.0004],
         [0.0163],
         [0.0152]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0541],
         [0.0926],
         [0.0336],
         [0.0078],
         [0.0161],
         [0.0136],
         [0.0543],
         [0.0


Evaluating:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                      | 1394/2000 [01:05<00:27, 21.66it/s][A


reg attention sum per layer
tensor([[[0.0363],
         [0.0221],
         [0.0384],
         [0.0089],
         [0.0246],
         [0.0158],
         [0.0153],
         [0.0134],
         [0.0122],
         [0.0082],
         [0.0193],
         [0.0378]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0145],
         [0.0120],
         [0.0276],
         [0.0060],
         [0.0170],
         [0.0203],
         [0.0147],
         [0.0270],
         [0.0044],
         [0.0064],
         [0.0269],
         [0.0235]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0093],
         [0.0225],
         [0.0721],
         [0.0025],
         [0.0117],
         [0.0076],
         [0.0117],
         [0.0022],
         [0.0074],
         [0.0073],
         [0.0726],
         [0.0273]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0277],
         [0.0232],
         [0.0996],
         [0.0244],
         [0.0298],
         [0.0048],
         [0.0290],
         [0.0


Evaluating:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                     | 1397/2000 [01:05<00:27, 21.75it/s][A
Evaluating:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                     | 1400/2000 [01:06<00:27, 21.76it/s][A

reg attention sum per layer
tensor([[[0.0902],
         [0.0572],
         [0.0813],
         [0.0544],
         [0.0475],
         [0.0314],
         [0.0192],
         [0.0543],
         [0.0069],
         [0.0186],
         [0.0447],
         [0.0782]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0251],
         [0.0690],
         [0.0586],
         [0.0144],
         [0.0269],
         [0.0145],
         [0.0229],
         [0.0047],
         [0.0070],
         [0.0236],
         [0.0532],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0565],
         [0.1649],
         [0.0124],
         [0.0137],
         [0.0116],
         [0.0161],
         [0.0038],
         [0.0025],
         [0.0049],
         [0.0558],
         [0.0395]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0116],
         [0.0080],
         [0.0175],
         [0.0059],
         [0.0241],
         [0.0139],
         [0.0075],
         [0.00


Evaluating:  70%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                     | 1403/2000 [01:06<00:27, 21.77it/s][A

tensor([[[0.1262],
         [0.0618],
         [0.0648],
         [0.0311],
         [0.0152],
         [0.0107],
         [0.0608],
         [0.0827],
         [0.0069],
         [0.0256],
         [0.0075],
         [0.0366]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0201],
         [0.0497],
         [0.1168],
         [0.0185],
         [0.0249],
         [0.0101],
         [0.0212],
         [0.0271],
         [0.0083],
         [0.0073],
         [0.0795],
         [0.0560]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0361],
         [0.0146],
         [0.0370],
         [0.0128],
         [0.0204],
         [0.0195],
         [0.0409],
         [0.0107],
         [0.0235],
         [0.0125],
         [0.0046],
         [0.0418]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0338],
         [0.0325],
         [0.0143],
         [0.0138],
         [0.0104],
         [0.0218],
         [0.0087],
         [0.0023],
    


Evaluating:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                    | 1406/2000 [01:06<00:27, 21.73it/s][A
Evaluating:  70%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 1409/2000 [01:06<00:27, 21.75it/s]

tensor([[[0.0205],
         [0.0731],
         [0.0564],
         [0.0215],
         [0.0222],
         [0.0078],
         [0.0211],
         [0.0096],
         [0.0088],
         [0.0055],
         [0.0394],
         [0.0571]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1216],
         [0.0595],
         [0.0885],
         [0.0718],
         [0.0730],
         [0.0420],
         [0.0281],
         [0.0359],
         [0.0413],
         [0.0394],
         [0.1235],
         [0.1067]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0588],
         [0.0503],
         [0.1335],
         [0.0288],
         [0.0604],
         [0.0181],
         [0.0196],
         [0.0348],
         [0.0091],
         [0.0089],
         [0.0356],
         [0.0439]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0304],
         [0.0168],
         [0.0305],
         [0.0331],
         [0.0079],
         [0.0207],
         [0.0468],
         [0.0120],
         [0.0062],
    

[A
Evaluating:  71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                    | 1412/2000 [01:06<00:26, 21.81it/s][A

reg attention sum per layer
tensor([[[0.0019],
         [0.0061],
         [0.0048],
         [0.0009],
         [0.0013],
         [0.0028],
         [0.0029],
         [0.0006],
         [0.0006],
         [0.0012],
         [0.0247],
         [0.0084]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0429],
         [0.0178],
         [0.0282],
         [0.0121],
         [0.0152],
         [0.0073],
         [0.0033],
         [0.0052],
         [0.0027],
         [0.0104],
         [0.0142],
         [0.0046]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0194],
         [0.0327],
         [0.0237],
         [0.0190],
         [0.0135],
         [0.0118],
         [0.0174],
         [0.0284],
         [0.0110],
         [0.0054],
         [0.0068],
         [0.0344]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1740],
         [0.0648],
         [0.0791],
         [0.0352],
         [0.0490],
         [0.0158],
         [0.0363],
         [0.01


Evaluating:  71%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                    | 1415/2000 [01:06<00:27, 21.67it/s][A

tensor([[[0.0118],
         [0.0116],
         [0.0266],
         [0.0071],
         [0.0124],
         [0.0038],
         [0.0070],
         [0.0037],
         [0.0046],
         [0.0019],
         [0.0104],
         [0.0149]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1542],
         [0.0614],
         [0.0659],
         [0.0321],
         [0.0504],
         [0.0202],
         [0.0362],
         [0.0219],
         [0.0319],
         [0.0206],
         [0.0156],
         [0.0330]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0385],
         [0.0515],
         [0.0523],
         [0.0133],
         [0.0160],
         [0.0108],
         [0.0329],
         [0.0141],
         [0.0119],
         [0.0162],
         [0.0100],
         [0.01


Evaluating:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                   | 1418/2000 [01:06<00:27, 20.98it/s][A
Evaluating:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                   | 1421/2000 [01:07<00:28, 20.31it/s][A

reg attention sum per layer
tensor([[[0.0224],
         [0.0382],
         [0.0778],
         [0.0522],
         [0.0061],
         [0.0089],
         [0.0082],
         [0.0044],
         [0.0024],
         [0.0023],
         [0.0515],
         [0.0512]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0309],
         [0.0511],
         [0.0128],
         [0.0259],
         [0.0099],
         [0.0111],
         [0.0193],
         [0.0134],
         [0.0045],
         [0.0149],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0405],
         [0.0575],
         [0.0672],
         [0.0142],
         [0.0124],
         [0.0116],
         [0.0065],
         [0.0047],
         [0.0020],
         [0.0073],
         [0.0384],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0088],
         [0.0210],
         [0.0487],
         [0.0097],
         [0.0157],
         [0.0036],
         [0.0208],
         [0.00


Evaluating:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                   | 1424/2000 [01:07<00:29, 19.80it/s][A

reg attention sum per layer
tensor([[[0.0100],
         [0.0098],
         [0.0203],
         [0.0111],
         [0.0241],
         [0.0189],
         [0.0165],
         [0.0021],
         [0.0192],
         [0.0165],
         [0.0169],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0375],
         [0.0254],
         [0.0524],
         [0.0332],
         [0.0221],
         [0.0249],
         [0.0113],
         [0.0056],
         [0.0045],
         [0.0079],
         [0.0397],
         [0.0426]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0267],
         [0.0241],
         [0.0785],
         [0.0148],
         [0.0655],
         [0.0142],
         [0.0065],
         [0.0178],
         [0.0102],
         [0.0132],
         [0.0528],
         [0.0359]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0640],
         [0.0389],
         [0.0865],
         [0.0179],
         [0.0361],
         [0.0188],
         [0.0096],
         [0.00


Evaluating:  71%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                   | 1426/2000 [01:07<00:29, 19.35it/s][A
Evaluating:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                   | 1428/2000 [01:07<00:30, 18.74it/s][A

reg attention sum per layer
tensor([[[0.0405],
         [0.0591],
         [0.0548],
         [0.0097],
         [0.0278],
         [0.0208],
         [0.0382],
         [0.0358],
         [0.0087],
         [0.0119],
         [0.0142],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0185],
         [0.0303],
         [0.0567],
         [0.0145],
         [0.0087],
         [0.0118],
         [0.0077],
         [0.0022],
         [0.0014],
         [0.0027],
         [0.0500],
         [0.0245]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0539],
         [0.0294],
         [0.0489],
         [0.0349],
         [0.0301],
         [0.0309],
         [0.0288],
         [0.0462],
         [0.0133],
         [0.0168],
         [0.0651],
         [0.0523]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0366],
         [0.0340],
         [0.0648],
         [0.0212],
         [0.0306],
         [0.0143],
         [0.0240],
         [0.00


Evaluating:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                  | 1430/2000 [01:07<00:30, 18.43it/s][A
Evaluating:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                  | 1432/2000 [01:07<00:31, 18.22it/s][A

reg attention sum per layer
tensor([[[0.0727],
         [0.1236],
         [0.0535],
         [0.0197],
         [0.0620],
         [0.0538],
         [0.0346],
         [0.0349],
         [0.0134],
         [0.0259],
         [0.1381],
         [0.0537]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0223],
         [0.0276],
         [0.0293],
         [0.0042],
         [0.0112],
         [0.0098],
         [0.0394],
         [0.0143],
         [0.0176],
         [0.0086],
         [0.0175],
         [0.0284]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0307],
         [0.0519],
         [0.1083],
         [0.0322],
         [0.0191],
         [0.0147],
         [0.0252],
         [0.0182],
         [0.0050],
         [0.0078],
         [0.0354],
         [0.0629]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1469],
         [0.0550],
         [0.1440],
         [0.0824],
         [0.0179],
         [0.0186],
         [0.0452],
         [0.05


Evaluating:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                  | 1434/2000 [01:07<00:30, 18.27it/s][A
Evaluating:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                  | 1436/2000 [01:07<00:31, 18.11it/s][A

reg attention sum per layer
tensor([[[0.0504],
         [0.0201],
         [0.0277],
         [0.0140],
         [0.0106],
         [0.0079],
         [0.0110],
         [0.0217],
         [0.0039],
         [0.0043],
         [0.0119],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0318],
         [0.1312],
         [0.0247],
         [0.0156],
         [0.0282],
         [0.0075],
         [0.0030],
         [0.0014],
         [0.0037],
         [0.0535],
         [0.1261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0862],
         [0.0772],
         [0.0268],
         [0.0184],
         [0.0328],
         [0.0197],
         [0.0109],
         [0.0195],
         [0.0032],
         [0.0227],
         [0.0377],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0207],
         [0.0176],
         [0.0195],
         [0.0107],
         [0.0136],
         [0.0052],
         [0.0244],
         [0.00


Evaluating:  72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                  | 1438/2000 [01:07<00:30, 18.35it/s][A
Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                 | 1440/2000 [01:08<00:29, 18.67it/s][A

reg attention sum per layer
tensor([[[0.0267],
         [0.0095],
         [0.0220],
         [0.0041],
         [0.0177],
         [0.0052],
         [0.0056],
         [0.0062],
         [0.0089],
         [0.0105],
         [0.0053],
         [0.0182]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0130],
         [0.0244],
         [0.1517],
         [0.0137],
         [0.0550],
         [0.0076],
         [0.0071],
         [0.0042],
         [0.0053],
         [0.0043],
         [0.0332],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0093],
         [0.0308],
         [0.0539],
         [0.0302],
         [0.2201],
         [0.0049],
         [0.0374],
         [0.0014],
         [0.0106],
         [0.0594],
         [0.1228],
         [0.0197]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0097],
         [0.0339],
         [0.0170],
         [0.0043],
         [0.0062],
         [0.0070],
         [0.0083],
         [0.00


Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                 | 1443/2000 [01:08<00:28, 19.47it/s][A
Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                                 | 1446/2000 [01:08<00:27, 20.16it/s][A

tensor([[[0.0375],
         [0.0286],
         [0.0289],
         [0.0213],
         [0.0305],
         [0.0237],
         [0.0137],
         [0.0185],
         [0.0415],
         [0.0101],
         [0.0179],
         [0.0543]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0067],
         [0.0233],
         [0.0092],
         [0.0109],
         [0.0124],
         [0.0011],
         [0.0037],
         [0.0002],
         [0.0009],
         [0.0230],
         [0.0142]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0262],
         [0.0313],
         [0.0201],
         [0.0093],
         [0.0132],
         [0.0134],
         [0.0132],
         [0.0030],
         [0.0035],
         [0.0122],
         [0.0122],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0085],
         [0.0243],
         [0.0132],
         [0.0078],
         [0.0088],
         [0.0289],
         [0.0063],
         [0.0048],
    


Evaluating:  72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                 | 1449/2000 [01:08<00:26, 20.71it/s][A

reg attention sum per layer
tensor([[[0.0104],
         [0.0131],
         [0.0346],
         [0.0129],
         [0.0123],
         [0.0140],
         [0.0294],
         [0.0050],
         [0.0059],
         [0.0103],
         [0.0068],
         [0.0388]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0639],
         [0.1135],
         [0.0953],
         [0.0209],
         [0.1122],
         [0.0669],
         [0.0330],
         [0.0479],
         [0.0096],
         [0.0173],
         [0.0408],
         [0.0436]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0394],
         [0.1677],
         [0.0507],
         [0.0109],
         [0.0287],
         [0.0130],
         [0.0234],
         [0.0566],
         [0.0098],
         [0.0110],
         [0.0140],
         [0.0307]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0318],
         [0.0404],
         [0.0362],
         [0.0147],
         [0.0485],
         [0.0067],
         [0.0112],
         [0.02


Evaluating:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                                | 1452/2000 [01:08<00:26, 21.02it/s][A

tensor([[[0.0357],
         [0.0366],
         [0.0516],
         [0.0165],
         [0.0202],
         [0.0080],
         [0.0085],
         [0.0324],
         [0.0042],
         [0.0052],
         [0.0605],
         [0.0398]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0263],
         [0.0511],
         [0.0523],
         [0.0163],
         [0.0266],
         [0.0069],
         [0.0223],
         [0.0148],
         [0.0057],
         [0.0053],
         [0.0362],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0383],
         [0.0319],
         [0.0448],
         [0.0090],
         [0.0203],
         [0.0053],
         [0.0205],
         [0.0088],
         [0.0055],
         [0.0072],
         [0.0058],
         [0.0243]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0084],
         [0.0210],
         [0.0833],
         [0.0130],
         [0.0482],
         [0.0156],
         [0.0213],
         [0.0266],
         [0.0296],
    


Evaluating:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 1455/2000 [01:08<00:25, 21.34it/s][A
Evaluating:  73%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                | 1458/2000 [01:08<00:25, 21.52it/s][A

reg attention sum per layer
tensor([[[0.0104],
         [0.0299],
         [0.0305],
         [0.0182],
         [0.0587],
         [0.0140],
         [0.0313],
         [0.0085],
         [0.0099],
         [0.0098],
         [0.0145],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0806],
         [0.0223],
         [0.0101],
         [0.0094],
         [0.0156],
         [0.0101],
         [0.0183],
         [0.0085],
         [0.0065],
         [0.0122],
         [0.0398],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0438],
         [0.0401],
         [0.0556],
         [0.0199],
         [0.0236],
         [0.0066],
         [0.0152],
         [0.0060],
         [0.0056],
         [0.0127],
         [0.0204],
         [0.0159]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0315],
         [0.0105],
         [0.0486],
         [0.0184],
         [0.0208],
         [0.0122],
         [0.0079],
         [0.01


Evaluating:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                | 1461/2000 [01:09<00:24, 21.65it/s][A

tensor([[[0.0260],
         [0.0133],
         [0.0352],
         [0.0048],
         [0.0513],
         [0.0033],
         [0.0062],
         [0.0118],
         [0.0167],
         [0.0055],
         [0.0108],
         [0.0133]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0440],
         [0.0248],
         [0.0264],
         [0.0098],
         [0.0182],
         [0.0171],
         [0.0234],
         [0.0234],
         [0.0080],
         [0.0310],
         [0.0064],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0454],
         [0.0342],
         [0.0628],
         [0.0202],
         [0.0324],
         [0.0046],
         [0.0195],
         [0.0059],
         [0.0129],
         [0.0069],
         [0.0126],
         [0.0323]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0264],
         [0.1005],
         [0.0537],
         [0.0924],
         [0.0488],
         [0.0204],
         [0.0175],
         [0.0107],
    


Evaluating:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                               | 1464/2000 [01:09<00:24, 21.65it/s][A
Evaluating:  73%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                               | 1467/2000 [01:09<00:24, 21.69it/s][A

tensor([[[0.0575],
         [0.0583],
         [0.0377],
         [0.0215],
         [0.0296],
         [0.0194],
         [0.0209],
         [0.0088],
         [0.0050],
         [0.0295],
         [0.0068],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0678],
         [0.0347],
         [0.0354],
         [0.0199],
         [0.0196],
         [0.0244],
         [0.0375],
         [0.0041],
         [0.0069],
         [0.0090],
         [0.0119],
         [0.0076]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0559],
         [0.0411],
         [0.0904],
         [0.0106],
         [0.0441],
         [0.0160],
         [0.0240],
         [0.0040],
         [0.0060],
         [0.0089],
         [0.0182],
         [0.0674]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1243],
         [0.0488],
         [0.0490],
         [0.0289],
         [0.0200],
         [0.0320],
         [0.0682],
         [0.0106],
         [0.0080],
    


Evaluating:  74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                               | 1470/2000 [01:09<00:24, 21.72it/s][A


reg attention sum per layer
tensor([[[0.0593],
         [0.0644],
         [0.0486],
         [0.0262],
         [0.0267],
         [0.0332],
         [0.0065],
         [0.0387],
         [0.0095],
         [0.0103],
         [0.0661],
         [0.0456]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0453],
         [0.0414],
         [0.0634],
         [0.0164],
         [0.0308],
         [0.0061],
         [0.0136],
         [0.0146],
         [0.0030],
         [0.0054],
         [0.0470],
         [0.0828]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0492],
         [0.0239],
         [0.0165],
         [0.0239],
         [0.0129],
         [0.0083],
         [0.0108],
         [0.0434],
         [0.0143],
         [0.0139],
         [0.0099],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0167],
         [0.1410],
         [0.0154],
         [0.0059],
         [0.0101],
         [0.0063],
         [0.0321],
         [0.0


Evaluating:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 1473/2000 [01:09<00:24, 21.79it/s][A
Evaluating:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 1476/2000 [01:09<00:23, 21.84it/s][A

reg attention sum per layer
tensor([[[0.1024],
         [0.0593],
         [0.0287],
         [0.0388],
         [0.0411],
         [0.0253],
         [0.0327],
         [0.0494],
         [0.0232],
         [0.0340],
         [0.0079],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0525],
         [0.0446],
         [0.0442],
         [0.0703],
         [0.0329],
         [0.0210],
         [0.0220],
         [0.0083],
         [0.0021],
         [0.0041],
         [0.1251],
         [0.0557]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0124],
         [0.0422],
         [0.0536],
         [0.0091],
         [0.0076],
         [0.0038],
         [0.0152],
         [0.0029],
         [0.0054],
         [0.0026],
         [0.0320],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0139],
         [0.0204],
         [0.0243],
         [0.0082],
         [0.0159],
         [0.0098],
         [0.0203],
         [0.00


Evaluating:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                              | 1479/2000 [01:09<00:24, 21.64it/s][A

tensor([[[0.0655],
         [0.2437],
         [0.0705],
         [0.0166],
         [0.0084],
         [0.0133],
         [0.0191],
         [0.0078],
         [0.0022],
         [0.0068],
         [0.0194],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.1539],
         [0.1197],
         [0.0353],
         [0.0285],
         [0.0037],
         [0.0106],
         [0.0175],
         [0.0201],
         [0.0120],
         [0.0222],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0236],
         [0.0437],
         [0.0570],
         [0.0112],
         [0.0565],
         [0.0198],
         [0.0316],
         [0.0144],
         [0.0169],
         [0.0111],
         [0.0941],
         [0.0536]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0501],
         [0.0627],
         [0.0280],
         [0.0199],
         [0.0232],
         [0.0158],
         [0.0336],
         [0.0137],
         [0.0073],
    


Evaluating:  74%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                              | 1482/2000 [01:10<00:23, 21.69it/s][A

tensor([[[0.0155],
         [0.0238],
         [0.0320],
         [0.0144],
         [0.0125],
         [0.0100],
         [0.0098],
         [0.0025],
         [0.0025],
         [0.0020],
         [0.0681],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0451],
         [0.0114],
         [0.0535],
         [0.0097],
         [0.0248],
         [0.0137],
         [0.0125],
         [0.0049],
         [0.0114],
         [0.0060],
         [0.0137],
         [0.0294]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0108],
         [0.0067],
         [0.0068],
         [0.0030],
         [0.0017],
         [0.0063],
         [0.0118],
         [0.0022],
         [0.0016],
         [0.0019],
         [0.0067],
         [0.0093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.0154],
         [0.0384],
         [0.0121],
         [0.0275],
         [0.0113],
         [0.0091],
         [0.0215],
         [0.0028],
    


Evaluating:  74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                             | 1485/2000 [01:10<00:23, 21.58it/s][A
Evaluating:  74%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                             | 1488/2000 [01:10<00:23, 21.64it/s][A


tensor([[[0.0456],
         [0.0385],
         [0.0639],
         [0.0261],
         [0.0161],
         [0.0066],
         [0.0246],
         [0.0079],
         [0.0050],
         [0.0075],
         [0.0232],
         [0.0178]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0286],
         [0.0564],
         [0.0873],
         [0.0155],
         [0.0180],
         [0.0121],
         [0.0203],
         [0.0057],
         [0.0064],
         [0.0090],
         [0.0281],
         [0.0967]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0275],
         [0.0648],
         [0.0042],
         [0.0142],
         [0.0115],
         [0.0163],
         [0.0047],
         [0.0022],
         [0.0028],
         [0.0216],
         [0.0275]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0606],
         [0.0518],
         [0.0407],
         [0.0145],
         [0.0367],
         [0.0064],
         [0.0336],
         [0.0233],
         [0.0165],
   


Evaluating:  75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                             | 1491/2000 [01:10<00:23, 21.69it/s][A

tensor([[[0.0182],
         [0.0901],
         [0.0491],
         [0.0169],
         [0.0157],
         [0.0110],
         [0.0087],
         [0.0062],
         [0.0058],
         [0.0035],
         [0.0272],
         [0.0351]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0423],
         [0.0512],
         [0.0914],
         [0.0218],
         [0.0750],
         [0.0192],
         [0.0455],
         [0.0134],
         [0.0267],
         [0.0235],
         [0.0286],
         [0.0509]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0565],
         [0.0160],
         [0.0436],
         [0.0200],
         [0.0312],
         [0.0166],
         [0.0128],
         [0.0093],
         [0.0015],
         [0.0092],
         [0.0158],
         [0.0272]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0560],
         [0.0869],
         [0.0561],
         [0.0369],
         [0.0274],
         [0.0308],
         [0.0245],
         [0.0399],
         [0.0036],
    


Evaluating:  75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                             | 1494/2000 [01:10<00:23, 21.72it/s][A
Evaluating:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                            | 1497/2000 [01:10<00:23, 21.65it/s][A

reg attention sum per layer
tensor([[[0.0258],
         [0.0263],
         [0.0241],
         [0.0073],
         [0.0235],
         [0.0190],
         [0.0171],
         [0.0375],
         [0.0123],
         [0.0212],
         [0.0283],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0599],
         [0.0699],
         [0.0391],
         [0.0293],
         [0.0614],
         [0.0135],
         [0.0220],
         [0.0028],
         [0.0082],
         [0.0069],
         [0.0444],
         [0.0560]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0349],
         [0.0040],
         [0.0247],
         [0.0031],
         [0.0067],
         [0.0309],
         [0.0276],
         [0.0024],
         [0.0028],
         [0.0061],
         [0.0038],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0350],
         [0.0527],
         [0.0454],
         [0.0112],
         [0.0057],
         [0.0351],
         [0.0160],
         [0.00


Evaluating:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                            | 1500/2000 [01:10<00:22, 21.74it/s][A


tensor([[[0.0455],
         [0.0421],
         [0.0847],
         [0.0183],
         [0.0348],
         [0.0362],
         [0.0359],
         [0.0071],
         [0.0087],
         [0.0282],
         [0.0357],
         [0.0520]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0500],
         [0.0174],
         [0.0257],
         [0.0092],
         [0.0244],
         [0.0037],
         [0.0157],
         [0.0173],
         [0.0290],
         [0.0114],
         [0.0046],
         [0.0215]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0236],
         [0.0257],
         [0.0534],
         [0.0071],
         [0.0119],
         [0.0033],
         [0.0085],
         [0.0055],
         [0.0152],
         [0.0037],
         [0.0180],
         [0.0311]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0454],
         [0.1034],
         [0.0413],
         [0.0075],
         [0.0173],
         [0.0090],
         [0.0259],
         [0.0068],
         [0.0120],
   


Evaluating:  75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                            | 1503/2000 [01:10<00:22, 21.71it/s][A

tensor([[[0.0274],
         [0.0162],
         [0.0673],
         [0.0144],
         [0.0542],
         [0.0037],
         [0.0050],
         [0.0391],
         [0.0073],
         [0.0202],
         [0.0085],
         [0.0213]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1579],
         [0.0323],
         [0.0197],
         [0.0130],
         [0.0239],
         [0.0226],
         [0.0257],
         [0.0296],
         [0.0047],
         [0.0543],
         [0.0083],
         [0.0067]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0498],
         [0.0386],
         [0.0274],
         [0.0186],
         [0.0201],
         [0.0780],
         [0.0085],
         [0.0238],
         [0.0040],
         [0.0072],
         [0.0493],
         [0.0288]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0204],
         [0.0416],
         [0.0679],
         [0.0168],
         [0.0471],
         [0.0135],
         [0.0111],
         [0.0111],
         [0.0093],
    


Evaluating:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                            | 1506/2000 [01:11<00:22, 21.69it/s][A
Evaluating:  75%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                           | 1509/2000 [01:11<00:22, 21.63it/s][A


reg attention sum per layer
tensor([[[0.0058],
         [0.0306],
         [0.0444],
         [0.0086],
         [0.0345],
         [0.0192],
         [0.0287],
         [0.0190],
         [0.0134],
         [0.0100],
         [0.1371],
         [0.0333]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0180],
         [0.0206],
         [0.0331],
         [0.0078],
         [0.0110],
         [0.0132],
         [0.0223],
         [0.0053],
         [0.0020],
         [0.0041],
         [0.0142],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0226],
         [0.0653],
         [0.0360],
         [0.0118],
         [0.0133],
         [0.0184],
         [0.0326],
         [0.0056],
         [0.0057],
         [0.0170],
         [0.0094],
         [0.0246]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0876],
         [0.1661],
         [0.1253],
         [0.0455],
         [0.0331],
         [0.0104],
         [0.0775],
         [0.0


Evaluating:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                           | 1512/2000 [01:11<00:23, 20.91it/s][A


reg attention sum per layer
tensor([[[0.0130],
         [0.0153],
         [0.0880],
         [0.0067],
         [0.0102],
         [0.0050],
         [0.0115],
         [0.0014],
         [0.0022],
         [0.0017],
         [0.0295],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0419],
         [0.0369],
         [0.0931],
         [0.0238],
         [0.0639],
         [0.0112],
         [0.0179],
         [0.0235],
         [0.0175],
         [0.0079],
         [0.0309],
         [0.0612]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0434],
         [0.0488],
         [0.0374],
         [0.0246],
         [0.0380],
         [0.0437],
         [0.0162],
         [0.0251],
         [0.0069],
         [0.0107],
         [0.0356],
         [0.0155]]], device='cuda:0')



Evaluating:  76%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                           | 1515/2000 [01:11<00:24, 20.11it/s][A

reg attention sum per layer
tensor([[[0.0064],
         [0.0070],
         [0.0088],
         [0.0023],
         [0.0101],
         [0.0183],
         [0.0043],
         [0.0051],
         [0.0041],
         [0.0014],
         [0.0293],
         [0.0197]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2495],
         [0.0594],
         [0.0776],
         [0.0275],
         [0.0589],
         [0.1691],
         [0.0436],
         [0.0082],
         [0.0157],
         [0.0272],
         [0.0436],
         [0.0744]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0272],
         [0.0133],
         [0.0623],
         [0.0289],
         [0.0339],
         [0.0264],
         [0.0385],
         [0.0174],
         [0.0038],
         [0.0136],
         [0.0357],
         [0.0312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0276],
         [0.0185],
         [0.0094],
         [0.0111],
         [0.0077],
         [0.0139],
         [0.00


Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                           | 1518/2000 [01:11<00:24, 19.54it/s][A
Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                          | 1520/2000 [01:11<00:25, 18.91it/s][A

reg attention sum per layer
tensor([[[0.0496],
         [0.0234],
         [0.0489],
         [0.0041],
         [0.0425],
         [0.0345],
         [0.0091],
         [0.0092],
         [0.0029],
         [0.0209],
         [0.0119],
         [0.0144]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0899],
         [0.0542],
         [0.0668],
         [0.0342],
         [0.0664],
         [0.0047],
         [0.0533],
         [0.0264],
         [0.0370],
         [0.0213],
         [0.0079],
         [0.1145]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0425],
         [0.0321],
         [0.0787],
         [0.0262],
         [0.0339],
         [0.0223],
         [0.0241],
         [0.0073],
         [0.0028],
         [0.0059],
         [0.1875],
         [0.0755]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0548],
         [0.0408],
         [0.0279],
         [0.0352],
         [0.0154],
         [0.0165],
         [0.07


Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                          | 1522/2000 [01:11<00:25, 18.62it/s][A
Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 1524/2000 [01:12<00:26, 18.20it/s][A

reg attention sum per layer
tensor([[[0.0746],
         [0.0537],
         [0.0759],
         [0.0141],
         [0.0373],
         [0.0872],
         [0.0313],
         [0.0178],
         [0.0226],
         [0.0225],
         [0.0426],
         [0.0282]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0659],
         [0.0714],
         [0.0676],
         [0.0094],
         [0.0195],
         [0.0157],
         [0.0276],
         [0.0058],
         [0.0057],
         [0.0122],
         [0.0207],
         [0.0639]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0259],
         [0.0329],
         [0.0332],
         [0.0090],
         [0.0200],
         [0.0081],
         [0.0288],
         [0.0116],
         [0.0047],
         [0.0150],
         [0.0153],
         [0.0148]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0834],
         [0.0486],
         [0.0229],
         [0.0268],
         [0.0165],
         [0.0187],
         [0.0294],
         [0.00


Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                          | 1526/2000 [01:12<00:26, 18.16it/s][A
Evaluating:  76%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                          | 1528/2000 [01:12<00:25, 18.28it/s][A

reg attention sum per layer
tensor([[[0.0799],
         [0.0465],
         [0.1181],
         [0.0654],
         [0.0608],
         [0.0248],
         [0.0188],
         [0.0219],
         [0.0082],
         [0.0146],
         [0.0451],
         [0.0403]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0193],
         [0.0372],
         [0.0185],
         [0.0085],
         [0.0093],
         [0.0090],
         [0.0088],
         [0.0041],
         [0.0013],
         [0.0031],
         [0.0046],
         [0.0262]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0208],
         [0.0225],
         [0.0234],
         [0.0085],
         [0.0147],
         [0.0058],
         [0.0202],
         [0.0278],
         [0.0216],
         [0.0096],
         [0.0048],
         [0.0218]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0115],
         [0.0271],
         [0.0225],
         [0.0050],
         [0.0077],
         [0.0201],
         [0.0097],
         [0.01


Evaluating:  76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                         | 1530/2000 [01:12<00:25, 18.32it/s][A
Evaluating:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                         | 1532/2000 [01:12<00:25, 18.49it/s][A

reg attention sum per layer
tensor([[[0.0515],
         [0.0458],
         [0.0707],
         [0.0319],
         [0.0382],
         [0.0119],
         [0.0188],
         [0.0118],
         [0.0076],
         [0.0056],
         [0.0295],
         [0.0727]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0432],
         [0.0458],
         [0.0488],
         [0.0151],
         [0.0260],
         [0.0872],
         [0.0146],
         [0.0154],
         [0.0037],
         [0.0099],
         [0.0215],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0346],
         [0.0615],
         [0.0296],
         [0.0116],
         [0.0074],
         [0.0104],
         [0.0265],
         [0.0086],
         [0.0014],
         [0.0066],
         [0.0357],
         [0.0189]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0203],
         [0.0200],
         [0.0114],
         [0.0065],
         [0.0149],
         [0.0317],
         [0.0139],
         [0.00


Evaluating:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                         | 1535/2000 [01:12<00:24, 19.15it/s][A

tensor([[[0.0305],
         [0.0169],
         [0.0130],
         [0.0095],
         [0.0149],
         [0.0088],
         [0.0072],
         [0.0145],
         [0.0225],
         [0.0096],
         [0.0052],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0252],
         [0.0485],
         [0.0135],
         [0.0109],
         [0.0107],
         [0.0121],
         [0.0054],
         [0.0031],
         [0.0022],
         [0.0504],
         [0.0285]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0096],
         [0.0103],
         [0.0365],
         [0.0081],
         [0.0217],
         [0.0056],
         [0.0153],
         [0.0037],
         [0.0109],
         [0.0109],
         [0.0076],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0714],
         [0.0636],
         [0.0401],
         [0.0085],
         [0.0074],
         [0.0072],
         [0.0115],
         [0.0115],
         [0.0072],
    


Evaluating:  77%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                         | 1538/2000 [01:12<00:23, 19.84it/s][A
Evaluating:  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                        | 1541/2000 [01:12<00:22, 20.39it/s][A

tensor([[[0.0791],
         [0.0174],
         [0.0234],
         [0.0083],
         [0.0128],
         [0.0117],
         [0.0625],
         [0.0097],
         [0.0414],
         [0.0146],
         [0.0036],
         [0.0235]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0262],
         [0.0274],
         [0.0680],
         [0.0180],
         [0.0336],
         [0.0105],
         [0.0181],
         [0.0083],
         [0.0075],
         [0.0072],
         [0.0172],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0218],
         [0.0313],
         [0.0158],
         [0.0173],
         [0.0070],
         [0.0122],
         [0.0056],
         [0.0033],
         [0.0048],
         [0.0190],
         [0.0121]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0166],
         [0.0569],
         [0.0115],
         [0.0154],
         [0.0090],
         [0.0025],
         [0.0033],
         [0.0019],
    


Evaluating:  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                        | 1544/2000 [01:13<00:21, 20.79it/s][A

tensor([[[0.0108],
         [0.0202],
         [0.0286],
         [0.0068],
         [0.0419],
         [0.0091],
         [0.0167],
         [0.0122],
         [0.0130],
         [0.0062],
         [0.0254],
         [0.0237]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.6986],
         [0.2664],
         [0.0290],
         [0.0908],
         [0.0996],
         [0.0213],
         [0.1277],
         [0.0469],
         [0.0699],
         [0.2590],
         [0.0076],
         [0.0612]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0123],
         [0.0956],
         [0.0124],
         [0.0952],
         [0.0290],
         [0.0061],
         [0.0140],
         [0.0041],
         [0.0093],
         [0.2013],
         [0.0251]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0200],
         [0.0520],
         [0.1059],
         [0.0279],
         [0.0266],
         [0.0139],
         [0.0075],
         [0.0059],
         [0.0038],
    


Evaluating:  77%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                        | 1547/2000 [01:13<00:21, 21.13it/s][A
Evaluating:  78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                        | 1550/2000 [01:13<00:21, 21.37it/s][A

reg attention sum per layer
tensor([[[0.0230],
         [0.0409],
         [0.0524],
         [0.0114],
         [0.0194],
         [0.0075],
         [0.0219],
         [0.0133],
         [0.0179],
         [0.0086],
         [0.0126],
         [0.0239]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0360],
         [0.0103],
         [0.0458],
         [0.0053],
         [0.0112],
         [0.0052],
         [0.0163],
         [0.0138],
         [0.0064],
         [0.0065],
         [0.0142],
         [0.0584]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0177],
         [0.0238],
         [0.0041],
         [0.0238],
         [0.0106],
         [0.0104],
         [0.0070],
         [0.0037],
         [0.0109],
         [0.0085],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0107],
         [0.0215],
         [0.0956],
         [0.0183],
         [0.0213],
         [0.0125],
         [0.0242],
         [0.00


Evaluating:  78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                       | 1553/2000 [01:13<00:20, 21.50it/s][A

tensor([[[0.0025],
         [0.0209],
         [0.0454],
         [0.0087],
         [0.0035],
         [0.0047],
         [0.0055],
         [0.0003],
         [0.0030],
         [0.0020],
         [0.0522],
         [0.0327]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0256],
         [0.0577],
         [0.1042],
         [0.0371],
         [0.0524],
         [0.0346],
         [0.0148],
         [0.0201],
         [0.0064],
         [0.0073],
         [0.0383],
         [0.0345]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0083],
         [0.0369],
         [0.0651],
         [0.0048],
         [0.0294],
         [0.0146],
         [0.0062],
         [0.0172],
         [0.0036],
         [0.0047],
         [0.0222],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0339],
         [0.1264],
         [0.0207],
         [0.0274],
         [0.0159],
         [0.0109],
         [0.0473],
         [0.0046],
         [0.0091],
    


Evaluating:  78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                       | 1556/2000 [01:13<00:20, 21.54it/s][A


tensor([[[0.0353],
         [0.0157],
         [0.0137],
         [0.0069],
         [0.0237],
         [0.0089],
         [0.0304],
         [0.0094],
         [0.0216],
         [0.0152],
         [0.0041],
         [0.0058]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0300],
         [0.0387],
         [0.1192],
         [0.0323],
         [0.0325],
         [0.0044],
         [0.0383],
         [0.0122],
         [0.0406],
         [0.0091],
         [0.0122],
         [0.1493]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0143],
         [0.0057],
         [0.0121],
         [0.0045],
         [0.0516],
         [0.0164],
         [0.0047],
         [0.0060],
         [0.0049],
         [0.0079],
         [0.0076],
         [0.0142]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0362],
         [0.0062],
         [0.0078],
         [0.0034],
         [0.0017],
         [0.0189],
         [0.0050],
         [0.0081],
    

Evaluating:  78%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                       | 1559/2000 [01:13<00:20, 21.71it/s][A
Evaluating:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                       | 1562/2000 [01:13<00:20, 21.74it/s][A

reg attention sum per layer
tensor([[[0.0086],
         [0.0241],
         [0.0126],
         [0.0097],
         [0.0086],
         [0.0366],
         [0.0104],
         [0.0025],
         [0.0011],
         [0.0059],
         [0.1154],
         [0.0424]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0564],
         [0.0443],
         [0.0134],
         [0.0329],
         [0.0043],
         [0.0074],
         [0.0486],
         [0.0068],
         [0.0027],
         [0.0072],
         [0.0434],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0290],
         [0.0370],
         [0.1091],
         [0.0209],
         [0.0274],
         [0.0403],
         [0.0444],
         [0.0134],
         [0.0101],
         [0.0130],
         [0.0791],
         [0.0472]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0183],
         [0.0147],
         [0.0912],
         [0.0298],
         [0.0227],
         [0.0072],
         [0.0070],
         [0.00


Evaluating:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 1565/2000 [01:14<00:20, 21.52it/s][A

tensor([[[0.0352],
         [0.0368],
         [0.0353],
         [0.0108],
         [0.0312],
         [0.0471],
         [0.0321],
         [0.0220],
         [0.0108],
         [0.0102],
         [0.0058],
         [0.0305]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0514],
         [0.0188],
         [0.0800],
         [0.0310],
         [0.1117],
         [0.0133],
         [0.0277],
         [0.0079],
         [0.0082],
         [0.0248],
         [0.0141],
         [0.0275]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0830],
         [0.0807],
         [0.0740],
         [0.0207],
         [0.0142],
         [0.0057],
         [0.0240],
         [0.0209],
         [0.0038],
         [0.0047],
         [0.0099],
         [0.0189]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0233],
         [0.0479],
         [0.0795],
         [0.0215],
         [0.0505],
         [0.0092],
         [0.0100],
         [0.0188],
         [0.0122],
    


Evaluating:  78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                      | 1568/2000 [01:14<00:20, 21.19it/s][A
Evaluating:  79%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                      | 1571/2000 [01:14<00:20, 21.42it/s][A

tensor([[[0.0204],
         [0.0094],
         [0.0233],
         [0.0066],
         [0.0062],
         [0.0096],
         [0.0086],
         [0.0181],
         [0.0095],
         [0.0035],
         [0.0082],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0315],
         [0.0524],
         [0.0171],
         [0.0178],
         [0.0044],
         [0.0076],
         [0.0083],
         [0.0031],
         [0.0072],
         [0.0038],
         [0.0172]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0432],
         [0.0131],
         [0.0043],
         [0.0078],
         [0.0144],
         [0.0463],
         [0.0051],
         [0.0028],
         [0.0042],
         [0.0142],
         [0.0202]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0147],
         [0.0233],
         [0.0092],
         [0.0047],
         [0.0040],
         [0.0142],
         [0.0019],
         [0.0016],
    


Evaluating:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                      | 1574/2000 [01:14<00:20, 21.26it/s][A


reg attention sum per layer
tensor([[[0.0157],
         [0.0166],
         [0.0419],
         [0.0076],
         [0.0178],
         [0.0071],
         [0.0043],
         [0.0196],
         [0.0086],
         [0.0026],
         [0.0138],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0298],
         [0.0210],
         [0.0134],
         [0.0135],
         [0.0100],
         [0.0063],
         [0.0476],
         [0.0056],
         [0.0145],
         [0.0226],
         [0.0292]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0301],
         [0.0479],
         [0.0232],
         [0.0151],
         [0.0287],
         [0.0420],
         [0.0228],
         [0.0022],
         [0.0033],
         [0.0085],
         [0.0509],
         [0.0210]]], device='cuda:0')
reg attention sum per layer



Evaluating:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                     | 1577/2000 [01:14<00:20, 20.97it/s][A

tensor([[[0.0444],
         [0.0284],
         [0.0346],
         [0.0186],
         [0.0096],
         [0.0056],
         [0.0379],
         [0.0193],
         [0.0239],
         [0.0166],
         [0.0053],
         [0.0316]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0152],
         [0.0389],
         [0.0376],
         [0.0092],
         [0.0246],
         [0.0076],
         [0.0203],
         [0.0035],
         [0.0033],
         [0.0045],
         [0.0400],
         [0.0222]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0433],
         [0.0250],
         [0.0520],
         [0.0211],
         [0.0222],
         [0.0050],
         [0.0126],
         [0.0295],
         [0.0072],
         [0.0120],
         [0.0070],
         [0.0150]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0300],
         [0.0215],
         [0.0399],
         [0.0079],
         [0.0362],
         [0.0298],
         [0.0165],
         [0.0229],
         [0.0104],
    


Evaluating:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                     | 1580/2000 [01:14<00:20, 20.71it/s][A
Evaluating:  79%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                     | 1583/2000 [01:14<00:20, 20.55it/s][A

reg attention sum per layer
tensor([[[0.0470],
         [0.0103],
         [0.0711],
         [0.0111],
         [0.0099],
         [0.0156],
         [0.0105],
         [0.0132],
         [0.0030],
         [0.0094],
         [0.0116],
         [0.0457]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0392],
         [0.0465],
         [0.0398],
         [0.0242],
         [0.0162],
         [0.0143],
         [0.0207],
         [0.0416],
         [0.0290],
         [0.0117],
         [0.0184],
         [0.0453]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0832],
         [0.0325],
         [0.0290],
         [0.0100],
         [0.0346],
         [0.0106],
         [0.0224],
         [0.0285],
         [0.0257],
         [0.0088],
         [0.0060],
         [0.0296]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0556],
         [0.0085],
         [0.0077],
         [0.0043],
         [0.0047],
         [0.0092],
         [0.0343],
         [0.01


Evaluating:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                    | 1586/2000 [01:15<00:20, 20.56it/s][A

reg attention sum per layer
tensor([[[0.0710],
         [0.0424],
         [0.0188],
         [0.0156],
         [0.0154],
         [0.0074],
         [0.0300],
         [0.0056],
         [0.0072],
         [0.0122],
         [0.0211],
         [0.0150]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1282],
         [0.0677],
         [0.0633],
         [0.0265],
         [0.0217],
         [0.0251],
         [0.0361],
         [0.0179],
         [0.0037],
         [0.0087],
         [0.0136],
         [0.0428]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0280],
         [0.0468],
         [0.0686],
         [0.0240],
         [0.0515],
         [0.0085],
         [0.0235],
         [0.0116],
         [0.0040],
         [0.0031],
         [0.0328],
         [0.0576]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0507],
         [0.0266],
         [0.1157],
         [0.0207],
         [0.0633],
         [0.0451],
         [0.0378],
         [0.05


Evaluating:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                    | 1589/2000 [01:15<00:19, 20.83it/s][A
Evaluating:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                    | 1592/2000 [01:15<00:19, 20.98it/s]

tensor([[[0.0072],
         [0.0355],
         [0.0423],
         [0.0158],
         [0.0151],
         [0.0038],
         [0.0281],
         [0.0035],
         [0.0069],
         [0.0089],
         [0.1690],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0445],
         [0.0266],
         [0.0150],
         [0.0100],
         [0.0086],
         [0.0044],
         [0.0447],
         [0.0154],
         [0.0141],
         [0.0111],
         [0.0033],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0305],
         [0.0564],
         [0.0794],
         [0.0197],
         [0.0532],
         [0.0414],
         [0.0130],
         [0.0099],
         [0.0092],
         [0.0112],
         [0.0782],
         [0.0434]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0320],
         [0.0726],
         [0.0484],
         [0.0220],
         [0.0103],
         [0.0104],
         [0.0353],
         [0.0138],
         [0.0041],
    

[A
Evaluating:  80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                    | 1595/2000 [01:15<00:19, 21.17it/s][A

reg attention sum per layer
tensor([[[0.0070],
         [0.0126],
         [0.0193],
         [0.0038],
         [0.0111],
         [0.0039],
         [0.0073],
         [0.0042],
         [0.0034],
         [0.0035],
         [0.0445],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0789],
         [0.0293],
         [0.0997],
         [0.0220],
         [0.0436],
         [0.0208],
         [0.0637],
         [0.0037],
         [0.0077],
         [0.0908],
         [0.0297],
         [0.0673]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0994],
         [0.0730],
         [0.1087],
         [0.0402],
         [0.0365],
         [0.0317],
         [0.0522],
         [0.0857],
         [0.0645],
         [0.0382],
         [0.0628],
         [0.0891]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0165],
         [0.0797],
         [0.1266],
         [0.0507],
         [0.0274],
         [0.0156],
         [0.0156],
         [0.01


Evaluating:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                   | 1598/2000 [01:15<00:18, 21.40it/s][A

tensor([[[0.0094],
         [0.0124],
         [0.0229],
         [0.0037],
         [0.0128],
         [0.0125],
         [0.0092],
         [0.0058],
         [0.0034],
         [0.0041],
         [0.0076],
         [0.0265]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0483],
         [0.0356],
         [0.0542],
         [0.0140],
         [0.1665],
         [0.0563],
         [0.0054],
         [0.0465],
         [0.0051],
         [0.0231],
         [0.0218],
         [0.0211]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0220],
         [0.0433],
         [0.0948],
         [0.0132],
         [0.0141],
         [0.0082],
         [0.0203],
         [0.0086],
         [0.0040],
         [0.0072],
         [0.0268],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0895],
         [0.0560],
         [0.0158],
         [0.0080],
         [0.0181],
         [0.0392],
         [0.0289],
         [0.0160],
         [0.0052],
    


Evaluating:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                   | 1601/2000 [01:15<00:18, 21.11it/s][A
Evaluating:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                   | 1604/2000 [01:15<00:18, 21.04it/s][A

tensor([[[0.0329],
         [0.0344],
         [0.0329],
         [0.0117],
         [0.0195],
         [0.0217],
         [0.0141],
         [0.1142],
         [0.0107],
         [0.0132],
         [0.0103],
         [0.0316]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0703],
         [0.0299],
         [0.1080],
         [0.0300],
         [0.0485],
         [0.0093],
         [0.0123],
         [0.0574],
         [0.0065],
         [0.0189],
         [0.0234],
         [0.1080]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1370],
         [0.0578],
         [0.1539],
         [0.0241],
         [0.0682],
         [0.0226],
         [0.0462],
         [0.0272],
         [0.0176],
         [0.0167],
         [0.0754],
         [0.0481]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0075],
         [0.0061],
         [0.0167],
         [0.0195],
         [0.0056],
         [0.0107],
         [0.0032],
         [0.0018],
         [0.0010],
    


Evaluating:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                   | 1607/2000 [01:16<00:18, 21.20it/s][A

tensor([[[0.0410],
         [0.0105],
         [0.0532],
         [0.0787],
         [0.0364],
         [0.0118],
         [0.0190],
         [0.0058],
         [0.0058],
         [0.0129],
         [0.0325],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0168],
         [0.0273],
         [0.0188],
         [0.0095],
         [0.0056],
         [0.0150],
         [0.0080],
         [0.0143],
         [0.0029],
         [0.0039],
         [0.0238],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0405],
         [0.0502],
         [0.0271],
         [0.0147],
         [0.0260],
         [0.0231],
         [0.0327],
         [0.0175],
         [0.0284],
         [0.0182],
         [0.0089],
         [0.0383]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0382],
         [0.0502],
         [0.1960],
         [0.0589],
         [0.0428],
         [0.0137],
         [0.0158],
         [0.0611],
         [0.0060],
    


Evaluating:  80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                  | 1610/2000 [01:16<00:18, 21.28it/s][A
Evaluating:  81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 1613/2000 [01:16<00:18, 21.46it/s][A


reg attention sum per layer
tensor([[[0.0253],
         [0.0209],
         [0.0242],
         [0.0108],
         [0.0077],
         [0.0112],
         [0.0238],
         [0.0021],
         [0.0024],
         [0.0056],
         [0.0082],
         [0.0210]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0535],
         [0.0350],
         [0.0413],
         [0.0145],
         [0.0323],
         [0.0234],
         [0.0144],
         [0.0020],
         [0.0020],
         [0.0020],
         [0.0894],
         [0.0815]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0482],
         [0.0857],
         [0.0320],
         [0.0193],
         [0.0269],
         [0.0258],
         [0.0247],
         [0.0336],
         [0.0079],
         [0.0111],
         [0.0107],
         [0.0401]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0323],
         [0.0557],
         [0.0248],
         [0.0075],
         [0.0056],
         [0.0105],
         [0.0


Evaluating:  81%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 1616/2000 [01:16<00:17, 21.51it/s][A

reg attention sum per layer
tensor([[[0.0111],
         [0.0165],
         [0.0459],
         [0.0162],
         [0.0094],
         [0.0067],
         [0.0064],
         [0.0051],
         [0.0033],
         [0.0037],
         [0.1086],
         [0.0176]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0112],
         [0.0157],
         [0.0366],
         [0.0059],
         [0.0192],
         [0.0152],
         [0.0105],
         [0.0285],
         [0.0047],
         [0.0057],
         [0.0178],
         [0.0207]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0235],
         [0.0239],
         [0.0202],
         [0.0047],
         [0.0066],
         [0.0106],
         [0.0077],
         [0.0102],
         [0.0038],
         [0.0072],
         [0.0317],
         [0.0118]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0279],
         [0.0168],
         [0.0763],
         [0.0184],
         [0.0185],
         [0.0081],
         [0.0363],
         [0.02


Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                  | 1619/2000 [01:16<00:17, 21.46it/s][A

tensor([[[0.0187],
         [0.0196],
         [0.0669],
         [0.0137],
         [0.0311],
         [0.0114],
         [0.0062],
         [0.0097],
         [0.0158],
         [0.0063],
         [0.0168],
         [0.0506]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0879],
         [0.0706],
         [0.0428],
         [0.0204],
         [0.0348],
         [0.0213],
         [0.0411],
         [0.0420],
         [0.0709],
         [0.0337],
         [0.0073],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0620],
         [0.0375],
         [0.0435],
         [0.0077],
         [0.0155],
         [0.0149],
         [0.0165],
         [0.0074],
         [0.0016],
         [0.0058],
         [0.0170],
         [0.0214]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0261],
         [0.0541],
         [0.0103],
         [0.0293],
         [0.0146],
         [0.0088],
         [0.0168],
         [0.0097],
    


Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                 | 1622/2000 [01:16<00:17, 21.51it/s][A
Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                 | 1625/2000 [01:16<00:17, 21.60it/s][A

tensor([[[0.0536],
         [0.0829],
         [0.0354],
         [0.0419],
         [0.0163],
         [0.0115],
         [0.0205],
         [0.0224],
         [0.0077],
         [0.0242],
         [0.1259],
         [0.0661]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0100],
         [0.0252],
         [0.0492],
         [0.0178],
         [0.0064],
         [0.0053],
         [0.0014],
         [0.0021],
         [0.0028],
         [0.0008],
         [0.1277],
         [0.0664]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0183],
         [0.0612],
         [0.0478],
         [0.0053],
         [0.0062],
         [0.0037],
         [0.0082],
         [0.0082],
         [0.0020],
         [0.0022],
         [0.0142],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0513],
         [0.0258],
         [0.0061],
         [0.0171],
         [0.0211],
         [0.0508],
         [0.0105],
         [0.0034],
         [0.0019],
    


Evaluating:  81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                 | 1628/2000 [01:17<00:17, 21.52it/s][A

tensor([[[0.0236],
         [0.1150],
         [0.0989],
         [0.0528],
         [0.0515],
         [0.0347],
         [0.0404],
         [0.0195],
         [0.0283],
         [0.0181],
         [0.1154],
         [0.0796]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0156],
         [0.0315],
         [0.0210],
         [0.0269],
         [0.0033],
         [0.0067],
         [0.0163],
         [0.0027],
         [0.0078],
         [0.0224],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0062],
         [0.0236],
         [0.0785],
         [0.0115],
         [0.0054],
         [0.0104],
         [0.0208],
         [0.0081],
         [0.0063],
         [0.0021],
         [0.0253],
         [0.0394]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0285],
         [0.0154],
         [0.0082],
         [0.0131],
         [0.0072],
         [0.0245],
         [0.0149],
         [0.0123],
    


Evaluating:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                | 1631/2000 [01:17<00:17, 21.60it/s][A
Evaluating:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                | 1634/2000 [01:17<00:16, 21.71it/s][A

tensor([[[0.0072],
         [0.0124],
         [0.0291],
         [0.0085],
         [0.0191],
         [0.0060],
         [0.0044],
         [0.0014],
         [0.0006],
         [0.0029],
         [0.0320],
         [0.0660]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0472],
         [0.0471],
         [0.0075],
         [0.0130],
         [0.0061],
         [0.0046],
         [0.0165],
         [0.0086],
         [0.0009],
         [0.0062],
         [0.0074],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0211],
         [0.0162],
         [0.1925],
         [0.0100],
         [0.0852],
         [0.0141],
         [0.0060],
         [0.0207],
         [0.0046],
         [0.0060],
         [0.0216],
         [0.0759]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0386],
         [0.0276],
         [0.0382],
         [0.0163],
         [0.0225],
         [0.0248],
         [0.0132],
         [0.0165],
         [0.0023],
    


Evaluating:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                | 1637/2000 [01:17<00:16, 21.83it/s][A

reg attention sum per layer
tensor([[[0.0371],
         [0.0672],
         [0.0943],
         [0.0228],
         [0.0334],
         [0.0168],
         [0.0115],
         [0.0131],
         [0.0034],
         [0.0059],
         [0.0196],
         [0.0478]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0389],
         [0.0290],
         [0.0233],
         [0.0038],
         [0.0098],
         [0.0313],
         [0.0523],
         [0.0068],
         [0.0044],
         [0.0584],
         [0.0189],
         [0.0108]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0355],
         [0.0187],
         [0.0263],
         [0.0117],
         [0.0089],
         [0.0112],
         [0.0218],
         [0.0094],
         [0.0024],
         [0.0027],
         [0.0058],
         [0.0099]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1530],
         [0.0853],
         [0.0274],
         [0.0193],
         [0.0148],
         [0.0070],
         [0.0174],
         [0.02


Evaluating:  82%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                | 1640/2000 [01:17<00:16, 21.82it/s][A

tensor([[[0.0158],
         [0.0277],
         [0.0063],
         [0.0146],
         [0.0047],
         [0.0096],
         [0.0113],
         [0.0023],
         [0.0014],
         [0.0070],
         [0.0139],
         [0.0125]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1135],
         [0.0424],
         [0.0241],
         [0.0089],
         [0.0243],
         [0.0052],
         [0.0451],
         [0.0283],
         [0.0729],
         [0.0417],
         [0.0033],
         [0.0222]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0414],
         [0.0652],
         [0.0473],
         [0.0240],
         [0.0120],
         [0.0121],
         [0.0432],
         [0.0208],
         [0.0278],
         [0.0125],
         [0.0280],
         [0.0526]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0677],
         [0.0819],
         [0.0435],
         [0.0264],
         [0.0376],
         [0.0454],
         [0.0238],
         [0.0161],
         [0.0100],
    


Evaluating:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                               | 1643/2000 [01:17<00:16, 21.86it/s][A
Evaluating:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                               | 1646/2000 [01:17<00:16, 21.75it/s][A


reg attention sum per layer
tensor([[[0.0470],
         [0.0437],
         [0.0256],
         [0.0213],
         [0.0130],
         [0.0130],
         [0.0281],
         [0.0149],
         [0.0130],
         [0.0141],
         [0.0067],
         [0.0183]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0055],
         [0.0240],
         [0.0383],
         [0.0091],
         [0.0212],
         [0.0068],
         [0.0117],
         [0.0026],
         [0.0076],
         [0.0042],
         [0.1138],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0343],
         [0.0198],
         [0.0252],
         [0.0147],
         [0.0927],
         [0.0195],
         [0.0109],
         [0.0090],
         [0.0049],
         [0.0058],
         [0.0362],
         [0.0115]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0565],
         [0.0246],
         [0.0429],
         [0.0117],
         [0.0173],
         [0.0646],
         [0.0225],
         [0.0


Evaluating:  82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                               | 1649/2000 [01:17<00:16, 21.57it/s][A


reg attention sum per layer
tensor([[[0.0043],
         [0.0029],
         [0.0085],
         [0.0020],
         [0.0068],
         [0.0029],
         [0.0006],
         [0.0077],
         [0.0002],
         [0.0006],
         [0.0036],
         [0.0028]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0070],
         [0.0061],
         [0.0479],
         [0.0078],
         [0.0226],
         [0.0139],
         [0.0213],
         [0.0085],
         [0.0032],
         [0.0024],
         [0.0146],
         [0.0303]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0345],
         [0.0185],
         [0.0788],
         [0.0338],
         [0.0423],
         [0.0096],
         [0.0266],
         [0.0249],
         [0.0060],
         [0.0085],
         [0.0189],
         [0.0810]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1330],
         [0.0440],
         [0.0166],
         [0.0159],
         [0.0152],
         [0.0209],
         [0.0400],
         [0.0


Evaluating:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                               | 1652/2000 [01:18<00:16, 21.55it/s][A
Evaluating:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                              | 1655/2000 [01:18<00:15, 21.58it/s][A


reg attention sum per layer
tensor([[[0.0262],
         [0.0359],
         [0.0139],
         [0.0043],
         [0.0025],
         [0.0104],
         [0.0153],
         [0.0126],
         [0.0024],
         [0.0050],
         [0.0045],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0389],
         [0.0182],
         [0.0235],
         [0.0137],
         [0.0102],
         [0.0236],
         [0.0300],
         [0.0140],
         [0.0113],
         [0.0071],
         [0.0104],
         [0.0193]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0357],
         [0.0397],
         [0.0143],
         [0.0068],
         [0.0097],
         [0.0235],
         [0.0105],
         [0.0032],
         [0.0025],
         [0.0048],
         [0.0223],
         [0.0155]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0312],
         [0.0240],
         [0.0443],
         [0.0299],
         [0.0194],
         [0.0188],
         [0.0208],
         [0.0


Evaluating:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                              | 1658/2000 [01:18<00:15, 21.69it/s][A

reg attention sum per layer
tensor([[[0.0501],
         [0.0983],
         [0.0588],
         [0.0396],
         [0.0296],
         [0.0075],
         [0.1075],
         [0.0366],
         [0.0259],
         [0.0241],
         [0.0684],
         [0.0945]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1396],
         [0.0737],
         [0.0374],
         [0.0228],
         [0.0165],
         [0.0095],
         [0.0394],
         [0.0060],
         [0.0038],
         [0.0100],
         [0.0200],
         [0.0183]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0473],
         [0.0151],
         [0.0535],
         [0.0053],
         [0.0068],
         [0.0141],
         [0.0074],
         [0.0106],
         [0.0027],
         [0.0168],
         [0.0129],
         [0.0064]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0204],
         [0.0433],
         [0.0206],
         [0.0134],
         [0.0071],
         [0.0135],
         [0.01


Evaluating:  83%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 1661/2000 [01:18<00:15, 21.67it/s][A

tensor([[[0.1562],
         [0.0651],
         [0.0560],
         [0.0347],
         [0.0279],
         [0.0041],
         [0.0194],
         [0.0127],
         [0.0030],
         [0.0129],
         [0.0056],
         [0.0200]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0348],
         [0.0403],
         [0.2422],
         [0.0275],
         [0.0516],
         [0.0884],
         [0.0224],
         [0.1391],
         [0.0070],
         [0.0218],
         [0.0785],
         [0.1092]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0919],
         [0.2321],
         [0.1139],
         [0.0362],
         [0.0090],
         [0.0135],
         [0.0217],
         [0.0325],
         [0.0052],
         [0.0086],
         [0.0193],
         [0.0629]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0168],
         [0.0321],
         [0.0102],
         [0.0455],
         [0.0276],
         [0.0088],
         [0.0072],
         [0.0028],
    


Evaluating:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                              | 1664/2000 [01:18<00:15, 21.62it/s][A
Evaluating:  83%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                             | 1667/2000 [01:18<00:15, 21.67it/s][A

tensor([[[0.0373],
         [0.0307],
         [0.0490],
         [0.0125],
         [0.0286],
         [0.0321],
         [0.0197],
         [0.0044],
         [0.0065],
         [0.0145],
         [0.0052],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0570],
         [0.0397],
         [0.1128],
         [0.0337],
         [0.0804],
         [0.0100],
         [0.0327],
         [0.0180],
         [0.0334],
         [0.0183],
         [0.0161],
         [0.1375]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0169],
         [0.0111],
         [0.0334],
         [0.0091],
         [0.0103],
         [0.0029],
         [0.0122],
         [0.0040],
         [0.0027],
         [0.0037],
         [0.0027],
         [0.0135]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0883],
         [0.0211],
         [0.0305],
         [0.0258],
         [0.0193],
         [0.0217],
         [0.0170],
         [0.0061],
         [0.0039],
    


Evaluating:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 1670/2000 [01:18<00:15, 21.71it/s][A

tensor([[[0.0622],
         [0.0525],
         [0.0119],
         [0.0266],
         [0.0116],
         [0.0066],
         [0.0386],
         [0.0053],
         [0.0080],
         [0.0107],
         [0.0241],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0084],
         [0.0092],
         [0.0955],
         [0.0075],
         [0.0622],
         [0.0058],
         [0.0065],
         [0.0170],
         [0.0047],
         [0.0043],
         [0.0154],
         [0.0217]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0405],
         [0.0203],
         [0.0167],
         [0.0137],
         [0.0362],
         [0.0067],
         [0.0214],
         [0.0095],
         [0.0119],
         [0.0110],
         [0.0093],
         [0.0095]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0282],
         [0.1194],
         [0.0143],
         [0.0692],
         [0.0067],
         [0.0116],
         [0.0113],
         [0.0455],
    


Evaluating:  84%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                             | 1673/2000 [01:19<00:14, 21.83it/s][A
Evaluating:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                            | 1676/2000 [01:19<00:14, 21.87it/s][A

reg attention sum per layer
tensor([[[0.0103],
         [0.0172],
         [0.0255],
         [0.0196],
         [0.0494],
         [0.0097],
         [0.0042],
         [0.0014],
         [0.0092],
         [0.0049],
         [0.0243],
         [0.0179]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0212],
         [0.0727],
         [0.0114],
         [0.0411],
         [0.0299],
         [0.0051],
         [0.0012],
         [0.0043],
         [0.0033],
         [0.0709],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0367],
         [0.0246],
         [0.0394],
         [0.0339],
         [0.0133],
         [0.0077],
         [0.0144],
         [0.0025],
         [0.0032],
         [0.0020],
         [0.0254],
         [0.0289]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0321],
         [0.0618],
         [0.0195],
         [0.0385],
         [0.0210],
         [0.0329],
         [0.01


Evaluating:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                            | 1679/2000 [01:19<00:14, 21.80it/s][A

tensor([[[0.0293],
         [0.0720],
         [0.0354],
         [0.0076],
         [0.0158],
         [0.0130],
         [0.0148],
         [0.0426],
         [0.0060],
         [0.0082],
         [0.0132],
         [0.0229]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0507],
         [0.0228],
         [0.0263],
         [0.0132],
         [0.0286],
         [0.0026],
         [0.0137],
         [0.0055],
         [0.0031],
         [0.0066],
         [0.0065],
         [0.0250]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0093],
         [0.0227],
         [0.0158],
         [0.0097],
         [0.0022],
         [0.0061],
         [0.0055],
         [0.0017],
         [0.0038],
         [0.0070],
         [0.0197]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0176],
         [0.0393],
         [0.0448],
         [0.0149],
         [0.0072],
         [0.0073],
         [0.0039],
         [0.0327],
         [0.0079],
    


Evaluating:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                            | 1682/2000 [01:19<00:14, 21.75it/s][A


tensor([[[0.0426],
         [0.0273],
         [0.0351],
         [0.0134],
         [0.0223],
         [0.0129],
         [0.0189],
         [0.0095],
         [0.0099],
         [0.0122],
         [0.0099],
         [0.0318]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0053],
         [0.0137],
         [0.1133],
         [0.0102],
         [0.0757],
         [0.0152],
         [0.0072],
         [0.0065],
         [0.0030],
         [0.0035],
         [0.0759],
         [0.1128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0021],
         [0.0085],
         [0.0251],
         [0.0064],
         [0.0168],
         [0.0077],
         [0.0025],
         [0.0019],
         [0.0020],
         [0.0021],
         [0.0752],
         [0.0113]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0216],
         [0.0369],
         [0.0284],
         [0.0143],
         [0.0177],
         [0.0067],
         [0.0225],
         [0.0071],
         [0.0087],
    

Evaluating:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                            | 1685/2000 [01:19<00:14, 21.81it/s][A
Evaluating:  84%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                           | 1688/2000 [01:19<00:14, 21.48it/s][A

reg attention sum per layer
tensor([[[0.0365],
         [0.0583],
         [0.0996],
         [0.0121],
         [0.0320],
         [0.0052],
         [0.0336],
         [0.0087],
         [0.0077],
         [0.0276],
         [0.0116],
         [0.0319]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0123],
         [0.0400],
         [0.0416],
         [0.0137],
         [0.0238],
         [0.0184],
         [0.0046],
         [0.0114],
         [0.0026],
         [0.0064],
         [0.0963],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0238],
         [0.0341],
         [0.0634],
         [0.0052],
         [0.0139],
         [0.0295],
         [0.0228],
         [0.0073],
         [0.0099],
         [0.0082],
         [0.0307],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.


Evaluating:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                           | 1691/2000 [01:19<00:14, 21.53it/s][A
Evaluating:  85%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                           | 1694/2000 [01:20<00:14, 21.61it/s]

tensor([[[0.0810],
         [0.0169],
         [0.0240],
         [0.0163],
         [0.0189],
         [0.0188],
         [0.0340],
         [0.0121],
         [0.0055],
         [0.0180],
         [0.0096],
         [0.0128]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0211],
         [0.0350],
         [0.0394],
         [0.0119],
         [0.0082],
         [0.0024],
         [0.0066],
         [0.0048],
         [0.0012],
         [0.0027],
         [0.0130],
         [0.0114]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0440],
         [0.0166],
         [0.0552],
         [0.0410],
         [0.0721],
         [0.0238],
         [0.0226],
         [0.0136],
         [0.0107],
         [0.0182],
         [0.0334],
         [0.0391]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0360],
         [0.0186],
         [0.0123],
         [0.0078],
         [0.0128],
         [0.0055],
         [0.0059],
         [0.0045],
         [0.0007],
    

[A
Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                           | 1697/2000 [01:20<00:14, 21.62it/s][A

reg attention sum per layer
tensor([[[0.0060],
         [0.0089],
         [0.0255],
         [0.0030],
         [0.0072],
         [0.0097],
         [0.0037],
         [0.0035],
         [0.0006],
         [0.0012],
         [0.0085],
         [0.0065]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0395],
         [0.1235],
         [0.0650],
         [0.0233],
         [0.0115],
         [0.0136],
         [0.0535],
         [0.0090],
         [0.0154],
         [0.0157],
         [0.0262],
         [0.0542]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0194],
         [0.0726],
         [0.0048],
         [0.0148],
         [0.0081],
         [0.0061],
         [0.0237],
         [0.0025],
         [0.0036],
         [0.0527],
         [0.0116]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0374],
         [0.0266],
         [0.0444],
         [0.0141],
         [0.0106],
         [0.0164],
         [0.0283],
         [0.00


Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                          | 1700/2000 [01:20<00:13, 21.62it/s][A

tensor([[[0.0165],
         [0.0066],
         [0.0201],
         [0.0038],
         [0.0101],
         [0.0059],
         [0.0066],
         [0.0012],
         [0.0006],
         [0.0151],
         [0.0146],
         [0.0124]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0144],
         [0.0313],
         [0.0954],
         [0.0124],
         [0.0262],
         [0.0121],
         [0.0166],
         [0.0059],
         [0.0087],
         [0.0044],
         [0.0219],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0233],
         [0.0276],
         [0.0672],
         [0.0125],
         [0.0600],
         [0.0159],
         [0.0111],
         [0.0046],
         [0.0061],
         [0.0266],
         [0.0155],
         [0.0214]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.0367],
         [0.0372],
         [0.0162],
         [0.0712],
         [0.0061],
         [0.0161],
         [0.0226],
         [0.0113],
    


Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                          | 1703/2000 [01:20<00:13, 21.63it/s][A
Evaluating:  85%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                          | 1706/2000 [01:20<00:13, 21.73it/s][A

tensor([[[0.0410],
         [0.0346],
         [0.0948],
         [0.0229],
         [0.0192],
         [0.0146],
         [0.0055],
         [0.0041],
         [0.0011],
         [0.0021],
         [0.0547],
         [0.0541]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0129],
         [0.0115],
         [0.0218],
         [0.0138],
         [0.0135],
         [0.0064],
         [0.0123],
         [0.0035],
         [0.0076],
         [0.0023],
         [0.0193],
         [0.0080]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0166],
         [0.0318],
         [0.0068],
         [0.0333],
         [0.0084],
         [0.0102],
         [0.0087],
         [0.0071],
         [0.0237],
         [0.0042],
         [0.0229]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0626],
         [0.0546],
         [0.0636],
         [0.0143],
         [0.0141],
         [0.0405],
         [0.1134],
         [0.0026],
         [0.0051],
    


Evaluating:  85%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                          | 1709/2000 [01:20<00:13, 21.51it/s][A

tensor([[[0.0055],
         [0.0090],
         [0.0435],
         [0.0073],
         [0.0088],
         [0.0040],
         [0.0064],
         [0.0165],
         [0.0024],
         [0.0019],
         [0.0243],
         [0.0085]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0490],
         [0.1082],
         [0.1625],
         [0.0283],
         [0.0346],
         [0.0180],
         [0.0195],
         [0.0133],
         [0.0063],
         [0.0085],
         [0.0602],
         [0.0378]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0195],
         [0.0266],
         [0.0260],
         [0.0083],
         [0.0049],
         [0.0031],
         [0.0075],
         [0.0017],
         [0.0003],
         [0.0011],
         [0.0376],
         [0.0333]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0290],
         [0.0530],
         [0.0617],
         [0.0128],
         [0.0193],
         [0.0083],
         [0.0124],
         [0.0328],
         [0.0311],
    


Evaluating:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 1712/2000 [01:20<00:13, 21.60it/s][A

tensor([[[0.0434],
         [0.0211],
         [0.0145],
         [0.0095],
         [0.0087],
         [0.0021],
         [0.0506],
         [0.0107],
         [0.0064],
         [0.0070],
         [0.0066],
         [0.0042]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0041],
         [0.0047],
         [0.0101],
         [0.0044],
         [0.0261],
         [0.0216],
         [0.0009],
         [0.0028],
         [0.0004],
         [0.0030],
         [0.0095],
         [0.0025]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0349],
         [0.0279],
         [0.0375],
         [0.0123],
         [0.0332],
         [0.0085],
         [0.0244],
         [0.0082],
         [0.0041],
         [0.0093],
         [0.0322],
         [0.0327]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1449],
         [0.1123],
         [0.0242],
         [0.0166],
         [0.0221],
         [0.0057],
         [0.0238],
         [0.0139],
         [0.0065],
    


Evaluating:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                         | 1715/2000 [01:21<00:13, 21.56it/s][A
Evaluating:  86%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                         | 1718/2000 [01:21<00:12, 21.73it/s][A


reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0606],
         [0.0807],
         [0.0098],
         [0.0300],
         [0.0537],
         [0.0455],
         [0.0424],
         [0.0047],
         [0.0092],
         [0.0231],
         [0.1838],
         [0.0242]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0250],
         [0.0650],
         [0.0353],
         [0.0172],
         [0.0083],
         [0.0447],
         [0.0507],
         [0.0054],
         [0.0079],
         [0.0042],
         [0.0414],
         [0.0561]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0402],
         [0.0355],
         [0.0560],
         [0.0141],
         [0.0908],
         [0.0221],
         [0.0218],
         [0.0147],
         [0.0199],
         [0.0137],
    


Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                        | 1721/2000 [01:21<00:12, 21.84it/s][A

reg attention sum per layer
tensor([[[0.1684],
         [0.0614],
         [0.0330],
         [0.0175],
         [0.0084],
         [0.0109],
         [0.0460],
         [0.0478],
         [0.0072],
         [0.0435],
         [0.0056],
         [0.0348]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0154],
         [0.0150],
         [0.0348],
         [0.0068],
         [0.0087],
         [0.0338],
         [0.0087],
         [0.0013],
         [0.0016],
         [0.0022],
         [0.0770],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0339],
         [0.1402],
         [0.0355],
         [0.0182],
         [0.0421],
         [0.0307],
         [0.0142],
         [0.0060],
         [0.0071],
         [0.0191],
         [0.0270],
         [0.0345]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0299],
         [0.1331],
         [0.0068],
         [0.0141],
         [0.0102],
         [0.0355],
         [0.01


Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                        | 1724/2000 [01:21<00:12, 21.36it/s][A
Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                        | 1727/2000 [01:21<00:12, 21.31it/s][A

tensor([[[0.0042],
         [0.0113],
         [0.0285],
         [0.0057],
         [0.0306],
         [0.0115],
         [0.0043],
         [0.0044],
         [0.0004],
         [0.0036],
         [0.0483],
         [0.0091]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0341],
         [0.0444],
         [0.0157],
         [0.0408],
         [0.0576],
         [0.0042],
         [0.0111],
         [0.0014],
         [0.0013],
         [0.2579],
         [0.0534]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0299],
         [0.0695],
         [0.0851],
         [0.0150],
         [0.0206],
         [0.0083],
         [0.0286],
         [0.0222],
         [0.0066],
         [0.0135],
         [0.0095],
         [0.0681]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0040],
         [0.0041],
         [0.0181],
         [0.0058],
         [0.0252],
         [0.0109],
         [0.0030],
         [0.0007],
         [0.0008],
    


Evaluating:  86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                        | 1730/2000 [01:21<00:12, 20.92it/s][A

reg attention sum per layer
tensor([[[0.0204],
         [0.0276],
         [0.0634],
         [0.0126],
         [0.0102],
         [0.0382],
         [0.0078],
         [0.0022],
         [0.0021],
         [0.0086],
         [0.0570],
         [0.0526]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0431],
         [0.0457],
         [0.0339],
         [0.0151],
         [0.0129],
         [0.0067],
         [0.0325],
         [0.0267],
         [0.0034],
         [0.0072],
         [0.0154],
         [0.0149]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0397],
         [0.0220],
         [0.0167],
         [0.0115],
         [0.0059],
         [0.0069],
         [0.0395],
         [0.0017],
         [0.0023],
         [0.0027],
         [0.0156],
         [0.0286]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0171],
         [0.0415],
         [0.0125],
         [0.0080],
         [0.0092],
         [0.0244],
         [0.02


Evaluating:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                       | 1733/2000 [01:21<00:12, 20.86it/s][A

tensor([[[0.0355],
         [0.0253],
         [0.0689],
         [0.0101],
         [0.0059],
         [0.0366],
         [0.0112],
         [0.0044],
         [0.0019],
         [0.0041],
         [0.0162],
         [0.0192]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0155],
         [0.0144],
         [0.0076],
         [0.0121],
         [0.0190],
         [0.0017],
         [0.0017],
         [0.0008],
         [0.0039],
         [0.0262],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0518],
         [0.0489],
         [0.0168],
         [0.0140],
         [0.0448],
         [0.0123],
         [0.0101],
         [0.0018],
         [0.0069],
         [0.2165],
         [0.0449]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0069],
         [0.0232],
         [0.0141],
         [0.0017],
         [0.0057],
         [0.0028],
         [0.0027],
         [0.0026],
         [0.0079],
    


Evaluating:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                       | 1736/2000 [01:22<00:12, 20.61it/s][A
Evaluating:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                       | 1739/2000 [01:22<00:12, 20.78it/s][A

reg attention sum per layer
tensor([[[0.0866],
         [0.0298],
         [0.0159],
         [0.0090],
         [0.0223],
         [0.0099],
         [0.0896],
         [0.0126],
         [0.0041],
         [0.0198],
         [0.0041],
         [0.0069]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0424],
         [0.0674],
         [0.0466],
         [0.0185],
         [0.0072],
         [0.0245],
         [0.0220],
         [0.0059],
         [0.0041],
         [0.0088],
         [0.0430],
         [0.0332]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0370],
         [0.0123],
         [0.0157],
         [0.0172],
         [0.0300],
         [0.0182],
         [0.0166],
         [0.0046],
         [0.0036],
         [0.0030],
         [0.0091],
         [0.0206]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0105],
         [0.0247],
         [0.0276],
         [0.0047],
         [0.0266],
         [0.0065],
         [0.0086],
         [0.00


Evaluating:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                       | 1742/2000 [01:22<00:12, 20.94it/s][A


tensor([[[0.0156],
         [0.0178],
         [0.0433],
         [0.0116],
         [0.0190],
         [0.0026],
         [0.0189],
         [0.0037],
         [0.0103],
         [0.0070],
         [0.0212],
         [0.0166]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0025],
         [0.0020],
         [0.0031],
         [0.0023],
         [0.0042],
         [0.0040],
         [0.0046],
         [0.0009],
         [0.0003],
         [0.0007],
         [0.0044],
         [0.0088]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0149],
         [0.0208],
         [0.0319],
         [0.0080],
         [0.0333],
         [0.0212],
         [0.0231],
         [0.0230],
         [0.0071],
         [0.0146],
         [0.0399],
         [0.0203]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0153],
         [0.0121],
         [0.0042],
         [0.0052],
         [0.0147],
         [0.0132],
         [0.0031],
         [0.0032],
   


Evaluating:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                      | 1745/2000 [01:22<00:12, 20.75it/s][A

tensor([[[0.0309],
         [0.0089],
         [0.0221],
         [0.0152],
         [0.0051],
         [0.0073],
         [0.0167],
         [0.0035],
         [0.0025],
         [0.0046],
         [0.0094],
         [0.0079]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0028],
         [0.0260],
         [0.0336],
         [0.0095],
         [0.0104],
         [0.0077],
         [0.0032],
         [0.0042],
         [0.0018],
         [0.0009],
         [0.2832],
         [0.0320]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0249],
         [0.0459],
         [0.0879],
         [0.0332],
         [0.0344],
         [0.0041],
         [0.0293],
         [0.0162],
         [0.0170],
         [0.0123],
         [0.0101],
         [0.0382]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0989],
         [0.0479],
         [0.0483],
         [0.0339],
         [0.0620],
         [0.0124],
         [0.0188],
         [0.0185],
         [0.0171],
    


Evaluating:  87%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                      | 1748/2000 [01:22<00:12, 20.84it/s][A
Evaluating:  88%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                      | 1751/2000 [01:22<00:11, 20.98it/s][A

tensor([[[0.0124],
         [0.0510],
         [0.0253],
         [0.0055],
         [0.0053],
         [0.0274],
         [0.0101],
         [0.0069],
         [0.0026],
         [0.0047],
         [0.0500],
         [0.0215]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0062],
         [0.0043],
         [0.0040],
         [0.0017],
         [0.0137],
         [0.0048],
         [0.0032],
         [0.0051],
         [0.0017],
         [0.0076],
         [0.0050],
         [0.0056]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0940],
         [0.0229],
         [0.0154],
         [0.0210],
         [0.0098],
         [0.0081],
         [0.0454],
         [0.0284],
         [0.0189],
         [0.0198],
         [0.0037],
         [0.0195]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0574],
         [0.0850],
         [0.0329],
         [0.0158],
         [0.0089],
         [0.0062],
         [0.0174],
         [0.0031],
    


Evaluating:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                      | 1754/2000 [01:22<00:11, 21.27it/s][A

tensor([[[0.0431],
         [0.0386],
         [0.1066],
         [0.0172],
         [0.0905],
         [0.0087],
         [0.0093],
         [0.0205],
         [0.0033],
         [0.0051],
         [0.0484],
         [0.0827]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0282],
         [0.0637],
         [0.0189],
         [0.0081],
         [0.0063],
         [0.0124],
         [0.0077],
         [0.0008],
         [0.0050],
         [0.0139],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0101],
         [0.0085],
         [0.0788],
         [0.0123],
         [0.0179],
         [0.0086],
         [0.0047],
         [0.0047],
         [0.0143],
         [0.0083],
         [0.0308],
         [0.0198]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0409],
         [0.0623],
         [0.1405],
         [0.0369],
         [0.1034],
         [0.0314],
         [0.0255],
         [0.0234],
         [0.0207],
    


Evaluating:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                     | 1757/2000 [01:23<00:11, 21.29it/s][A
Evaluating:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                     | 1760/2000 [01:23<00:11, 21.34it/s][A

reg attention sum per layer
tensor([[[0.0171],
         [0.0341],
         [0.0663],
         [0.0263],
         [0.0265],
         [0.0140],
         [0.0104],
         [0.0091],
         [0.0041],
         [0.0050],
         [0.0353],
         [0.0449]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0364],
         [0.0221],
         [0.0284],
         [0.0291],
         [0.0394],
         [0.0082],
         [0.0077],
         [0.0140],
         [0.0088],
         [0.0117],
         [0.0148],
         [0.0258]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0211],
         [0.0475],
         [0.0483],
         [0.0195],
         [0.0382],
         [0.0481],
         [0.0227],
         [0.0121],
         [0.0089],
         [0.0055],
         [0.0975],
         [0.0551]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0089],
         [0.0159],
         [0.0239],
         [0.0039],
         [0.0028],
         [0.0025],
         [0.0104],
         [0.00


Evaluating:  88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                     | 1763/2000 [01:23<00:11, 21.34it/s][A


tensor([[[0.0276],
         [0.0313],
         [0.0368],
         [0.0067],
         [0.0066],
         [0.0152],
         [0.0104],
         [0.0015],
         [0.0011],
         [0.0041],
         [0.0169],
         [0.0097]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0138],
         [0.0083],
         [0.0447],
         [0.0137],
         [0.0098],
         [0.0021],
         [0.0044],
         [0.0077],
         [0.0014],
         [0.0013],
         [0.0177],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0399],
         [0.0237],
         [0.0329],
         [0.0180],
         [0.0122],
         [0.0188],
         [0.0113],
         [0.0198],
         [0.0031],
         [0.0103],
         [0.0254],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0727],
         [0.0394],
         [0.0533],
         [0.0219],
         [0.0211],
         [0.0272],
         [0.0278],
         [0.0286],
         [0.0064],
   


Evaluating:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                    | 1766/2000 [01:23<00:10, 21.48it/s][A

tensor([[[0.0257],
         [0.0224],
         [0.0383],
         [0.0108],
         [0.0117],
         [0.0056],
         [0.0344],
         [0.0065],
         [0.0070],
         [0.0081],
         [0.0178],
         [0.0151]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0027],
         [0.0346],
         [0.0119],
         [0.0026],
         [0.0015],
         [0.0001],
         [0.0070],
         [0.0004],
         [0.0004],
         [0.0015],
         [0.0109],
         [0.0021]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0134],
         [0.0438],
         [0.0355],
         [0.0542],
         [0.0283],
         [0.0108],
         [0.0348],
         [0.0119],
         [0.0038],
         [0.0083],
         [0.0350],
         [0.0287]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0565],
         [0.0739],
         [0.1058],
         [0.0215],
         [0.0346],
         [0.0339],
         [0.0676],
         [0.0720],
         [0.0260],
    


Evaluating:  88%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                    | 1769/2000 [01:23<00:10, 21.52it/s][A
Evaluating:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                    | 1772/2000 [01:23<00:10, 21.61it/s][A

tensor([[[0.0220],
         [0.0202],
         [0.0380],
         [0.0126],
         [0.0308],
         [0.0115],
         [0.0149],
         [0.0072],
         [0.0034],
         [0.0050],
         [0.0137],
         [0.0179]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0296],
         [0.0460],
         [0.0386],
         [0.0101],
         [0.0586],
         [0.0164],
         [0.0337],
         [0.0188],
         [0.0083],
         [0.0392],
         [0.0512],
         [0.0677]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0066],
         [0.0067],
         [0.0396],
         [0.0180],
         [0.0094],
         [0.0132],
         [0.0230],
         [0.0080],
         [0.0066],
         [0.0068],
         [0.0268],
         [0.0199]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0182],
         [0.0239],
         [0.0162],
         [0.0075],
         [0.0072],
         [0.0130],
         [0.0147],
         [0.0136],
         [0.0077],
    


Evaluating:  89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                    | 1775/2000 [01:23<00:10, 21.64it/s][A

tensor([[[0.0161],
         [0.0204],
         [0.0298],
         [0.0039],
         [0.0099],
         [0.0075],
         [0.0195],
         [0.0027],
         [0.0047],
         [0.0089],
         [0.0628],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1254],
         [0.0846],
         [0.0260],
         [0.0311],
         [0.0126],
         [0.0089],
         [0.0193],
         [0.0531],
         [0.0032],
         [0.0258],
         [0.0037],
         [0.0087]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.2062],
         [0.0512],
         [0.0449],
         [0.0272],
         [0.0365],
         [0.0100],
         [0.0135],
         [0.0392],
         [0.0066],
         [0.0129],
         [0.0140],
         [0.0387]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1567],
         [0.0806],
         [0.0458],
         [0.0264],
         [0.0648],
         [0.0281],
         [0.0535],
         [0.0253],
         [0.0166],
    


Evaluating:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                   | 1778/2000 [01:24<00:10, 21.73it/s][A
Evaluating:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                   | 1781/2000 [01:24<00:10, 21.80it/s][A

reg attention sum per layer
tensor([[[0.0082],
         [0.0205],
         [0.0298],
         [0.0114],
         [0.0065],
         [0.0041],
         [0.0050],
         [0.0024],
         [0.0013],
         [0.0012],
         [0.0367],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0166],
         [0.0578],
         [0.1601],
         [0.0299],
         [0.0387],
         [0.0108],
         [0.0057],
         [0.0150],
         [0.0073],
         [0.0024],
         [0.0314],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0236],
         [0.0390],
         [0.0561],
         [0.0089],
         [0.0724],
         [0.0451],
         [0.0138],
         [0.0193],
         [0.0159],
         [0.0072],
         [0.0647],
         [0.0194]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0996],
         [0.0770],
         [0.1071],
         [0.0291],
         [0.0526],
         [0.0289],
         [0.0486],
         [0.01


Evaluating:  89%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                   | 1784/2000 [01:24<00:09, 21.80it/s][A

tensor([[[0.0391],
         [0.0394],
         [0.1048],
         [0.0310],
         [0.0754],
         [0.0164],
         [0.0160],
         [0.0299],
         [0.0217],
         [0.0283],
         [0.0690],
         [0.0944]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0174],
         [0.0593],
         [0.0052],
         [0.0221],
         [0.0095],
         [0.0101],
         [0.0148],
         [0.0211],
         [0.0111],
         [0.0161],
         [0.0269]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0181],
         [0.0108],
         [0.0320],
         [0.0094],
         [0.0129],
         [0.0032],
         [0.0182],
         [0.0030],
         [0.0020],
         [0.0040],
         [0.0271],
         [0.0142]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0275],
         [0.0260],
         [0.0128],
         [0.0087],
         [0.0111],
         [0.0049],
         [0.0202],
         [0.0118],
         [0.0385],
    


Evaluating:  89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                   | 1787/2000 [01:24<00:09, 21.80it/s][A
Evaluating:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                  | 1790/2000 [01:24<00:09, 21.85it/s][A

tensor([[[0.0364],
         [0.0215],
         [0.0446],
         [0.0048],
         [0.0728],
         [0.0126],
         [0.0095],
         [0.0067],
         [0.0076],
         [0.0273],
         [0.0054],
         [0.0070]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0272],
         [0.0675],
         [0.1136],
         [0.0354],
         [0.0031],
         [0.0019],
         [0.0068],
         [0.0048],
         [0.0008],
         [0.0022],
         [0.0380],
         [0.0302]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0458],
         [0.0176],
         [0.0118],
         [0.0165],
         [0.0294],
         [0.0292],
         [0.0225],
         [0.0018],
         [0.0018],
         [0.0049],
         [0.0466],
         [0.0575]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0530],
         [0.0367],
         [0.0598],
         [0.0239],
         [0.0199],
         [0.0092],
         [0.0097],
         [0.0326],
         [0.0125],
    


Evaluating:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                  | 1793/2000 [01:24<00:09, 21.74it/s][A

reg attention sum per layer
tensor([[[0.1689],
         [0.1149],
         [0.0268],
         [0.0130],
         [0.0334],
         [0.0252],
         [0.0422],
         [0.0219],
         [0.0035],
         [0.0262],
         [0.0103],
         [0.0141]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0371],
         [0.1178],
         [0.0259],
         [0.0172],
         [0.0431],
         [0.0216],
         [0.0138],
         [0.0376],
         [0.0121],
         [0.0134],
         [0.0281],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0178],
         [0.0136],
         [0.0608],
         [0.0063],
         [0.0344],
         [0.0130],
         [0.0069],
         [0.0064],
         [0.0061],
         [0.0047],
         [0.0169],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0617],
         [0.0558],
         [0.0401],
         [0.0244],
         [0.0526],
         [0.0254],
         [0.0060],
         [0.00


Evaluating:  90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                  | 1796/2000 [01:24<00:09, 21.75it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0073],
         [0.0143],
         [0.0083],
         [0.0028],
         [0.0049],
         [0.0088],
         [0.0104],
         [0.0023],
         [0.0021],
         [0.0015],
         [0.0057],
         [0.0057]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0193],
         [0.0027],
         [0.0339],
         [0.0166],
         [0.0463],
         [0.0080],
         [0.0144],
         [0.0115],
         [0.0096],
         [0.0133],
         [0.0057],
         [0.0170]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0106],
         [0.0327],
         [0.0239],
         [0.0071],
         [0.0110],
         [0.0243],
         [0.0469],
         [0.0079],
         [0.0035],
         [0.0052],
         [0.0530],
         [0.02


Evaluating:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                  | 1799/2000 [01:24<00:09, 21.77it/s][A
Evaluating:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                 | 1802/2000 [01:25<00:09, 21.78it/s][A

tensor([[[0.0180],
         [0.0053],
         [0.0089],
         [0.0124],
         [0.0020],
         [0.0124],
         [0.0031],
         [0.0096],
         [0.0001],
         [0.0014],
         [0.0199],
         [0.0103]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0644],
         [0.0777],
         [0.0923],
         [0.0448],
         [0.0447],
         [0.0515],
         [0.0298],
         [0.0127],
         [0.0055],
         [0.0071],
         [0.0560],
         [0.0378]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0992],
         [0.0089],
         [0.0227],
         [0.0243],
         [0.0319],
         [0.0174],
         [0.0302],
         [0.0315],
         [0.0398],
         [0.0153],
         [0.0049],
         [0.0588]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0373],
         [0.0490],
         [0.0960],
         [0.0426],
         [0.0500],
         [0.0138],
         [0.0188],
         [0.0072],
         [0.0055],
    


Evaluating:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                 | 1805/2000 [01:25<00:08, 21.83it/s][A


tensor([[[0.0465],
         [0.0345],
         [0.0262],
         [0.0113],
         [0.0213],
         [0.0084],
         [0.0257],
         [0.0108],
         [0.0097],
         [0.0065],
         [0.0055],
         [0.0283]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1147],
         [0.0188],
         [0.0098],
         [0.0065],
         [0.0139],
         [0.0069],
         [0.1203],
         [0.0050],
         [0.0344],
         [0.0227],
         [0.0036],
         [0.0123]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0056],
         [0.0074],
         [0.0755],
         [0.0045],
         [0.0701],
         [0.0097],
         [0.0175],
         [0.0029],
         [0.0042],
         [0.0028],
         [0.0138],
         [0.0154]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0222],
         [0.0096],
         [0.0511],
         [0.0100],
         [0.0332],
         [0.0223],
         [0.0138],
         [0.0032],
         [0.0018],
   


Evaluating:  90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                 | 1808/2000 [01:25<00:08, 21.77it/s][A
Evaluating:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                | 1811/2000 [01:25<00:08, 21.78it/s][A

reg attention sum per layer
tensor([[[0.0240],
         [0.0615],
         [0.0520],
         [0.0107],
         [0.0317],
         [0.0133],
         [0.0389],
         [0.0083],
         [0.0084],
         [0.0083],
         [0.0442],
         [0.0327]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0431],
         [0.0309],
         [0.0248],
         [0.0171],
         [0.0214],
         [0.0210],
         [0.0134],
         [0.0055],
         [0.0023],
         [0.0040],
         [0.0270],
         [0.0388]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0046],
         [0.0066],
         [0.0160],
         [0.0041],
         [0.0034],
         [0.0017],
         [0.0071],
         [0.0011],
         [0.0005],
         [0.0017],
         [0.0118],
         [0.0102]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0187],
         [0.0152],
         [0.0145],
         [0.0041],
         [0.0362],
         [0.0171],
         [0.0071],
         [0.00


Evaluating:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                | 1814/2000 [01:25<00:08, 21.79it/s][A


tensor([[[0.0305],
         [0.0173],
         [0.0535],
         [0.0085],
         [0.0711],
         [0.0241],
         [0.0032],
         [0.0050],
         [0.0004],
         [0.0144],
         [0.0778],
         [0.0144]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0439],
         [0.0338],
         [0.0613],
         [0.0175],
         [0.0413],
         [0.0042],
         [0.0321],
         [0.0183],
         [0.0133],
         [0.0272],
         [0.0117],
         [0.0446]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0681],
         [0.0878],
         [0.0356],
         [0.0330],
         [0.0427],
         [0.0276],
         [0.0337],
         [0.0198],
         [0.0036],
         [0.0261],
         [0.1064],
         [0.0369]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0292],
         [0.0483],
         [0.0675],
         [0.0107],
         [0.0151],
         [0.0218],
         [0.0504],
         [0.0062],
         [0.0050],
   


Evaluating:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                | 1817/2000 [01:25<00:08, 21.84it/s][A

tensor([[[0.0612],
         [0.0233],
         [0.0212],
         [0.0112],
         [0.0243],
         [0.0150],
         [0.0194],
         [0.0076],
         [0.0163],
         [0.0101],
         [0.0112],
         [0.0169]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0085],
         [0.0208],
         [0.0559],
         [0.0067],
         [0.0145],
         [0.0178],
         [0.0091],
         [0.0263],
         [0.0117],
         [0.0063],
         [0.0233],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0916],
         [0.0658],
         [0.0305],
         [0.0158],
         [0.0097],
         [0.0058],
         [0.0158],
         [0.0118],
         [0.0038],
         [0.0051],
         [0.0121],
         [0.0309]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0337],
         [0.0096],
         [0.0928],
         [0.0436],
         [0.0251],
         [0.0070],
         [0.0198],
         [0.0105],
         [0.0294],
    


Evaluating:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 1820/2000 [01:25<00:08, 21.82it/s][A
Evaluating:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏               | 1823/2000 [01:26<00:08, 21.72it/s][A


reg attention sum per layer
tensor([[[0.0791],
         [0.1208],
         [0.0243],
         [0.0125],
         [0.0223],
         [0.0186],
         [0.0109],
         [0.0251],
         [0.0036],
         [0.0199],
         [0.0128],
         [0.0313]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0193],
         [0.0289],
         [0.0203],
         [0.0097],
         [0.0059],
         [0.0046],
         [0.0141],
         [0.0466],
         [0.0049],
         [0.0090],
         [0.0088],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0645],
         [0.0180],
         [0.0181],
         [0.0129],
         [0.0179],
         [0.0115],
         [0.0410],
         [0.0061],
         [0.1690],
         [0.0287],
         [0.0028],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0120],
         [0.0211],
         [0.0335],
         [0.0041],
         [0.0022],
         [0.0065],
         [0.0059],
         [0.0


Evaluating:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 1826/2000 [01:26<00:08, 21.60it/s][A


reg attention sum per layer
tensor([[[0.0102],
         [0.0099],
         [0.0941],
         [0.0112],
         [0.0124],
         [0.0094],
         [0.0075],
         [0.0069],
         [0.0033],
         [0.0019],
         [0.0161],
         [0.0380]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0286],
         [0.0269],
         [0.0310],
         [0.0120],
         [0.0157],
         [0.0184],
         [0.0135],
         [0.0124],
         [0.0024],
         [0.0039],
         [0.0242],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0287],
         [0.0197],
         [0.0255],
         [0.0225],
         [0.0196],
         [0.0101],
         [0.0100],
         [0.0144],
         [0.0114],
         [0.0059],
         [0.0068],
         [0.0268]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0431],
         [0.0628],
         [0.1220],
         [0.0555],
         [0.0634],
         [0.0084],
         [0.0926],
         [0.0


Evaluating:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊               | 1829/2000 [01:26<00:07, 21.57it/s][A
Evaluating:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████               | 1832/2000 [01:26<00:07, 21.59it/s][A

reg attention sum per layer
tensor([[[0.0634],
         [0.0794],
         [0.0217],
         [0.0180],
         [0.0118],
         [0.0082],
         [0.0317],
         [0.0118],
         [0.0040],
         [0.0135],
         [0.0208],
         [0.0208]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0102],
         [0.0188],
         [0.0379],
         [0.0089],
         [0.0205],
         [0.0051],
         [0.0065],
         [0.0059],
         [0.0075],
         [0.0037],
         [0.0129],
         [0.0260]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0541],
         [0.0645],
         [0.0566],
         [0.0145],
         [0.0597],
         [0.0454],
         [0.0227],
         [0.0143],
         [0.0144],
         [0.0215],
         [0.0315],
         [0.0650]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0142],
         [0.0172],
         [0.0515],
         [0.0067],
         [0.0187],
         [0.0246],
         [0.0111],
         [0.01


Evaluating:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎              | 1835/2000 [01:26<00:07, 21.56it/s][A

reg attention sum per layer
tensor([[[0.0278],
         [0.0414],
         [0.0507],
         [0.0155],
         [0.0436],
         [0.0410],
         [0.0128],
         [0.0067],
         [0.0037],
         [0.0069],
         [0.0469],
         [0.0321]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0215],
         [0.0855],
         [0.0629],
         [0.0118],
         [0.0128],
         [0.0143],
         [0.0095],
         [0.0105],
         [0.0069],
         [0.0026],
         [0.0216],
         [0.0357]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0090],
         [0.0148],
         [0.0144],
         [0.0030],
         [0.0270],
         [0.0014],
         [0.0068],
         [0.0040],
         [0.0008],
         [0.0035],
         [0.0031],
         [0.0034]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0227],
         [0.0834],
         [0.0316],
         [0.0166],
         [0.0223],
         [0.0283],
         [0.0435],
         [0.00


Evaluating:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌              | 1838/2000 [01:26<00:07, 21.44it/s][A

tensor([[[0.0335],
         [0.0627],
         [0.0233],
         [0.0377],
         [0.0109],
         [0.0053],
         [0.0279],
         [0.0055],
         [0.0041],
         [0.0040],
         [0.0389],
         [0.0255]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0665],
         [0.0495],
         [0.0493],
         [0.0310],
         [0.0071],
         [0.0161],
         [0.0228],
         [0.0020],
         [0.0040],
         [0.0070],
         [0.0356],
         [0.0280]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0188],
         [0.0197],
         [0.0582],
         [0.0317],
         [0.0154],
         [0.0055],
         [0.0203],
         [0.0026],
         [0.0010],
         [0.0036],
         [0.0709],
         [0.0470]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0098],
         [0.0518],
         [0.0132],
         [0.0095],
         [0.0127],
         [0.0306],
         [0.0042],
         [0.0029],
         [0.0018],
    


Evaluating:  92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊              | 1841/2000 [01:26<00:07, 21.60it/s][A
Evaluating:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████              | 1844/2000 [01:27<00:07, 21.15it/s][A

tensor([[[0.0809],
         [0.0350],
         [0.0611],
         [0.0206],
         [0.0483],
         [0.0244],
         [0.0087],
         [0.0204],
         [0.0018],
         [0.0064],
         [0.0370],
         [0.0593]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0119],
         [0.0733],
         [0.0521],
         [0.0132],
         [0.0142],
         [0.0107],
         [0.0141],
         [0.0082],
         [0.0046],
         [0.0043],
         [0.0720],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0283],
         [0.0146],
         [0.0221],
         [0.0143],
         [0.0057],
         [0.0200],
         [0.0083],
         [0.0026],
         [0.0014],
         [0.0029],
         [0.1117],
         [0.0337]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0666],
         [0.0634],
         [0.0316],
         [0.0181],
         [0.0300],
         [0.0054],
         [0.0511],
         [0.0128],
         [0.0161],
    


Evaluating:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍             | 1847/2000 [01:27<00:07, 20.30it/s][A

reg attention sum per layer
tensor([[[0.0216],
         [0.0243],
         [0.0192],
         [0.0162],
         [0.0037],
         [0.0062],
         [0.0139],
         [0.0032],
         [0.0012],
         [0.0031],
         [0.0066],
         [0.0298]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0269],
         [0.0527],
         [0.0612],
         [0.0121],
         [0.0097],
         [0.0067],
         [0.0260],
         [0.0046],
         [0.0028],
         [0.0025],
         [0.0802],
         [0.0313]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0324],
         [0.0510],
         [0.0217],
         [0.0097],
         [0.0208],
         [0.0309],
         [0.0576],
         [0.0137],
         [0.0077],
         [0.0291],
         [0.0075],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0206],
         [0.0115],
         [0.0140],
         [0.0032],
         [0.0070],
         [0.0024],
         [0.0087],
         [0.00


Evaluating:  92%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋             | 1850/2000 [01:27<00:07, 19.81it/s][A

reg attention sum per layer
tensor([[[0.0302],
         [0.0131],
         [0.0283],
         [0.0169],
         [0.0160],
         [0.0024],
         [0.0135],
         [0.0036],
         [0.0268],
         [0.0048],
         [0.0084],
         [0.0234]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0078],
         [0.0091],
         [0.0355],
         [0.0175],
         [0.0318],
         [0.0059],
         [0.0123],
         [0.0048],
         [0.0028],
         [0.0030],
         [0.0303],
         [0.0202]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0445],
         [0.1066],
         [0.1404],
         [0.0125],
         [0.0282],
         [0.0213],
         [0.0113],
         [0.0266],
         [0.0102],
         [0.0133],
         [0.0804],
         [0.0999]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0209],
         [0.0299],
         [0.0145],
         [0.0048],
         [0.0172],
         [0.0064],
         [0.0350],
         [0.00


Evaluating:  93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉             | 1853/2000 [01:27<00:07, 20.28it/s][A
Evaluating:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏            | 1856/2000 [01:27<00:06, 20.76it/s][A

tensor([[[0.0598],
         [0.0721],
         [0.3820],
         [0.0589],
         [0.1433],
         [0.0772],
         [0.0391],
         [0.0878],
         [0.0171],
         [0.0302],
         [0.5460],
         [0.3911]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0261],
         [0.0208],
         [0.0338],
         [0.0061],
         [0.0159],
         [0.0058],
         [0.0154],
         [0.0284],
         [0.0049],
         [0.0126],
         [0.0124],
         [0.0080]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0574],
         [0.0793],
         [0.0588],
         [0.0175],
         [0.0251],
         [0.0194],
         [0.0352],
         [0.0554],
         [0.0033],
         [0.0155],
         [0.0174],
         [0.0309]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0543],
         [0.0337],
         [0.0163],
         [0.0140],
         [0.0103],
         [0.0275],
         [0.0671],
         [0.0025],
         [0.0030],
    


Evaluating:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍            | 1859/2000 [01:27<00:06, 20.67it/s][A

tensor([[[0.0474],
         [0.0736],
         [0.0269],
         [0.0130],
         [0.0206],
         [0.0083],
         [0.0981],
         [0.0227],
         [0.0305],
         [0.0086],
         [0.0031],
         [0.0311]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0156],
         [0.0197],
         [0.0266],
         [0.0082],
         [0.0118],
         [0.0038],
         [0.0062],
         [0.0017],
         [0.0012],
         [0.0051],
         [0.0207],
         [0.0111]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0172],
         [0.0096],
         [0.0027],
         [0.0063],
         [0.0230],
         [0.0346],
         [0.0010],
         [0.0036],
         [0.0122],
         [0.0119],
         [0.0062]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0159],
         [0.0282],
         [0.1077],
         [0.0288],
         [0.0323],
         [0.0083],
         [0.0113],
         [0.0307],
         [0.0068],
    


Evaluating:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋            | 1862/2000 [01:27<00:07, 19.56it/s][A
Evaluating:  93%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉            | 1864/2000 [01:28<00:07, 18.98it/s]

reg attention sum per layer
tensor([[[0.0668],
         [0.0164],
         [0.0286],
         [0.0191],
         [0.0181],
         [0.0080],
         [0.0290],
         [0.0063],
         [0.0245],
         [0.0131],
         [0.0043],
         [0.0265]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1028],
         [0.0510],
         [0.0543],
         [0.0172],
         [0.0264],
         [0.0228],
         [0.0275],
         [0.0243],
         [0.0036],
         [0.0173],
         [0.0363],
         [0.0306]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0461],
         [0.0135],
         [0.0221],
         [0.0086],
         [0.0181],
         [0.0100],
         [0.0559],
         [0.0025],
         [0.0125],
         [0.0136],
         [0.0108],
         [0.0173]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1139],
         [0.0139],
         [0.0504],
         [0.0204],
         [0.0455],
         [0.0129],
         [0.1586],
         [0.00

[A
Evaluating:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████            | 1866/2000 [01:28<00:07, 18.59it/s][A
Evaluating:  93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 1868/2000 [01:28<00:07, 18.74it/s][A

reg attention sum per layer
tensor([[[0.0113],
         [0.0104],
         [0.0214],
         [0.0067],
         [0.0100],
         [0.0051],
         [0.0080],
         [0.0075],
         [0.0052],
         [0.0061],
         [0.0133],
         [0.0122]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0212],
         [0.0216],
         [0.0220],
         [0.0104],
         [0.0307],
         [0.0054],
         [0.0064],
         [0.0030],
         [0.0022],
         [0.0026],
         [0.0057],
         [0.0092]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0098],
         [0.0713],
         [0.0619],
         [0.0086],
         [0.0102],
         [0.0125],
         [0.0059],
         [0.0016],
         [0.0006],
         [0.0057],
         [0.0424],
         [0.0631]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0604],
         [0.0750],
         [0.1159],
         [0.0311],
         [0.0242],
         [0.0374],
         [0.0493],
         [0.04


Evaluating:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍           | 1870/2000 [01:28<00:07, 18.43it/s][A
Evaluating:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌           | 1872/2000 [01:28<00:07, 18.22it/s][A

reg attention sum per layer
tensor([[[0.0510],
         [0.0402],
         [0.0448],
         [0.0193],
         [0.0318],
         [0.0225],
         [0.0178],
         [0.0067],
         [0.0018],
         [0.0075],
         [0.0199],
         [0.0475]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0485],
         [0.0098],
         [0.0397],
         [0.0160],
         [0.0348],
         [0.0126],
         [0.0148],
         [0.0259],
         [0.0055],
         [0.0075],
         [0.0187],
         [0.0371]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0091],
         [0.0117],
         [0.0204],
         [0.0031],
         [0.0028],
         [0.0075],
         [0.0028],
         [0.0029],
         [0.0004],
         [0.0013],
         [0.0505],
         [0.0083]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0126],
         [0.0349],
         [0.0342],
         [0.0131],
         [0.0252],
         [0.0166],
         [0.0091],
         [0.00


Evaluating:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊           | 1874/2000 [01:28<00:06, 18.17it/s][A
Evaluating:  94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉           | 1876/2000 [01:28<00:06, 18.04it/s][A

reg attention sum per layer
tensor([[[0.0116],
         [0.0218],
         [0.1784],
         [0.0289],
         [0.0780],
         [0.0078],
         [0.0120],
         [0.0373],
         [0.0109],
         [0.0057],
         [0.0200],
         [0.0669]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0303],
         [0.0552],
         [0.0227],
         [0.0049],
         [0.0214],
         [0.0283],
         [0.0258],
         [0.0110],
         [0.0052],
         [0.0058],
         [0.0268],
         [0.0244]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0103],
         [0.0573],
         [0.0768],
         [0.0278],
         [0.0192],
         [0.0119],
         [0.0069],
         [0.0020],
         [0.0028],
         [0.0092],
         [0.0953],
         [0.1186]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0173],
         [0.0382],
         [0.0486],
         [0.0090],
         [0.0297],
         [0.0305],
         [0.0313],
         [0.03


Evaluating:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏          | 1878/2000 [01:28<00:06, 18.07it/s][A
Evaluating:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎          | 1880/2000 [01:28<00:06, 18.12it/s][A

reg attention sum per layer
tensor([[[0.0337],
         [0.0318],
         [0.0324],
         [0.0084],
         [0.0084],
         [0.0054],
         [0.0113],
         [0.0103],
         [0.0018],
         [0.0036],
         [0.0232],
         [0.0259]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0338],
         [0.0127],
         [0.0301],
         [0.0137],
         [0.0102],
         [0.0040],
         [0.0158],
         [0.0226],
         [0.0134],
         [0.0082],
         [0.0039],
         [0.0424]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0582],
         [0.1299],
         [0.0193],
         [0.0201],
         [0.0090],
         [0.0101],
         [0.0299],
         [0.0172],
         [0.0044],
         [0.0127],
         [0.0102],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0458],
         [0.0390],
         [0.1573],
         [0.0167],
         [0.0807],
         [0.0257],
         [0.0548],
         [0.01


Evaluating:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍          | 1882/2000 [01:29<00:06, 18.25it/s][A

reg attention sum per layer
tensor([[[0.0275],
         [0.0367],
         [0.0315],
         [0.0122],
         [0.0119],
         [0.0066],
         [0.0123],
         [0.0094],
         [0.0075],
         [0.0058],
         [0.0144],
         [0.0355]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0359],
         [0.1476],
         [0.0166],
         [0.0135],
         [0.0106],
         [0.0063],
         [0.0218],
         [0.0093],
         [0.0050],
         [0.0128],
         [0.0142],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0764],
         [0.0683],
         [0.0392],
         [0.0150],
         [0.0256],
         [0.0156],
         [0.0469],
         [0.0364],
         [0.0192],
         [0.0149],
         [0.0138],
         [0.0437]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0771],
         [0.0496],
         [0.0392],
         [0.0132],
         [0.0283],
         [0.0236],
         [0.0469],
         [0.01


Evaluating:  94%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 1885/2000 [01:29<00:06, 19.08it/s][A
Evaluating:  94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████          | 1888/2000 [01:29<00:05, 19.90it/s][A


tensor([[[0.0182],
         [0.0349],
         [0.0662],
         [0.0144],
         [0.0142],
         [0.0167],
         [0.0211],
         [0.0113],
         [0.0070],
         [0.0041],
         [0.0667],
         [0.1040]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0317],
         [0.0266],
         [0.0251],
         [0.0127],
         [0.0152],
         [0.0235],
         [0.0054],
         [0.0036],
         [0.0027],
         [0.0040],
         [0.0429],
         [0.0291]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0581],
         [0.0376],
         [0.0207],
         [0.0167],
         [0.0249],
         [0.0058],
         [0.0369],
         [0.0076],
         [0.0081],
         [0.0176],
         [0.0496],
         [0.0168]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0298],
         [0.0473],
         [0.0236],
         [0.0031],
         [0.0032],
         [0.0079],
         [0.0140],
         [0.0225],
         [0.0022],
   


Evaluating:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎         | 1891/2000 [01:29<00:05, 20.39it/s][A

tensor([[[0.0072],
         [0.0180],
         [0.0551],
         [0.0054],
         [0.0079],
         [0.0058],
         [0.0080],
         [0.0019],
         [0.0034],
         [0.0019],
         [0.0659],
         [0.0334]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0038],
         [0.0167],
         [0.0386],
         [0.0063],
         [0.0144],
         [0.0256],
         [0.0093],
         [0.0135],
         [0.0039],
         [0.0042],
         [0.0324],
         [0.0157]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0064],
         [0.0102],
         [0.0141],
         [0.0050],
         [0.0126],
         [0.0055],
         [0.0023],
         [0.0080],
         [0.0009],
         [0.0019],
         [0.0021],
         [0.0088]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0871],
         [0.0378],
         [0.0662],
         [0.0177],
         [0.0367],
         [0.0065],
         [0.0356],
         [0.0238],
         [0.0650],
    


Evaluating:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌         | 1894/2000 [01:29<00:05, 20.75it/s][A


reg attention sum per layer
tensor([[[0.0134],
         [0.0179],
         [0.0260],
         [0.0128],
         [0.0122],
         [0.0116],
         [0.0169],
         [0.0091],
         [0.0065],
         [0.0131],
         [0.0226],
         [0.0139]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0757],
         [0.1122],
         [0.0972],
         [0.0283],
         [0.0224],
         [0.0086],
         [0.0239],
         [0.0149],
         [0.0078],
         [0.0154],
         [0.0293],
         [0.0347]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0166],
         [0.0234],
         [0.0224],
         [0.0049],
         [0.0018],
         [0.0039],
         [0.0262],
         [0.0020],
         [0.0037],
         [0.0019],
         [0.0049],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0357],
         [0.0556],
         [0.0340],
         [0.0110],
         [0.0162],
         [0.0117],
         [0.0115],
         [0.0


Evaluating:  95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊         | 1897/2000 [01:29<00:04, 20.88it/s][A
Evaluating:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████         | 1900/2000 [01:29<00:04, 21.19it/s][A


reg attention sum per layer
tensor([[[0.0404],
         [0.1066],
         [0.0419],
         [0.0304],
         [0.0418],
         [0.0138],
         [0.0475],
         [0.0143],
         [0.0154],
         [0.0220],
         [0.0374],
         [0.0153]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0181],
         [0.0142],
         [0.0757],
         [0.0070],
         [0.0317],
         [0.0129],
         [0.0073],
         [0.0014],
         [0.0011],
         [0.0011],
         [0.0270],
         [0.0367]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0033],
         [0.0038],
         [0.0053],
         [0.0081],
         [0.0034],
         [0.0059],
         [0.0083],
         [0.0053],
         [0.0040],
         [0.0023],
         [0.0096]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0339],
         [0.0393],
         [0.0257],
         [0.0131],
         [0.0114],
         [0.0180],
         [0.0


Evaluating:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 1903/2000 [01:30<00:04, 21.37it/s][A

reg attention sum per layer
tensor([[[0.0069],
         [0.0119],
         [0.0298],
         [0.0050],
         [0.0082],
         [0.0065],
         [0.0062],
         [0.0253],
         [0.0028],
         [0.0017],
         [0.0547],
         [0.0137]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0605],
         [0.0298],
         [0.0645],
         [0.0249],
         [0.0192],
         [0.0275],
         [0.1021],
         [0.0105],
         [0.0083],
         [0.0133],
         [0.0798],
         [0.0549]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0316],
         [0.0185],
         [0.0298],
         [0.0084],
         [0.0238],
         [0.0062],
         [0.0215],
         [0.0346],
         [0.0135],
         [0.0133],
         [0.0087],
         [0.0274]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0608],
         [0.0559],
         [0.2071],
         [0.0216],
         [0.0356],
         [0.0079],
         [0.0367],
         [0.01


Evaluating:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1906/2000 [01:30<00:04, 21.54it/s][A
Evaluating:  95%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉        | 1909/2000 [01:30<00:04, 21.62it/s][A

tensor([[[0.0382],
         [0.0493],
         [0.1377],
         [0.0106],
         [0.0518],
         [0.0064],
         [0.0344],
         [0.0192],
         [0.0380],
         [0.0184],
         [0.0062],
         [0.0815]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0399],
         [0.0817],
         [0.0558],
         [0.0147],
         [0.0178],
         [0.0122],
         [0.0113],
         [0.0030],
         [0.0028],
         [0.0071],
         [0.0584],
         [0.0445]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0228],
         [0.0112],
         [0.0595],
         [0.0072],
         [0.0140],
         [0.0097],
         [0.0097],
         [0.0095],
         [0.0964],
         [0.0047],
         [0.0037],
         [0.0317]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0191],
         [0.0101],
         [0.0934],
         [0.0177],
         [0.0311],
         [0.0086],
         [0.0164],
         [0.0054],
         [0.0066],
    


Evaluating:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏       | 1912/2000 [01:30<00:04, 21.72it/s][A


reg attention sum per layer
tensor([[[0.0702],
         [0.0405],
         [0.0575],
         [0.0221],
         [0.0187],
         [0.0224],
         [0.0604],
         [0.0148],
         [0.0129],
         [0.0178],
         [0.0451],
         [0.0390]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0335],
         [0.0508],
         [0.0896],
         [0.0199],
         [0.0058],
         [0.0236],
         [0.0240],
         [0.0139],
         [0.0029],
         [0.0031],
         [0.0474],
         [0.0324]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0262],
         [0.0211],
         [0.0397],
         [0.0158],
         [0.0236],
         [0.0080],
         [0.0116],
         [0.0091],
         [0.0115],
         [0.0114],
         [0.0139],
         [0.0175]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0203],
         [0.0141],
         [0.0518],
         [0.0125],
         [0.0142],
         [0.0035],
         [0.0089],
         [0.0


Evaluating:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍       | 1915/2000 [01:30<00:03, 21.74it/s][A
Evaluating:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋       | 1918/2000 [01:30<00:03, 21.81it/s][A

reg attention sum per layer
tensor([[[0.0092],
         [0.0143],
         [0.0342],
         [0.0077],
         [0.0054],
         [0.0082],
         [0.0053],
         [0.0019],
         [0.0011],
         [0.0010],
         [0.0140],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0415],
         [0.0385],
         [0.0910],
         [0.0222],
         [0.0334],
         [0.0102],
         [0.0134],
         [0.0518],
         [0.0082],
         [0.0076],
         [0.0376],
         [0.0584]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0268],
         [0.0257],
         [0.0724],
         [0.0112],
         [0.0358],
         [0.0442],
         [0.0278],
         [0.0073],
         [0.0134],
         [0.0179],
         [0.0184],
         [0.0297]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0564],
         [0.0488],
         [0.0464],
         [0.0156],
         [0.0149],
         [0.0141],
         [0.0315],
         [0.03


Evaluating:  96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉       | 1921/2000 [01:30<00:03, 21.66it/s][A

tensor([[[0.0267],
         [0.0251],
         [0.0256],
         [0.0445],
         [0.0183],
         [0.0140],
         [0.0411],
         [0.0127],
         [0.0222],
         [0.0110],
         [0.0384],
         [0.0111]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0029],
         [0.0425],
         [0.0505],
         [0.0030],
         [0.0033],
         [0.0070],
         [0.0013],
         [0.0010],
         [0.0005],
         [0.0002],
         [0.0209],
         [0.0152]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0595],
         [0.0335],
         [0.0941],
         [0.0060],
         [0.0086],
         [0.0166],
         [0.0231],
         [0.0036],
         [0.0024],
         [0.0084],
         [0.0548],
         [0.0289]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0270],
         [0.0791],
         [0.0693],
         [0.0170],
         [0.0207],
         [0.0053],
         [0.0085],
         [0.0073],
         [0.0088],
    


Evaluating:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 1924/2000 [01:31<00:03, 21.56it/s][A

tensor([[[0.0282],
         [0.0276],
         [0.0554],
         [0.0185],
         [0.0421],
         [0.0142],
         [0.0241],
         [0.0118],
         [0.0121],
         [0.0091],
         [0.0372],
         [0.0299]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0116],
         [0.0083],
         [0.0297],
         [0.0116],
         [0.0140],
         [0.0085],
         [0.0070],
         [0.0056],
         [0.0046],
         [0.0022],
         [0.0130],
         [0.0723]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0744],
         [0.0178],
         [0.0201],
         [0.0148],
         [0.0689],
         [0.0030],
         [0.0457],
         [0.0050],
         [0.0038],
         [0.0465],
         [0.0445],
         [0.0244]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0286],
         [0.0329],
         [0.0104],
         [0.0061],
         [0.0046],
         [0.0113],
         [0.0373],
         [0.0058],
         [0.0059],
    


Evaluating:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌      | 1927/2000 [01:31<00:03, 21.59it/s][A
Evaluating:  96%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊      | 1930/2000 [01:31<00:03, 21.51it/s][A

tensor([[[0.0766],
         [0.0553],
         [0.1476],
         [0.0350],
         [0.1121],
         [0.0645],
         [0.0264],
         [0.0345],
         [0.0090],
         [0.0254],
         [0.0402],
         [0.0396]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0512],
         [0.0844],
         [0.0381],
         [0.0150],
         [0.0273],
         [0.0191],
         [0.0149],
         [0.0064],
         [0.0027],
         [0.0063],
         [0.0769],
         [0.0896]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0477],
         [0.0302],
         [0.1598],
         [0.0242],
         [0.0486],
         [0.0306],
         [0.0295],
         [0.0290],
         [0.0521],
         [0.0148],
         [0.0414],
         [0.1093]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0028],
         [0.0013],
         [0.0293],
         [0.0117],
         [0.0157],
         [0.0024],
         [0.0026],
         [0.0017],
         [0.0008],
    


Evaluating:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████      | 1933/2000 [01:31<00:03, 21.10it/s][A

tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0288],
         [0.0352],
         [0.0261],
         [0.0180],
         [0.0254],
         [0.0092],
         [0.0103],
         [0.0290],
         [0.0061],
         [0.0114],
         [0.0075],
         [0.0228]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0181],
         [0.0546],
         [0.0356],
         [0.0086],
         [0.0149],
         [0.0131],
         [0.0187],
         [0.0204],
         [0.0033],
         [0.0055],
         [0.0216],
         [0.0105]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per la


Evaluating:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎     | 1936/2000 [01:31<00:02, 21.35it/s][A

tensor([[[0.1049],
         [0.1123],
         [0.0538],
         [0.0234],
         [0.0349],
         [0.0242],
         [0.0654],
         [0.0271],
         [0.0112],
         [0.0730],
         [0.0194],
         [0.0248]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0307],
         [0.0538],
         [0.0220],
         [0.0092],
         [0.0140],
         [0.0166],
         [0.0333],
         [0.0299],
         [0.0098],
         [0.0095],
         [0.0149],
         [0.0134]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0129],
         [0.0178],
         [0.2847],
         [0.0321],
         [0.0690],
         [0.0200],
         [0.0308],
         [0.0105],
         [0.0061],
         [0.0042],
         [0.0767],
         [0.2191]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0085],
         [0.0221],
         [0.0438],
         [0.0124],
         [0.0296],
         [0.0079],
         [0.0075],
         [0.0083],
         [0.0034],
    


Evaluating:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 1939/2000 [01:31<00:02, 21.43it/s][A
Evaluating:  97%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊     | 1942/2000 [01:31<00:02, 20.86it/s][A

tensor([[[0.0124],
         [0.0236],
         [0.0648],
         [0.0148],
         [0.0290],
         [0.0065],
         [0.0089],
         [0.0063],
         [0.0090],
         [0.0091],
         [0.0617],
         [0.0795]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0427],
         [0.0412],
         [0.0194],
         [0.0287],
         [0.0126],
         [0.0102],
         [0.0285],
         [0.0074],
         [0.0053],
         [0.0071],
         [0.0177],
         [0.0461]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0147],
         [0.0398],
         [0.0248],
         [0.0083],
         [0.0103],
         [0.0053],
         [0.0103],
         [0.0066],
         [0.0024],
         [0.0031],
         [0.0260],
         [0.0188]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0076],
         [0.0148],
         [0.0364],
         [0.0110],
         [0.0089],
         [0.0021],
         [0.0049],
         [0.0160],
         [0.0040],
    


Evaluating:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████     | 1945/2000 [01:32<00:02, 21.04it/s][A

reg attention sum per layer
tensor([[[0.0511],
         [0.0377],
         [0.1074],
         [0.0234],
         [0.1017],
         [0.0216],
         [0.0205],
         [0.0292],
         [0.0108],
         [0.0157],
         [0.0507],
         [0.0515]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0196],
         [0.0572],
         [0.0418],
         [0.0157],
         [0.0260],
         [0.0211],
         [0.0201],
         [0.0177],
         [0.0049],
         [0.0067],
         [0.0271],
         [0.0300]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0651],
         [0.0270],
         [0.0445],
         [0.0083],
         [0.0143],
         [0.0270],
         [0.0165],
         [0.0135],
         [0.0058],
         [0.0127],
         [0.0270],
         [0.0140]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0179],
         [0.0147],
         [0.0416],
         [0.0105],
         [0.0204],
         [0.0072],
         [0.0109],
         [0.01


Evaluating:  97%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎    | 1948/2000 [01:32<00:02, 21.27it/s][A
Evaluating:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 1951/2000 [01:32<00:02, 21.52it/s][A


reg attention sum per layer
tensor([[[0.0228],
         [0.0395],
         [0.0105],
         [0.0471],
         [0.0076],
         [0.0056],
         [0.0131],
         [0.0046],
         [0.0015],
         [0.0118],
         [0.1316],
         [0.0179]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0029],
         [0.0059],
         [0.0076],
         [0.0019],
         [0.0045],
         [0.0025],
         [0.0025],
         [0.0010],
         [0.0002],
         [0.0006],
         [0.0147],
         [0.0205]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0163],
         [0.0131],
         [0.0189],
         [0.0227],
         [0.0137],
         [0.0079],
         [0.0038],
         [0.0120],
         [0.0052],
         [0.0052],
    


Evaluating:  98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉    | 1954/2000 [01:32<00:02, 21.65it/s][A

reg attention sum per layer
tensor([[[0.0138],
         [0.0430],
         [0.0526],
         [0.0118],
         [0.0451],
         [0.0051],
         [0.0301],
         [0.0102],
         [0.0337],
         [0.0123],
         [0.0182],
         [0.0241]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0104],
         [0.0104],
         [0.0068],
         [0.0023],
         [0.0014],
         [0.0035],
         [0.0278],
         [0.0074],
         [0.0017],
         [0.0025],
         [0.0033],
         [0.0053]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0322],
         [0.0413],
         [0.0294],
         [0.0115],
         [0.0360],
         [0.0339],
         [0.0178],
         [0.0139],
         [0.0108],
         [0.0167],
         [0.0077],
         [0.0226]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0603],
         [0.0791],
         [0.0433],
         [0.0290],
         [0.0239],
         [0.0095],
         [0.0744],
         [0.02


Evaluating:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏   | 1957/2000 [01:32<00:01, 21.60it/s][A

tensor([[[0.0403],
         [0.0170],
         [0.0348],
         [0.0226],
         [0.0222],
         [0.0145],
         [0.0159],
         [0.0824],
         [0.0113],
         [0.0050],
         [0.0057],
         [0.0153]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0069],
         [0.0142],
         [0.0103],
         [0.0050],
         [0.0034],
         [0.0056],
         [0.0050],
         [0.0091],
         [0.0010],
         [0.0148],
         [0.0271]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0382],
         [0.0323],
         [0.0574],
         [0.0109],
         [0.0530],
         [0.0249],
         [0.0146],
         [0.0283],
         [0.0027],
         [0.0075],
         [0.1220],
         [0.0279]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0158],
         [0.0119],
         [0.0232],
         [0.0030],
         [0.0071],
         [0.0016],
         [0.0868],
         [0.0041],
         [0.0091],
    


Evaluating:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1960/2000 [01:32<00:01, 21.70it/s][A
Evaluating:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋   | 1963/2000 [01:32<00:01, 21.68it/s][A


reg attention sum per layer
tensor([[[0.0122],
         [0.0297],
         [0.1214],
         [0.0154],
         [0.0131],
         [0.0201],
         [0.0150],
         [0.0105],
         [0.0056],
         [0.0036],
         [0.1187],
         [0.0818]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0379],
         [0.0474],
         [0.0666],
         [0.0263],
         [0.0467],
         [0.0108],
         [0.0112],
         [0.0165],
         [0.0200],
         [0.0135],
         [0.0284],
         [0.0611]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1814],
         [0.0558],
         [0.0317],
         [0.0264],
         [0.0075],
         [0.0282],
         [0.0564],
         [0.0185],
         [0.0082],
         [0.0224],
         [0.0024],
         [0.0212]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0199],
         [0.0296],
         [0.0612],
         [0.0129],
         [0.0242],
         [0.0469],
         [0.0169],
         [0.0


Evaluating:  98%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 1966/2000 [01:32<00:01, 21.72it/s][A


reg attention sum per layer
tensor([[[0.0218],
         [0.0299],
         [0.0389],
         [0.0093],
         [0.0040],
         [0.0016],
         [0.0055],
         [0.0014],
         [0.0005],
         [0.0020],
         [0.0610],
         [0.0136]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0372],
         [0.0734],
         [0.0574],
         [0.0315],
         [0.0193],
         [0.0125],
         [0.0063],
         [0.0051],
         [0.0077],
         [0.0059],
         [0.0534],
         [0.0395]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0616],
         [0.0259],
         [0.0259],
         [0.0105],
         [0.0137],
         [0.0260],
         [0.0101],
         [0.0077],
         [0.0024],
         [0.0136],
         [0.0201],
         [0.0412]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0211],
         [0.0247],
         [0.0767],
         [0.0182],
         [0.0158],
         [0.0127],
         [0.0152],
         [0.0


Evaluating:  98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 1969/2000 [01:33<00:01, 21.84it/s][A
Evaluating:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌  | 1972/2000 [01:33<00:01, 21.87it/s][A

reg attention sum per layer
tensor([[[0.0263],
         [0.0163],
         [0.0102],
         [0.0057],
         [0.0134],
         [0.0054],
         [0.0205],
         [0.0046],
         [0.0332],
         [0.0052],
         [0.0038],
         [0.0151]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0371],
         [0.0467],
         [0.0204],
         [0.0130],
         [0.0443],
         [0.0172],
         [0.0500],
         [0.0081],
         [0.0075],
         [0.0068],
         [0.0102],
         [0.0156]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0433],
         [0.0308],
         [0.0420],
         [0.0117],
         [0.0284],
         [0.0495],
         [0.0073],
         [0.0063],
         [0.0012],
         [0.0037],
         [0.0792],
         [0.0282]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0067],
         [0.0104],
         [0.0155],
         [0.0107],
         [0.0098],
         [0.0140],
         [0.0176],
         [0.00


Evaluating:  99%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊  | 1975/2000 [01:33<00:01, 21.85it/s][A

tensor([[[0.0854],
         [0.0231],
         [0.0235],
         [0.0148],
         [0.0141],
         [0.0071],
         [0.0403],
         [0.0108],
         [0.0041],
         [0.0139],
         [0.0103],
         [0.0063]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0159],
         [0.0166],
         [0.0129],
         [0.0046],
         [0.0062],
         [0.0151],
         [0.0194],
         [0.0137],
         [0.0024],
         [0.0035],
         [0.0200],
         [0.0185]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0451],
         [0.0213],
         [0.0356],
         [0.0525],
         [0.0392],
         [0.0307],
         [0.0183],
         [0.0354],
         [0.0092],
         [0.0132],
         [0.0437],
         [0.0559]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0079],
         [0.0207],
         [0.0225],
         [0.0035],
         [0.0122],
         [0.0042],
         [0.0044],
         [0.0098],
         [0.0013],
    


Evaluating:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████  | 1978/2000 [01:33<00:01, 21.83it/s][A
Evaluating:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 1981/2000 [01:33<00:00, 21.78it/s][A

reg attention sum per layer
tensor([[[0.1045],
         [0.0803],
         [0.0662],
         [0.0167],
         [0.0241],
         [0.0112],
         [0.0552],
         [0.0312],
         [0.0469],
         [0.0430],
         [0.0072],
         [0.0476]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0239],
         [0.0309],
         [0.0694],
         [0.0072],
         [0.0131],
         [0.0100],
         [0.0094],
         [0.0237],
         [0.0062],
         [0.0032],
         [0.0246],
         [0.0270]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0136],
         [0.0093],
         [0.0105],
         [0.0146],
         [0.0071],
         [0.0052],
         [0.0147],
         [0.0012],
         [0.0012],
         [0.0062],
         [0.0113],
         [0.0083]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0654],
         [0.0312],
         [0.0453],
         [0.0224],
         [0.0176],
         [0.0265],
         [0.0422],
         [0.02


Evaluating:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 1984/2000 [01:33<00:00, 21.83it/s][A

tensor([[[0.0158],
         [0.0127],
         [0.0234],
         [0.0043],
         [0.0210],
         [0.0091],
         [0.0066],
         [0.0091],
         [0.0056],
         [0.0050],
         [0.0187],
         [0.0120]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0122],
         [0.0870],
         [0.1074],
         [0.0068],
         [0.0172],
         [0.0208],
         [0.0133],
         [0.0254],
         [0.0122],
         [0.0042],
         [0.0200],
         [0.0401]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0499],
         [0.0770],
         [0.0267],
         [0.0175],
         [0.0136],
         [0.0236],
         [0.0445],
         [0.0140],
         [0.0054],
         [0.0073],
         [0.0342],
         [0.0338]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0548],
         [0.0103],
         [0.0775],
         [0.0203],
         [0.0421],
         [0.0184],
         [0.0173],
         [0.0274],
         [0.0161],
    


Evaluating:  99%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 1987/2000 [01:33<00:00, 21.79it/s][A

tensor([[[0.0227],
         [0.1387],
         [0.0644],
         [0.0153],
         [0.0071],
         [0.0100],
         [0.0136],
         [0.0080],
         [0.0021],
         [0.0048],
         [0.0960],
         [0.0253]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0306],
         [0.0717],
         [0.1099],
         [0.0344],
         [0.0276],
         [0.0076],
         [0.0090],
         [0.0141],
         [0.0248],
         [0.0060],
         [0.0480],
         [0.2312]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0310],
         [0.0127],
         [0.0117],
         [0.0048],
         [0.0132],
         [0.0261],
         [0.0122],
         [0.0025],
         [0.0043],
         [0.0091],
         [0.0212],
         [0.0261]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0201],
         [0.0333],
         [0.0711],
         [0.0129],
         [0.0296],
         [0.0096],
         [0.0148],
         [0.0067],
         [0.0138],
    


Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 1990/2000 [01:34<00:00, 21.79it/s][A
Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍| 1993/2000 [01:34<00:00, 21.84it/s][A


reg attention sum per layer
tensor([[[0.0083],
         [0.0129],
         [0.0445],
         [0.0087],
         [0.0113],
         [0.0131],
         [0.0109],
         [0.0075],
         [0.0016],
         [0.0032],
         [0.0303],
         [0.0180]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0349],
         [0.0340],
         [0.0598],
         [0.0322],
         [0.0222],
         [0.0125],
         [0.0089],
         [0.0218],
         [0.0044],
         [0.0088],
         [0.0129],
         [0.0220]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.1172],
         [0.0393],
         [0.0329],
         [0.0147],
         [0.0286],
         [0.0070],
         [0.0269],
         [0.0439],
         [0.0154],
         [0.0252],
         [0.0120],
         [0.0254]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0385],
         [0.0518],
         [0.0656],
         [0.0445],
         [0.0166],
         [0.0168],
         [0.0324],
         [0.0


Evaluating: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 1996/2000 [01:34<00:00, 21.83it/s][A

reg attention sum per layer
tensor([[[0.0240],
         [0.0192],
         [0.0478],
         [0.0254],
         [0.0131],
         [0.0077],
         [0.0095],
         [0.0117],
         [0.0015],
         [0.0044],
         [0.0137],
         [0.0331]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0658],
         [0.0350],
         [0.0363],
         [0.0073],
         [0.0157],
         [0.0269],
         [0.0228],
         [0.0027],
         [0.0048],
         [0.0053],
         [0.0112],
         [0.0222]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0260],
         [0.0232],
         [0.0707],
         [0.0116],
         [0.0053],
         [0.0070],
         [0.0370],
         [0.0044],
         [0.0029],
         [0.0040],
         [0.0036],
         [0.0284]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0608],
         [0.0698],
         [0.0757],
         [0.0151],
         [0.0234],
         [0.0300],
         [0.0422],
         [0.01


Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:34<00:00, 21.16it/s][A
01/28/2021 14:29:12 - INFO - __main__ -   ***** test results *****
01/28/2021 14:29:12 - INFO - __main__ -     acc = 0.0975
01/28/2021 14:29:12 - INFO - __main__ -     ar_eval_loss = 0.0
01/28/2021 14:29:12 - INFO - __main__ -     attmaxidx = [0.9856042861938477, 0.9822251200675964, 0.9837579727172852, 0.9802085757255554, 0.9928159713745117, 0.9903120994567871, 0.972429633140564, 0.9800984263420105, 0.9822763204574585, 0.9857756495475769, 0.9652194976806641, 0.9596598148345947, 0.9696062803268433, 0.9870446920394897, 0.9779829382896423, 0.9638165831565857, 0.97624272108078, 0.9722636938095093, 0.9871633052825928, 0.986213207244873, 0.9907867312431335, 0.9451868534088135, 0.983123242855072, 0.9619816541671753, 0.9885979890823364, 0.9875311851501465, 0.976

01/28/2021 14:29:12 - INFO - __main__ -     avg_max_attention_mass = 0.0798516800852958
01/28/2021 14:29:12 - INFO - __main__ -     avg_max_attention_mass_non_reg = 0.9952118453383446
01/28/2021 14:29:12 - INFO - __main__ -     avg_max_value_norm = 0.08239777570217847
01/28/2021 14:29:12 - INFO - __main__ -     avg_mean_attention_mass = 0.026325053208391184
01/28/2021 14:29:12 - INFO - __main__ -     avg_mean_value_norm = 0.07422285725013353
01/28/2021 14:29:12 - INFO - __main__ -     avg_min_value_norm = 0.06579640151746571
01/28/2021 14:29:12 - INFO - __main__ -     avg_non_reg_attention_mass = 0.9736749669015408
01/28/2021 14:29:12 - INFO - __main__ -     avg_pad_attention_mass = 0.0
01/28/2021 14:29:12 - INFO - __main__ -     ce_eval_loss = 1.8058597431182861
01/28/2021 14:29:12 - INFO - __main__ -     eval_loss = 1.8058597431182861
01/28/2021 14:29:12 - INFO - __main__ -     global_step = 0
01/28/2021 14:29:12 - INFO - __main__ -     label_match_score = 0.0
01/28/2021 14:29:12 - I

tensor([[[0.1076],
         [0.1422],
         [0.0271],
         [0.0281],
         [0.0332],
         [0.0207],
         [0.0352],
         [0.0807],
         [0.0069],
         [0.0371],
         [0.0172],
         [0.0209]]], device='cuda:0')
reg attention sum per layer
tensor([[[0.0594],
         [0.0247],
         [0.0339],
         [0.0304],
         [0.0183],
         [0.0296],
         [0.0092],
         [0.0732],
         [0.0016],
         [0.0028],
         [0.0301],
         [0.1128]]], device='cuda:0')
attetnion
count    2000.000000
mean        0.973675
std         0.014798
min         0.841295
25%         0.967114
50%         0.976322
75%         0.983481
max         1.000000
dtype: float64
test	0	0.0975	0.026325053208391184	0.0798516800852958	0.9952118453383446	1.8058597431182861	0.0	0.0	0.07422285725013353	0.08239777570217847	0.06579640151746571	[0.9856042861938477, 0.9822251200675964, 0.9837579727172852, 0.9802085757255554, 0.9928159713745117, 0.9903120994567871, 0.97

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:882.)
  next_m.mul_(beta1).add_(1 - beta1, grad)

Iteration:   0%|                                                                                                                                                                                      | 1/7000 [00:00<33:14,  3.51it/s][A
Iteration:   0%|                                                                                                                                                                                      | 2/7000 [00:00<33:06,  3.52it/s][A
Iteration:   0%|                                                                                                                                                                                      | 3/7000 [00:00<32:00,  3.64it/s][A
Iteration:   0%|                                  

Iteration:   0%|▊                                                                                                                                                                                    | 33/7000 [00:08<29:28,  3.94it/s][A
Iteration:   0%|▉                                                                                                                                                                                    | 34/7000 [00:08<29:35,  3.92it/s][A
Iteration:   0%|▉                                                                                                                                                                                    | 35/7000 [00:08<29:36,  3.92it/s][A
Iteration:   1%|▉                                                                                                                                                                                    | 36/7000 [00:09<29:35,  3.92it/s][A
Iteration:   1%|▉                                           

Iteration:   1%|█▋                                                                                                                                                                                   | 67/7000 [00:17<29:28,  3.92it/s][A
Iteration:   1%|█▊                                                                                                                                                                                   | 68/7000 [00:17<29:27,  3.92it/s][A
Iteration:   1%|█▊                                                                                                                                                                                   | 69/7000 [00:17<29:34,  3.91it/s][A
Iteration:   1%|█▊                                                                                                                                                                                   | 70/7000 [00:17<29:34,  3.90it/s][A
Iteration:   1%|█▊                                          

Iteration:   1%|██▌                                                                                                                                                                                 | 101/7000 [00:25<29:28,  3.90it/s][A
Iteration:   1%|██▌                                                                                                                                                                                 | 102/7000 [00:25<29:19,  3.92it/s][A
Iteration:   1%|██▋                                                                                                                                                                                 | 103/7000 [00:26<29:17,  3.92it/s][A
Iteration:   1%|██▋                                                                                                                                                                                 | 104/7000 [00:26<29:12,  3.94it/s][A
Iteration:   2%|██▋                                         

Iteration:   2%|███▍                                                                                                                                                                                | 135/7000 [00:34<29:10,  3.92it/s][A
Iteration:   2%|███▍                                                                                                                                                                                | 136/7000 [00:34<29:06,  3.93it/s][A
Iteration:   2%|███▌                                                                                                                                                                                | 137/7000 [00:34<29:22,  3.89it/s][A
Iteration:   2%|███▌                                                                                                                                                                                | 138/7000 [00:35<29:36,  3.86it/s][A
Iteration:   2%|███▌                                        

Iteration:   2%|████▎                                                                                                                                                                               | 169/7000 [00:43<28:47,  3.95it/s][A
Iteration:   2%|████▎                                                                                                                                                                               | 170/7000 [00:43<28:46,  3.96it/s][A
Iteration:   2%|████▍                                                                                                                                                                               | 171/7000 [00:43<29:04,  3.92it/s][A
Iteration:   2%|████▍                                                                                                                                                                               | 172/7000 [00:43<31:22,  3.63it/s][A
Iteration:   2%|████▍                                       

Iteration:   3%|█████▏                                                                                                                                                                              | 203/7000 [00:51<28:48,  3.93it/s][A
Iteration:   3%|█████▏                                                                                                                                                                              | 204/7000 [00:52<28:49,  3.93it/s][A
Iteration:   3%|█████▎                                                                                                                                                                              | 205/7000 [00:52<28:41,  3.95it/s][A
Iteration:   3%|█████▎                                                                                                                                                                              | 206/7000 [00:52<28:46,  3.93it/s][A
Iteration:   3%|█████▎                                      

Iteration:   3%|██████                                                                                                                                                                              | 237/7000 [01:00<28:30,  3.95it/s][A
Iteration:   3%|██████                                                                                                                                                                              | 238/7000 [01:00<28:35,  3.94it/s][A
Iteration:   3%|██████▏                                                                                                                                                                             | 239/7000 [01:01<28:32,  3.95it/s][A
Iteration:   3%|██████▏                                                                                                                                                                             | 240/7000 [01:01<28:38,  3.93it/s][A
Iteration:   3%|██████▏                                     

Iteration:   4%|██████▉                                                                                                                                                                             | 271/7000 [01:09<28:44,  3.90it/s][A
Iteration:   4%|██████▉                                                                                                                                                                             | 272/7000 [01:09<28:45,  3.90it/s][A
Iteration:   4%|███████                                                                                                                                                                             | 273/7000 [01:09<28:42,  3.90it/s][A
Iteration:   4%|███████                                                                                                                                                                             | 274/7000 [01:10<28:40,  3.91it/s][A
Iteration:   4%|███████                                     

Iteration:   4%|███████▊                                                                                                                                                                            | 305/7000 [01:17<28:29,  3.92it/s][A
Iteration:   4%|███████▊                                                                                                                                                                            | 306/7000 [01:18<28:31,  3.91it/s][A
Iteration:   4%|███████▉                                                                                                                                                                            | 307/7000 [01:18<28:32,  3.91it/s][A
Iteration:   4%|███████▉                                                                                                                                                                            | 308/7000 [01:18<28:27,  3.92it/s][A
Iteration:   4%|███████▉                                    

Iteration:   5%|████████▋                                                                                                                                                                           | 339/7000 [01:26<28:15,  3.93it/s][A
Iteration:   5%|████████▋                                                                                                                                                                           | 340/7000 [01:26<28:17,  3.92it/s][A
Iteration:   5%|████████▊                                                                                                                                                                           | 341/7000 [01:27<28:13,  3.93it/s][A
Iteration:   5%|████████▊                                                                                                                                                                           | 342/7000 [01:27<28:15,  3.93it/s][A
Iteration:   5%|████████▊                                   

Iteration:   5%|█████████▌                                                                                                                                                                          | 373/7000 [01:35<27:45,  3.98it/s][A
Iteration:   5%|█████████▌                                                                                                                                                                          | 374/7000 [01:35<27:43,  3.98it/s][A
Iteration:   5%|█████████▋                                                                                                                                                                          | 375/7000 [01:35<27:50,  3.97it/s][A
Iteration:   5%|█████████▋                                                                                                                                                                          | 376/7000 [01:35<27:40,  3.99it/s][A
Iteration:   5%|█████████▋                                  

Iteration:   6%|██████████▍                                                                                                                                                                         | 407/7000 [01:43<28:01,  3.92it/s][A
Iteration:   6%|██████████▍                                                                                                                                                                         | 408/7000 [01:44<27:58,  3.93it/s][A
Iteration:   6%|██████████▌                                                                                                                                                                         | 409/7000 [01:44<28:05,  3.91it/s][A
Iteration:   6%|██████████▌                                                                                                                                                                         | 410/7000 [01:44<27:52,  3.94it/s][A
Iteration:   6%|██████████▌                                 

Iteration:   6%|███████████▎                                                                                                                                                                        | 441/7000 [01:52<27:43,  3.94it/s][A
Iteration:   6%|███████████▎                                                                                                                                                                        | 442/7000 [01:52<27:44,  3.94it/s][A
Iteration:   6%|███████████▍                                                                                                                                                                        | 443/7000 [01:53<27:47,  3.93it/s][A
Iteration:   6%|███████████▍                                                                                                                                                                        | 444/7000 [01:53<27:52,  3.92it/s][A
Iteration:   6%|███████████▍                                

Iteration:   7%|████████████▏                                                                                                                                                                       | 475/7000 [02:01<27:36,  3.94it/s][A
Iteration:   7%|████████████▏                                                                                                                                                                       | 476/7000 [02:01<27:27,  3.96it/s][A
Iteration:   7%|████████████▎                                                                                                                                                                       | 477/7000 [02:01<27:26,  3.96it/s][A
Iteration:   7%|████████████▎                                                                                                                                                                       | 478/7000 [02:01<27:24,  3.97it/s][A
Iteration:   7%|████████████▎                               

Iteration:   7%|█████████████                                                                                                                                                                       | 509/7000 [02:09<27:37,  3.92it/s][A
Iteration:   7%|█████████████                                                                                                                                                                       | 510/7000 [02:10<27:39,  3.91it/s][A
Iteration:   7%|█████████████▏                                                                                                                                                                      | 511/7000 [02:10<27:30,  3.93it/s][A
Iteration:   7%|█████████████▏                                                                                                                                                                      | 512/7000 [02:10<27:30,  3.93it/s][A
Iteration:   7%|█████████████▏                              

Iteration:   8%|█████████████▉                                                                                                                                                                      | 543/7000 [02:18<27:12,  3.95it/s][A
Iteration:   8%|█████████████▉                                                                                                                                                                      | 544/7000 [02:18<27:15,  3.95it/s][A
Iteration:   8%|██████████████                                                                                                                                                                      | 545/7000 [02:18<27:16,  3.94it/s][A
Iteration:   8%|██████████████                                                                                                                                                                      | 546/7000 [02:19<27:20,  3.94it/s][A
Iteration:   8%|██████████████                              

Iteration:   8%|██████████████▊                                                                                                                                                                     | 577/7000 [02:27<26:58,  3.97it/s][A
Iteration:   8%|██████████████▊                                                                                                                                                                     | 578/7000 [02:27<26:57,  3.97it/s][A
Iteration:   8%|██████████████▉                                                                                                                                                                     | 579/7000 [02:27<27:01,  3.96it/s][A
Iteration:   8%|██████████████▉                                                                                                                                                                     | 580/7000 [02:27<27:01,  3.96it/s][A
Iteration:   8%|██████████████▉                             

Iteration:   9%|███████████████▋                                                                                                                                                                    | 611/7000 [02:35<26:53,  3.96it/s][A
Iteration:   9%|███████████████▋                                                                                                                                                                    | 612/7000 [02:35<26:55,  3.95it/s][A
Iteration:   9%|███████████████▊                                                                                                                                                                    | 613/7000 [02:36<26:59,  3.94it/s][A
Iteration:   9%|███████████████▊                                                                                                                                                                    | 614/7000 [02:36<26:45,  3.98it/s][A
Iteration:   9%|███████████████▊                            

Iteration:   9%|████████████████▌                                                                                                                                                                   | 645/7000 [02:44<26:52,  3.94it/s][A
Iteration:   9%|████████████████▌                                                                                                                                                                   | 646/7000 [02:44<26:51,  3.94it/s][A
Iteration:   9%|████████████████▋                                                                                                                                                                   | 647/7000 [02:44<26:52,  3.94it/s][A
Iteration:   9%|████████████████▋                                                                                                                                                                   | 648/7000 [02:44<26:55,  3.93it/s][A
Iteration:   9%|████████████████▋                           

Iteration:  10%|█████████████████▍                                                                                                                                                                  | 679/7000 [02:52<26:40,  3.95it/s][A
Iteration:  10%|█████████████████▍                                                                                                                                                                  | 680/7000 [02:53<26:38,  3.95it/s][A
Iteration:  10%|█████████████████▌                                                                                                                                                                  | 681/7000 [02:53<26:44,  3.94it/s][A
Iteration:  10%|█████████████████▌                                                                                                                                                                  | 682/7000 [02:53<26:47,  3.93it/s][A
Iteration:  10%|█████████████████▌                          

Iteration:  10%|██████████████████▎                                                                                                                                                                 | 713/7000 [03:01<26:49,  3.91it/s][A
Iteration:  10%|██████████████████▎                                                                                                                                                                 | 714/7000 [03:01<26:44,  3.92it/s][A
Iteration:  10%|██████████████████▍                                                                                                                                                                 | 715/7000 [03:01<26:43,  3.92it/s][A
Iteration:  10%|██████████████████▍                                                                                                                                                                 | 716/7000 [03:02<26:39,  3.93it/s][A
Iteration:  10%|██████████████████▍                         

Iteration:  11%|███████████████████▏                                                                                                                                                                | 747/7000 [03:10<26:29,  3.93it/s][A
Iteration:  11%|███████████████████▏                                                                                                                                                                | 748/7000 [03:10<26:26,  3.94it/s][A
Iteration:  11%|███████████████████▎                                                                                                                                                                | 749/7000 [03:10<26:30,  3.93it/s][A
Iteration:  11%|███████████████████▎                                                                                                                                                                | 750/7000 [03:10<26:32,  3.93it/s][A
Iteration:  11%|███████████████████▎                        

Iteration:  11%|████████████████████                                                                                                                                                                | 781/7000 [03:18<26:13,  3.95it/s][A
Iteration:  11%|████████████████████                                                                                                                                                                | 782/7000 [03:19<26:14,  3.95it/s][A
Iteration:  11%|████████████████████▏                                                                                                                                                               | 783/7000 [03:19<26:19,  3.94it/s][A
Iteration:  11%|████████████████████▏                                                                                                                                                               | 784/7000 [03:19<26:12,  3.95it/s][A
Iteration:  11%|████████████████████▏                       

Iteration:  12%|████████████████████▉                                                                                                                                                               | 815/7000 [03:27<26:05,  3.95it/s][A
Iteration:  12%|████████████████████▉                                                                                                                                                               | 816/7000 [03:27<26:13,  3.93it/s][A
Iteration:  12%|█████████████████████                                                                                                                                                               | 817/7000 [03:27<26:10,  3.94it/s][A
Iteration:  12%|█████████████████████                                                                                                                                                               | 818/7000 [03:28<26:09,  3.94it/s][A
Iteration:  12%|█████████████████████                       

Iteration:  12%|█████████████████████▊                                                                                                                                                              | 849/7000 [03:36<25:56,  3.95it/s][A
Iteration:  12%|█████████████████████▊                                                                                                                                                              | 850/7000 [03:36<26:05,  3.93it/s][A
Iteration:  12%|█████████████████████▉                                                                                                                                                              | 851/7000 [03:36<26:01,  3.94it/s][A
Iteration:  12%|█████████████████████▉                                                                                                                                                              | 852/7000 [03:36<25:56,  3.95it/s][A
Iteration:  12%|█████████████████████▉                      

Iteration:  13%|██████████████████████▋                                                                                                                                                             | 883/7000 [03:44<26:02,  3.92it/s][A
Iteration:  13%|██████████████████████▋                                                                                                                                                             | 884/7000 [03:44<26:01,  3.92it/s][A
Iteration:  13%|██████████████████████▊                                                                                                                                                             | 885/7000 [03:45<25:45,  3.96it/s][A
Iteration:  13%|██████████████████████▊                                                                                                                                                             | 886/7000 [03:45<25:44,  3.96it/s][A
Iteration:  13%|██████████████████████▊                     

Iteration:  13%|███████████████████████▌                                                                                                                                                            | 917/7000 [03:53<25:50,  3.92it/s][A
Iteration:  13%|███████████████████████▌                                                                                                                                                            | 918/7000 [03:53<25:54,  3.91it/s][A
Iteration:  13%|███████████████████████▋                                                                                                                                                            | 919/7000 [03:53<26:06,  3.88it/s][A
Iteration:  13%|███████████████████████▋                                                                                                                                                            | 920/7000 [03:54<26:16,  3.86it/s][A
Iteration:  13%|███████████████████████▋                    

Iteration:  14%|████████████████████████▍                                                                                                                                                           | 951/7000 [04:01<25:40,  3.93it/s][A
Iteration:  14%|████████████████████████▍                                                                                                                                                           | 952/7000 [04:02<25:35,  3.94it/s][A
Iteration:  14%|████████████████████████▌                                                                                                                                                           | 953/7000 [04:02<25:32,  3.95it/s][A
Iteration:  14%|████████████████████████▌                                                                                                                                                           | 954/7000 [04:02<25:32,  3.95it/s][A
Iteration:  14%|████████████████████████▌                   

Iteration:  14%|█████████████████████████▎                                                                                                                                                          | 985/7000 [04:10<25:15,  3.97it/s][A
Iteration:  14%|█████████████████████████▎                                                                                                                                                          | 986/7000 [04:10<25:11,  3.98it/s][A
Iteration:  14%|█████████████████████████▍                                                                                                                                                          | 987/7000 [04:11<25:16,  3.96it/s][A
Iteration:  14%|█████████████████████████▍                                                                                                                                                          | 988/7000 [04:11<25:23,  3.95it/s][A
Iteration:  14%|█████████████████████████▍                  

Iteration:  15%|██████████████████████████                                                                                                                                                         | 1019/7000 [04:19<25:11,  3.96it/s][A
Iteration:  15%|██████████████████████████                                                                                                                                                         | 1020/7000 [04:19<25:15,  3.94it/s][A
Iteration:  15%|██████████████████████████                                                                                                                                                         | 1021/7000 [04:19<25:15,  3.94it/s][A
Iteration:  15%|██████████████████████████▏                                                                                                                                                        | 1022/7000 [04:19<25:15,  3.94it/s][A
Iteration:  15%|██████████████████████████▏                 

Iteration:  15%|██████████████████████████▉                                                                                                                                                        | 1053/7000 [04:27<24:49,  3.99it/s][A
Iteration:  15%|██████████████████████████▉                                                                                                                                                        | 1054/7000 [04:27<24:57,  3.97it/s][A
Iteration:  15%|██████████████████████████▉                                                                                                                                                        | 1055/7000 [04:28<24:58,  3.97it/s][A
Iteration:  15%|███████████████████████████                                                                                                                                                        | 1056/7000 [04:28<25:00,  3.96it/s][A
Iteration:  15%|███████████████████████████                 

Iteration:  16%|███████████████████████████▊                                                                                                                                                       | 1087/7000 [04:36<26:07,  3.77it/s][A
Iteration:  16%|███████████████████████████▊                                                                                                                                                       | 1088/7000 [04:36<26:39,  3.70it/s][A
Iteration:  16%|███████████████████████████▊                                                                                                                                                       | 1089/7000 [04:36<26:45,  3.68it/s][A
Iteration:  16%|███████████████████████████▊                                                                                                                                                       | 1090/7000 [04:37<26:18,  3.74it/s][A
Iteration:  16%|███████████████████████████▉                

Iteration:  16%|████████████████████████████▋                                                                                                                                                      | 1121/7000 [04:46<28:26,  3.45it/s][A
Iteration:  16%|████████████████████████████▋                                                                                                                                                      | 1122/7000 [04:46<28:32,  3.43it/s][A
Iteration:  16%|████████████████████████████▋                                                                                                                                                      | 1123/7000 [04:46<28:28,  3.44it/s][A
Iteration:  16%|████████████████████████████▋                                                                                                                                                      | 1124/7000 [04:46<28:15,  3.47it/s][A
Iteration:  16%|████████████████████████████▊               

Iteration:  16%|█████████████████████████████▌                                                                                                                                                     | 1155/7000 [04:55<28:02,  3.47it/s][A
Iteration:  17%|█████████████████████████████▌                                                                                                                                                     | 1156/7000 [04:56<28:04,  3.47it/s][A
Iteration:  17%|█████████████████████████████▌                                                                                                                                                     | 1157/7000 [04:56<28:12,  3.45it/s][A
Iteration:  17%|█████████████████████████████▌                                                                                                                                                     | 1158/7000 [04:56<28:34,  3.41it/s][A
Iteration:  17%|█████████████████████████████▋              

Iteration:  17%|██████████████████████████████▍                                                                                                                                                    | 1189/7000 [05:05<28:25,  3.41it/s][A
Iteration:  17%|██████████████████████████████▍                                                                                                                                                    | 1190/7000 [05:06<28:17,  3.42it/s][A
Iteration:  17%|██████████████████████████████▍                                                                                                                                                    | 1191/7000 [05:06<28:07,  3.44it/s][A
Iteration:  17%|██████████████████████████████▍                                                                                                                                                    | 1192/7000 [05:06<28:06,  3.44it/s][A
Iteration:  17%|██████████████████████████████▌             

Iteration:  17%|███████████████████████████████▎                                                                                                                                                   | 1223/7000 [05:15<28:05,  3.43it/s][A
Iteration:  17%|███████████████████████████████▎                                                                                                                                                   | 1224/7000 [05:16<28:00,  3.44it/s][A
Iteration:  18%|███████████████████████████████▎                                                                                                                                                   | 1225/7000 [05:16<27:57,  3.44it/s][A
Iteration:  18%|███████████████████████████████▎                                                                                                                                                   | 1226/7000 [05:16<27:53,  3.45it/s][A
Iteration:  18%|███████████████████████████████▍            

Iteration:  18%|████████████████████████████████▏                                                                                                                                                  | 1257/7000 [05:25<27:34,  3.47it/s][A
Iteration:  18%|████████████████████████████████▏                                                                                                                                                  | 1258/7000 [05:25<27:34,  3.47it/s][A
Iteration:  18%|████████████████████████████████▏                                                                                                                                                  | 1259/7000 [05:26<27:32,  3.47it/s][A
Iteration:  18%|████████████████████████████████▏                                                                                                                                                  | 1260/7000 [05:26<27:31,  3.48it/s][A
Iteration:  18%|████████████████████████████████▏           

Iteration:  18%|█████████████████████████████████                                                                                                                                                  | 1291/7000 [05:34<24:05,  3.95it/s][A
Iteration:  18%|█████████████████████████████████                                                                                                                                                  | 1292/7000 [05:35<24:03,  3.95it/s][A
Iteration:  18%|█████████████████████████████████                                                                                                                                                  | 1293/7000 [05:35<24:10,  3.93it/s][A
Iteration:  18%|█████████████████████████████████                                                                                                                                                  | 1294/7000 [05:35<24:21,  3.91it/s][A
Iteration:  18%|█████████████████████████████████           

Iteration:  19%|█████████████████████████████████▉                                                                                                                                                 | 1325/7000 [05:44<25:33,  3.70it/s][A
Iteration:  19%|█████████████████████████████████▉                                                                                                                                                 | 1326/7000 [05:44<26:20,  3.59it/s][A
Iteration:  19%|█████████████████████████████████▉                                                                                                                                                 | 1327/7000 [05:44<26:30,  3.57it/s][A
Iteration:  19%|█████████████████████████████████▉                                                                                                                                                 | 1328/7000 [05:45<26:36,  3.55it/s][A
Iteration:  19%|█████████████████████████████████▉          

Iteration:  19%|██████████████████████████████████▊                                                                                                                                                | 1359/7000 [05:52<23:47,  3.95it/s][A
Iteration:  19%|██████████████████████████████████▊                                                                                                                                                | 1360/7000 [05:53<23:51,  3.94it/s][A
Iteration:  19%|██████████████████████████████████▊                                                                                                                                                | 1361/7000 [05:53<23:50,  3.94it/s][A
Iteration:  19%|██████████████████████████████████▊                                                                                                                                                | 1362/7000 [05:53<23:54,  3.93it/s][A
Iteration:  19%|██████████████████████████████████▊         

Iteration:  20%|███████████████████████████████████▌                                                                                                                                               | 1393/7000 [06:01<24:03,  3.88it/s][A
Iteration:  20%|███████████████████████████████████▋                                                                                                                                               | 1394/7000 [06:02<23:49,  3.92it/s][A
Iteration:  20%|███████████████████████████████████▋                                                                                                                                               | 1395/7000 [06:02<24:03,  3.88it/s][A
Iteration:  20%|███████████████████████████████████▋                                                                                                                                               | 1396/7000 [06:02<23:54,  3.91it/s][A
Iteration:  20%|███████████████████████████████████▋        

Iteration:  20%|████████████████████████████████████▍                                                                                                                                              | 1427/7000 [06:10<23:48,  3.90it/s][A
Iteration:  20%|████████████████████████████████████▌                                                                                                                                              | 1428/7000 [06:10<23:43,  3.92it/s][A
Iteration:  20%|████████████████████████████████████▌                                                                                                                                              | 1429/7000 [06:11<23:39,  3.92it/s][A
Iteration:  20%|████████████████████████████████████▌                                                                                                                                              | 1430/7000 [06:11<23:35,  3.94it/s][A
Iteration:  20%|████████████████████████████████████▌       

Iteration:  21%|█████████████████████████████████████▎                                                                                                                                             | 1461/7000 [06:19<23:37,  3.91it/s][A
Iteration:  21%|█████████████████████████████████████▍                                                                                                                                             | 1462/7000 [06:19<23:41,  3.90it/s][A
Iteration:  21%|█████████████████████████████████████▍                                                                                                                                             | 1463/7000 [06:19<23:34,  3.92it/s][A
Iteration:  21%|█████████████████████████████████████▍                                                                                                                                             | 1464/7000 [06:19<23:33,  3.92it/s][A
Iteration:  21%|█████████████████████████████████████▍      

Iteration:  21%|██████████████████████████████████████▏                                                                                                                                            | 1495/7000 [06:27<23:17,  3.94it/s][A
Iteration:  21%|██████████████████████████████████████▎                                                                                                                                            | 1496/7000 [06:28<23:15,  3.95it/s][A
Iteration:  21%|██████████████████████████████████████▎                                                                                                                                            | 1497/7000 [06:28<23:13,  3.95it/s][A
Iteration:  21%|██████████████████████████████████████▎                                                                                                                                            | 1498/7000 [06:28<23:16,  3.94it/s][A
Iteration:  21%|██████████████████████████████████████▎     

Iteration:  22%|███████████████████████████████████████                                                                                                                                            | 1529/7000 [06:36<23:08,  3.94it/s][A
Iteration:  22%|███████████████████████████████████████                                                                                                                                            | 1530/7000 [06:36<23:10,  3.93it/s][A
Iteration:  22%|███████████████████████████████████████▏                                                                                                                                           | 1531/7000 [06:37<23:10,  3.93it/s][A
Iteration:  22%|███████████████████████████████████████▏                                                                                                                                           | 1532/7000 [06:37<23:13,  3.92it/s][A
Iteration:  22%|███████████████████████████████████████▏    

Iteration:  22%|███████████████████████████████████████▉                                                                                                                                           | 1563/7000 [06:45<26:01,  3.48it/s][A
Iteration:  22%|███████████████████████████████████████▉                                                                                                                                           | 1564/7000 [06:45<26:12,  3.46it/s][A
Iteration:  22%|████████████████████████████████████████                                                                                                                                           | 1565/7000 [06:46<26:23,  3.43it/s][A
Iteration:  22%|████████████████████████████████████████                                                                                                                                           | 1566/7000 [06:46<26:23,  3.43it/s][A
Iteration:  22%|████████████████████████████████████████    

Iteration:  23%|████████████████████████████████████████▊                                                                                                                                          | 1597/7000 [06:55<26:32,  3.39it/s][A
Iteration:  23%|████████████████████████████████████████▊                                                                                                                                          | 1598/7000 [06:55<26:20,  3.42it/s][A
Iteration:  23%|████████████████████████████████████████▉                                                                                                                                          | 1599/7000 [06:56<26:22,  3.41it/s][A
Iteration:  23%|████████████████████████████████████████▉                                                                                                                                          | 1600/7000 [06:56<26:28,  3.40it/s][A
Iteration:  23%|████████████████████████████████████████▉   

Iteration:  23%|█████████████████████████████████████████▋                                                                                                                                         | 1631/7000 [07:05<26:05,  3.43it/s][A
Iteration:  23%|█████████████████████████████████████████▋                                                                                                                                         | 1632/7000 [07:05<25:59,  3.44it/s][A
Iteration:  23%|█████████████████████████████████████████▊                                                                                                                                         | 1633/7000 [07:06<26:03,  3.43it/s][A
Iteration:  23%|█████████████████████████████████████████▊                                                                                                                                         | 1634/7000 [07:06<26:14,  3.41it/s][A
Iteration:  23%|█████████████████████████████████████████▊  

Iteration:  24%|██████████████████████████████████████████▌                                                                                                                                        | 1665/7000 [07:15<25:56,  3.43it/s][A
Iteration:  24%|██████████████████████████████████████████▌                                                                                                                                        | 1666/7000 [07:15<26:13,  3.39it/s][A
Iteration:  24%|██████████████████████████████████████████▋                                                                                                                                        | 1667/7000 [07:16<26:20,  3.37it/s][A
Iteration:  24%|██████████████████████████████████████████▋                                                                                                                                        | 1668/7000 [07:16<26:11,  3.39it/s][A
Iteration:  24%|██████████████████████████████████████████▋ 

Iteration:  24%|███████████████████████████████████████████▍                                                                                                                                       | 1699/7000 [07:25<25:47,  3.42it/s][A
Iteration:  24%|███████████████████████████████████████████▍                                                                                                                                       | 1700/7000 [07:25<25:58,  3.40it/s][A
Iteration:  24%|███████████████████████████████████████████▍                                                                                                                                       | 1701/7000 [07:26<25:53,  3.41it/s][A
Iteration:  24%|███████████████████████████████████████████▌                                                                                                                                       | 1702/7000 [07:26<25:58,  3.40it/s][A
Iteration:  24%|███████████████████████████████████████████▌

Iteration:  25%|████████████████████████████████████████████▎                                                                                                                                      | 1733/7000 [07:35<25:55,  3.39it/s][A
Iteration:  25%|████████████████████████████████████████████▎                                                                                                                                      | 1734/7000 [07:35<25:45,  3.41it/s][A
Iteration:  25%|████████████████████████████████████████████▎                                                                                                                                      | 1735/7000 [07:36<25:50,  3.39it/s][A
Iteration:  25%|████████████████████████████████████████████▍                                                                                                                                      | 1736/7000 [07:36<25:45,  3.41it/s][A
Iteration:  25%|████████████████████████████████████████████

Iteration:  25%|█████████████████████████████████████████████▏                                                                                                                                     | 1767/7000 [07:45<25:41,  3.40it/s][A
Iteration:  25%|█████████████████████████████████████████████▏                                                                                                                                     | 1768/7000 [07:45<25:28,  3.42it/s][A
Iteration:  25%|█████████████████████████████████████████████▏                                                                                                                                     | 1769/7000 [07:46<25:42,  3.39it/s][A
Iteration:  25%|█████████████████████████████████████████████▎                                                                                                                                     | 1770/7000 [07:46<25:37,  3.40it/s][A
Iteration:  25%|████████████████████████████████████████████

Iteration:  26%|██████████████████████████████████████████████                                                                                                                                     | 1801/7000 [07:55<25:48,  3.36it/s][A
Iteration:  26%|██████████████████████████████████████████████                                                                                                                                     | 1802/7000 [07:55<25:44,  3.37it/s][A
Iteration:  26%|██████████████████████████████████████████████                                                                                                                                     | 1803/7000 [07:56<25:44,  3.37it/s][A
Iteration:  26%|██████████████████████████████████████████████▏                                                                                                                                    | 1804/7000 [07:56<25:40,  3.37it/s][A
Iteration:  26%|████████████████████████████████████████████

Iteration:  26%|██████████████████████████████████████████████▉                                                                                                                                    | 1835/7000 [08:05<22:03,  3.90it/s][A
Iteration:  26%|██████████████████████████████████████████████▉                                                                                                                                    | 1836/7000 [08:05<22:04,  3.90it/s][A
Iteration:  26%|██████████████████████████████████████████████▉                                                                                                                                    | 1837/7000 [08:05<22:07,  3.89it/s][A
Iteration:  26%|███████████████████████████████████████████████                                                                                                                                    | 1838/7000 [08:05<22:00,  3.91it/s][A
Iteration:  26%|████████████████████████████████████████████

Iteration:  27%|███████████████████████████████████████████████▊                                                                                                                                   | 1869/7000 [08:14<22:42,  3.76it/s][A
Iteration:  27%|███████████████████████████████████████████████▊                                                                                                                                   | 1870/7000 [08:14<23:28,  3.64it/s][A
Iteration:  27%|███████████████████████████████████████████████▊                                                                                                                                   | 1871/7000 [08:14<23:55,  3.57it/s][A
Iteration:  27%|███████████████████████████████████████████████▊                                                                                                                                   | 1872/7000 [08:14<23:55,  3.57it/s][A
Iteration:  27%|████████████████████████████████████████████

Iteration:  27%|████████████████████████████████████████████████▋                                                                                                                                  | 1903/7000 [08:23<21:58,  3.87it/s][A
Iteration:  27%|████████████████████████████████████████████████▋                                                                                                                                  | 1904/7000 [08:23<21:54,  3.88it/s][A
Iteration:  27%|████████████████████████████████████████████████▋                                                                                                                                  | 1905/7000 [08:23<21:50,  3.89it/s][A
Iteration:  27%|████████████████████████████████████████████████▋                                                                                                                                  | 1906/7000 [08:23<21:51,  3.89it/s][A
Iteration:  27%|████████████████████████████████████████████

Iteration:  28%|█████████████████████████████████████████████████▌                                                                                                                                 | 1937/7000 [08:31<21:30,  3.92it/s][A
Iteration:  28%|█████████████████████████████████████████████████▌                                                                                                                                 | 1938/7000 [08:32<21:30,  3.92it/s][A
Iteration:  28%|█████████████████████████████████████████████████▌                                                                                                                                 | 1939/7000 [08:32<21:38,  3.90it/s][A
Iteration:  28%|█████████████████████████████████████████████████▌                                                                                                                                 | 1940/7000 [08:32<21:33,  3.91it/s][A
Iteration:  28%|████████████████████████████████████████████

Iteration:  28%|██████████████████████████████████████████████████▍                                                                                                                                | 1971/7000 [08:40<23:27,  3.57it/s][A
Iteration:  28%|██████████████████████████████████████████████████▍                                                                                                                                | 1972/7000 [08:40<23:46,  3.53it/s][A
Iteration:  28%|██████████████████████████████████████████████████▍                                                                                                                                | 1973/7000 [08:41<23:55,  3.50it/s][A
Iteration:  28%|██████████████████████████████████████████████████▍                                                                                                                                | 1974/7000 [08:41<24:00,  3.49it/s][A
Iteration:  28%|████████████████████████████████████████████

Iteration:  29%|███████████████████████████████████████████████████▎                                                                                                                               | 2005/7000 [08:49<21:24,  3.89it/s][A
Iteration:  29%|███████████████████████████████████████████████████▎                                                                                                                               | 2006/7000 [08:49<21:26,  3.88it/s][A
Iteration:  29%|███████████████████████████████████████████████████▎                                                                                                                               | 2007/7000 [08:50<21:25,  3.88it/s][A
Iteration:  29%|███████████████████████████████████████████████████▎                                                                                                                               | 2008/7000 [08:50<21:25,  3.88it/s][A
Iteration:  29%|████████████████████████████████████████████

Iteration:  29%|████████████████████████████████████████████████████▏                                                                                                                              | 2039/7000 [08:58<21:23,  3.86it/s][A
Iteration:  29%|████████████████████████████████████████████████████▏                                                                                                                              | 2040/7000 [08:58<21:18,  3.88it/s][A
Iteration:  29%|████████████████████████████████████████████████████▏                                                                                                                              | 2041/7000 [08:58<21:18,  3.88it/s][A
Iteration:  29%|████████████████████████████████████████████████████▏                                                                                                                              | 2042/7000 [08:59<21:11,  3.90it/s][A
Iteration:  29%|████████████████████████████████████████████

Iteration:  30%|█████████████████████████████████████████████████████                                                                                                                              | 2073/7000 [09:07<21:07,  3.89it/s][A
Iteration:  30%|█████████████████████████████████████████████████████                                                                                                                              | 2074/7000 [09:07<21:17,  3.86it/s][A
Iteration:  30%|█████████████████████████████████████████████████████                                                                                                                              | 2075/7000 [09:07<21:11,  3.87it/s][A
Iteration:  30%|█████████████████████████████████████████████████████                                                                                                                              | 2076/7000 [09:07<21:01,  3.90it/s][A
Iteration:  30%|████████████████████████████████████████████

Iteration:  30%|█████████████████████████████████████████████████████▉                                                                                                                             | 2107/7000 [09:15<20:49,  3.92it/s][A
Iteration:  30%|█████████████████████████████████████████████████████▉                                                                                                                             | 2108/7000 [09:16<20:53,  3.90it/s][A
Iteration:  30%|█████████████████████████████████████████████████████▉                                                                                                                             | 2109/7000 [09:16<20:53,  3.90it/s][A
Iteration:  30%|█████████████████████████████████████████████████████▉                                                                                                                             | 2110/7000 [09:16<20:45,  3.92it/s][A
Iteration:  30%|████████████████████████████████████████████

Iteration:  31%|██████████████████████████████████████████████████████▋                                                                                                                            | 2141/7000 [09:24<21:07,  3.83it/s][A
Iteration:  31%|██████████████████████████████████████████████████████▊                                                                                                                            | 2142/7000 [09:25<21:01,  3.85it/s][A
Iteration:  31%|██████████████████████████████████████████████████████▊                                                                                                                            | 2143/7000 [09:25<21:02,  3.85it/s][A
Iteration:  31%|██████████████████████████████████████████████████████▊                                                                                                                            | 2144/7000 [09:25<20:55,  3.87it/s][A
Iteration:  31%|████████████████████████████████████████████

Iteration:  31%|███████████████████████████████████████████████████████▌                                                                                                                           | 2175/7000 [09:33<20:59,  3.83it/s][A
Iteration:  31%|███████████████████████████████████████████████████████▋                                                                                                                           | 2176/7000 [09:33<21:44,  3.70it/s][A
Iteration:  31%|███████████████████████████████████████████████████████▋                                                                                                                           | 2177/7000 [09:34<22:25,  3.58it/s][A
Iteration:  31%|███████████████████████████████████████████████████████▋                                                                                                                           | 2178/7000 [09:34<22:42,  3.54it/s][A
Iteration:  31%|████████████████████████████████████████████

Iteration:  32%|████████████████████████████████████████████████████████▍                                                                                                                          | 2209/7000 [09:42<20:52,  3.83it/s][A
Iteration:  32%|████████████████████████████████████████████████████████▌                                                                                                                          | 2210/7000 [09:42<21:06,  3.78it/s][A
Iteration:  32%|████████████████████████████████████████████████████████▌                                                                                                                          | 2211/7000 [09:43<21:00,  3.80it/s][A
Iteration:  32%|████████████████████████████████████████████████████████▌                                                                                                                          | 2212/7000 [09:43<20:54,  3.82it/s][A
Iteration:  32%|████████████████████████████████████████████

Iteration:  32%|█████████████████████████████████████████████████████████▎                                                                                                                         | 2243/7000 [09:51<20:56,  3.79it/s][A
Iteration:  32%|█████████████████████████████████████████████████████████▍                                                                                                                         | 2244/7000 [09:51<21:08,  3.75it/s][A
Iteration:  32%|█████████████████████████████████████████████████████████▍                                                                                                                         | 2245/7000 [09:52<20:55,  3.79it/s][A
Iteration:  32%|█████████████████████████████████████████████████████████▍                                                                                                                         | 2246/7000 [09:52<20:50,  3.80it/s][A
Iteration:  32%|████████████████████████████████████████████