# QaDialMoE run_healthmoe.py를 snippet 단위로 뜯어보기
###### 2023.04.11 화요일

In [1]:
!ml anaconda3

In [2]:
from __future__ import absolute_import, division, print_function
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # del
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import argparse
import random
import sys
import re
import json
import jsonlines
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset)
from tqdm import tqdm, trange
import torch.nn.functional
from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME
from transformers import RobertaTokenizer, RobertaConfig
from pytorch_pretrained_bert.optimization import BertAdam
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tensorboardX import SummaryWriter
from model import  RobertaMoEForSequenceClassification
import nltk
import logging
import pandas as pd
from tfidf_similarity import TfIdfSimilarity

# Module-level logger; handlers and level are configured later in main().
logger = logging.getLogger(__name__)

# Maps the textual label found in the CSV 'label' column to its integer class id.
LABELS = {"Supports":0, "Refutes":1, "Neutral":2}
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
!nvidia-smi

Thu Apr 13 23:51:36 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   29C    P0    30W / 250W |      2MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [4]:
class InputExample(object):
    """Plain container for one raw example, before any tokenization."""

    def __init__(self, idx, text_a, text_b=None, label=None,priori=None):
        """Store the raw fields of a single example.

        Args:
            idx:    unique id
            text_a: response/claim
            text_b: context+evidence
            label:  positive / negative / NEI
            priori: priori distribution over experts based on rules
        """
        self.idx, self.text_a, self.text_b = idx, text_a, text_b
        self.label, self.priori = label, priori

In [5]:
class InputFeatures(object):
    """Plain container for the model-ready features of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, priori):
        """Keep the given feature values as attributes of the same name."""
        fields = (
            ("input_ids", input_ids),
            ("input_mask", input_mask),
            ("segment_ids", segment_ids),
            ("label_id", label_id),
            ("priori", priori),
        )
        for attr_name, attr_value in fields:
            setattr(self, attr_name, attr_value)

# class DataProcessor

In [6]:
class DataProcessor(object):
    """Reads ``<dataset>.csv`` files and converts their rows to InputExample objects."""

    def get_examples(self, data_dir, dataset=None):
        """Load ``<data_dir>/<dataset>.csv`` and return one InputExample per row."""
        logger.info('Get examples from: {}.csv'.format(dataset))
        csv_path = os.path.join(data_dir, "{}.csv".format(dataset))
        rows = self._read_csv(csv_path)
        return self._create_examples(rows)

    def get_labels(self):
        """Return the list of label ids together with its length."""
        label_ids = [0, 1, 2]
        return label_ids, len([0, 1, 2])

    def _read_csv(cls, input_file):
        """Parse the CSV with pandas and return its rows as a list, one entry per row."""
        frame = pd.read_csv(input_file)
        return [frame.iloc[position] for position in range(len(frame))]

    def _create_examples(self, lines, max_evidences=5):
        """Build an InputExample from every row in ``lines``.

        Each row is indexed by the keys 'id', 'claim', 'question', 'evidence'
        and 'label'. ``max_evidences`` is accepted but not used here.
        """
        examples = []
        # Only the disabled get_priori(...) call would consume this object.
        obj = TfIdfSimilarity()
        for datapoint in tqdm(lines):
            # priori = get_priori(obj, text_a, datapoint['question'],datapoint['evidence'], T = 1)
            example = InputExample(
                idx=datapoint['id'],
                text_a=datapoint['claim'],
                # question and evidence are joined directly, with no separator
                text_b=datapoint['question'] + datapoint['evidence'],
                label=LABELS[datapoint['label']],
                # fixed placeholder prior; note the three values do not sum to 1
                priori=[0.3, 0.3, 0.3],
            )
            examples.append(example)
        return examples

In [7]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Tokenize every example and pad it to ``max_seq_length``.

    The sequence layout is ``<s> text_a </s> text_b </s>``; the pair is
    truncated first so the three special tokens still fit. Returns a list of
    InputFeatures, one per example, in input order.
    """
    label_map = {label: position for position, label in enumerate(label_list)}
    features = []

    for ex_index, example in enumerate(tqdm(examples, desc="convert to features")):
        label_id = label_map[example.label]

        tokens_a = tokenizer.tokenize(example.text_a)
        tokens_b = tokenizer.tokenize(example.text_b)
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

        first_part = ["<s>"] + tokens_a + ["</s>"]
        second_part = tokens_b + ["</s>"]
        tokens = first_part + second_part

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        pad_count = max_seq_length - len(input_ids)

        # Real tokens are marked 1 in the mask, padded positions 0.
        input_mask = [1] * len(input_ids) + [0] * pad_count
        # Padding value is 1 for both ids and segment ids.
        # NOTE(review): 1 is presumably the RoBERTa <pad> id; padding segment ids
        # with 1 as well looks unintended - confirm before relying on segment_ids.
        input_ids += [1] * pad_count
        segment_ids = [0] * len(first_part) + [1] * len(second_part) + [1] * pad_count

        for padded in (input_ids, input_mask, segment_ids):
            assert len(padded) == max_seq_length

        # Log only the very first example as a sanity check.
        if ex_index == 0:
            logger.info("*** Example ***")
            logger.info("tokens: %s" % " ".join(map(str, tokens)))
            logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
            logger.info("input_mask: %s" % " ".join(map(str, input_mask)))
            logger.info("segment_ids: %s" % " ".join(map(str, segment_ids)))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
                label_id=label_id,
                priori=example.priori,
            )
        )

    return features

In [8]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [9]:
def get_priori(obj, response, context, evidence, T = 1):
    """Compute a prior distribution over the three experts for one example.

    Starts from the base scores ``[0.2, 0.2, 0.6]``, adds
    ``0.2 * (1 - similarity(response, context))`` to the first entry and
    ``0.2 * (1 - similarity(response, evidence))`` to the second, then
    normalizes with ``softmax`` at temperature ``T``.

    Args:
        obj: object exposing ``cal_consine_similarities(text_1, text_2)``.
        response: response/claim text.
        context: context text compared against the response.
        evidence: evidence text compared against the response.
        T: softmax temperature.

    Returns:
        The list produced by ``softmax`` for the three adjusted scores.
    """
    score = [0.2,0.2,0.6]
    try:
        res_ctx_score = 0.2*(1 - obj.cal_consine_similarities(response, context))
        res_evi_score = 0.2*(1 - obj.cal_consine_similarities(response, evidence))
    except Exception:
        # Best effort: if either similarity cannot be computed, leave the base
        # scores untouched. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit still propagate.
        res_ctx_score = 0
        res_evi_score = 0
    score[0] += res_ctx_score
    score[1] += res_evi_score
    score = softmax(score,T)
    return score

In [10]:
def compute_metrics_fn(preds, labels):
    """Return accuracy plus macro F1 / recall / precision for the predictions.

    The macro averages are restricted to the label values that actually occur
    in ``labels``. ``preds`` and ``labels`` must have the same length.
    """
    assert len(preds) == len(labels)

    # Shared keyword arguments of the three macro-averaged scores.
    macro_kwargs = dict(y_true=labels, y_pred=preds, average="macro", labels=np.unique(labels))

    macro_f1 = f1_score(**macro_kwargs)
    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    macro_precision = precision_score(**macro_kwargs)
    macro_recall = recall_score(**macro_kwargs)

    metrics = {}
    metrics["acc"] = accuracy
    metrics["macro_f1"] = macro_f1
    metrics["macro_recall"] = macro_recall
    metrics["p"] = macro_precision
    return metrics

In [11]:
def mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Uses ``exist_ok=True`` instead of a separate existence check, so the call
    cannot fail when another process creates the directory between the check
    and the creation.
    """
    os.makedirs(path, exist_ok=True)

# get_dataLoader()

In [12]:
def get_dataLoader(args, processor, tokenizer, phase=None):
    """Build the DataLoader for one phase ("train", "dev" or "test").

    Loads the examples of the dataset configured for ``phase``, converts them
    to features and wraps input ids, attention mask, label ids and priors in a
    TensorDataset. The segment-id tensor is built and printed but is not part
    of the dataset.

    Returns:
        (dataloader, num_optimization_steps, examples)
    """
    print('*********************** in get_dataLoader() ***********************')
    dataset_dict = {"train": args.train_set, "dev": args.dev_set, "test": args.test_set}
    label_list, _ = processor.get_labels()
    print('[DAN] : dataset_dict', dataset_dict)
    print('[DAN] : label_list', label_list)

    examples = processor.get_examples(args.data_dir, dataset_dict[phase])
    features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer)
    print(len(examples), len(features))
    print(examples[0], features[0])
    print('[DAN] : examples', examples[0])
    print('[DAN] : features', features[0])

    # Training uses its own batch size and epoch count; every other phase is a single pass.
    is_train = phase == "train"
    if is_train:
        batch_size, epoch_num = args.train_batch_size, args.num_train_epochs
    else:
        batch_size, epoch_num = args.eval_batch_size, 1
    num_optimization_steps = int(len(examples) / batch_size / args.gradient_accumulation_steps) * epoch_num
    logger.info("Examples#: {}, Batch size: {}".format(len(examples), batch_size * args.gradient_accumulation_steps))
    logger.info("Total num of steps#: {}, Total num of epoch#: {}".format(num_optimization_steps, epoch_num))
    print('[DAN] : batch_size', batch_size)
    print('[DAN] : epoch_num', epoch_num)

    def stack(attr_name, dtype):
        # Collect one attribute across all features into a single tensor.
        return torch.tensor([getattr(feature, attr_name) for feature in features], dtype=dtype)

    all_input_ids = stack("input_ids", torch.long)
    all_input_mask = stack("input_mask", torch.long)
    all_segment_ids = stack("segment_ids", torch.long)
    all_label_ids = stack("label_id", torch.long)
    all_priori = stack("priori", torch.float)
    print('[DAN] : all_input_ids', all_input_ids[0])
    print('[DAN] : all_input_mask', all_input_mask[0])
    print('[DAN] : all_segment_ids', all_segment_ids[0])
    print('[DAN] : all_label_ids', all_label_ids[0])
    print('[DAN] : all_priori', all_priori[0])

    all_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_priori)
    # Shuffle only for ordinary training; keep file order for evaluation and
    # whenever --do_train_eval is set.
    if args.do_train_eval or not is_train:
        sampler = SequentialSampler(all_data)
    else:
        sampler = RandomSampler(all_data)
    dataloader = DataLoader(all_data, sampler=sampler, batch_size=batch_size)

    return dataloader, num_optimization_steps, examples

In [13]:
def save_model(model_to_save):
    """Write the model's state dict to ``<args.output_dir>/saved_model/<WEIGHTS_NAME>``.

    Reads the module-level ``args`` for the output directory and creates the
    target directory when needed.
    """
    target_dir = os.path.join(args.output_dir, 'saved_model')
    mkdir(target_dir)
    torch.save(
        model_to_save.state_dict(),
        os.path.join(target_dir, WEIGHTS_NAME),
        _use_new_zipfile_serialization=False,
    )

In [14]:
def softmax(input,T=1):
    """Return the temperature-scaled softmax of a sequence of scores as a list.

    Args:
        input: iterable of numeric scores (the name shadows the builtin but is
            kept so keyword callers keep working).
        T: temperature; every score is divided by ``T`` before exponentiation.

    Returns:
        A list with one probability per score; an empty list for empty input.
    """
    scaled = [value / T for value in input]
    if not scaled:
        return []
    # Shift by the maximum before exponentiating: the result is mathematically
    # unchanged, but np.exp can no longer overflow to inf (which gave nan).
    peak = max(scaled)
    exps = [np.exp(value - peak) for value in scaled]
    total = sum(exps)
    return [value / total for value in exps]

In [15]:
def is_count_number(num):
    """Tell whether ``num`` lies in the closed interval [0, 100]."""
    lower_bound, upper_bound = 0, 100
    return num >= lower_bound and num <= upper_bound

# run_train()

In [29]:
def run_train(device, processor, tokenizer, model, writer, phase="train"):
    """Train ``model`` on the training set, evaluating on dev periodically.

    The total loss is the loss returned by the model plus ``args.lmd`` times a
    KL-divergence "guide" loss between the log-softmax of the gate outputs and
    the per-example prior. Hyper-parameters are read from the module-level
    ``args``.

    When ``args.do_eval`` is set, the dev set is evaluated on the very first
    step and then whenever an optimizer update lands on a multiple of
    ``args.period``; the model is saved each time dev accuracy improves.

    Args:
        device: torch device the batches are moved to.
        processor: data processor handed to ``get_dataLoader``.
        tokenizer: tokenizer handed to ``get_dataLoader``.
        model: model returning ``(logits, loss, final_out_logits, origin_gates)``.
        writer: summary writer receiving the per-step training loss.
        phase: tag prefix for the logged loss.

    Returns:
        The number of optimizer updates performed.
    """
    logger.info("\n************ Start Training *************")

    tr_dataloader, tr_num_steps, _ = get_dataLoader(args, processor, tokenizer, phase="train")
    print('*********************** in run_train() ***********************')
    #print('[DAN] in run_train() : ',tr_dataloader, tr_num_steps, tr_examples)

    model.train()

    loss_fct = torch.nn.KLDivLoss(reduction='batchmean')
    print('[DAN] in run_train() : ', loss_fct)

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = \
        [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
         {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=tr_num_steps)
    print('[DAN] optimizer in run_train() : ', optimizer)
    optimizer.zero_grad()
    print('[DAN] optimizer after .zero_grad() in run_train() : ', optimizer)

    global_step = 0
    best_acc = 0.0
    n_gpu = torch.cuda.device_count()

    for ep in trange(args.num_train_epochs, desc="Training"):
        print('[DAN] ep in run_train() : ', ep)
        for step, batch in tqdm(enumerate(tr_dataloader)):
            print('[DAN] step: in run_train() ', step)
            print('[DAN] batch: in run_train() ', batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, label_ids, priori = batch
            logits, loss, final_out_logits, origin_gates = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            guide_loss = loss_fct(torch.nn.functional.log_softmax(origin_gates, dim=1), priori)
            loss += args.lmd * guide_loss
            if n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
                print('[DAN] loss : in run_train() ', loss)

            writer.add_scalar('{}/loss'.format(phase), loss.item(), global_step)

            loss.backward()
            del loss

            # Apply the accumulated gradients once every gradient_accumulation_steps batches.
            is_update_step = (step + 1) % args.gradient_accumulation_steps == 0
            if is_update_step:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            eval_is_due = is_update_step and global_step % args.period == 0
            is_first_batch = ep == 0 and step == 0
            if args.do_eval and (eval_is_due or is_first_batch):
                model_to_save = model.module if hasattr(model, 'module') else model

                # Switch to inference mode only while evaluating, and always
                # switch back: a failure inside evaluation must not leave
                # autograd globally disabled for whatever runs next.
                model.eval()
                torch.set_grad_enabled(False)
                try:
                    dev_acc, dev_recall = run_eval(device, processor, tokenizer, model, writer, global_step, tensorboard=True, phase="dev")
                    print('[DAN] : dev_acc, dev_recall in run_train() : ', dev_acc, dev_recall)
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                        logger.info(">> Save model. Best acc: {:.4}. Epoch {}".format(best_acc, ep))
                        save_model(model_to_save)  # save model
                        logger.info(">> Now the best acc is {:.4}\n, recall is {:.4}".format(dev_acc, dev_recall))
                finally:
                    model.train()
                    torch.set_grad_enabled(True)
    print('[DAN] : global_step in run_train() ', global_step)

    return global_step

# run_eval()

In [30]:
def run_eval(device, processor, tokenizer, model, writer, global_step, tensorboard=False,
             phase=None):
    """Evaluate ``model`` on the dataset of ``phase`` and report the metrics.

    Besides the metrics of the combined output (``final_out_logits``), the
    accuracy of every individual expert is reported as ``acc_<i>``. The
    per-example cross-entropy of each expert is dumped to
    ``./<phase>_moe_roberta_lmd_health.json`` (one row of stringified losses
    per example). Configuration is read from the module-level ``args``.

    Args:
        device: torch device the batches are moved to.
        processor: data processor handed to ``get_dataLoader``.
        tokenizer: tokenizer handed to ``get_dataLoader``.
        model: model returning ``(logits, loss, final_out_logits, origin_gates)``
            where ``logits`` holds one entry per expert.
        writer: summary writer; used only when ``tensorboard`` is true.
        global_step: step under which the metrics are logged.
        tensorboard: whether to write the metrics to ``writer``.
        phase: "train", "dev" or "test".

    Returns:
        ``(accuracy, macro_recall)`` of the combined prediction.

    The model is put back into training mode before returning.
    """
    sys.stdout.flush()
    logger.info("\n************ Start {} *************".format(phase))

    model.eval()

    loss_fct = torch.nn.KLDivLoss(reduction='batchmean')
    cross_entropy = nn.CrossEntropyLoss(reduction='none')

    dataloader, _, _ = get_dataLoader(args, processor, tokenizer, phase=phase)

    eval_loss = 0.0
    eval_guide_loss = 0.0
    num_steps = 0
    final_chunks = []      # per-batch outputs of the combined model
    expert_chunks = None   # expert_chunks[i] = per-batch probabilities of expert i
    all_labels = []
    mapping = []
    for batch in tqdm(dataloader, desc=phase):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, label_ids, priori = batch
        num_steps += 1

        with torch.no_grad():

            logits, tmp_loss, final_out_logits, origin_gates = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
            guide_loss = loss_fct(torch.nn.functional.log_softmax(origin_gates, dim=1), priori)

            eval_loss += tmp_loss.mean().item()
            eval_guide_loss += guide_loss.mean().item()

            # Per-example cross-entropy of every expert, one column per expert.
            # NOTE(review): squeeze(1) assumes each expert output carries a
            # singleton second dimension - confirm against the model.
            flat_labels = label_ids.view(-1)
            loss_mat = torch.cat(
                [cross_entropy(l.squeeze(1), flat_labels).view(-1, 1) for l in logits],
                dim=1)  # bsz * # of experts

            final_chunks.append(final_out_logits.detach().cpu().numpy())
            if expert_chunks is None:
                expert_chunks = [[] for _ in logits]
            for chunks, expert_logits in zip(expert_chunks, logits):
                probs = torch.nn.functional.softmax(expert_logits.squeeze(1), dim=1)
                chunks.append(probs.detach().cpu().numpy())

            all_labels.extend(label_ids.detach().cpu().numpy().tolist())
            # One row per example. The rows never contained example ids, so no
            # batch-offset lookup into the example list is needed.
            for row in loss_mat.detach().cpu().numpy().tolist():
                mapping.append([str(value) for value in row])

    eval_loss /= num_steps
    eval_guide_loss /= num_steps

    label_array = np.asarray(all_labels)
    preds = np.argmax(np.concatenate(final_chunks, axis=0), axis=1)
    result = compute_metrics_fn(preds, label_array)
    for expert_id, chunks in enumerate(expert_chunks):
        expert_preds = np.argmax(np.concatenate(chunks, axis=0), axis=1)
        result['acc_{}'.format(expert_id)] = compute_metrics_fn(expert_preds, label_array)['acc']
    result['{}_loss'.format(phase)] = eval_loss
    result['{}_guide_loss'.format(phase)] = eval_guide_loss
    result['global_step'] = global_step
    logger.info(result)
    if tensorboard and writer is not None:
        for key in sorted(result.keys()):
            writer.add_scalar('{}/{}'.format(phase, key), result[key], global_step)
    # Context manager so the file is flushed and closed deterministically.
    with open('./{}_moe_roberta_lmd_health.json'.format(phase), 'w', encoding='utf8') as dump_file:
        json.dump(mapping, dump_file)

    model.train()
    return result['acc'], result['macro_recall']

# main()

In [31]:
def main():
    """Set up logging, seeds, tokenizer and model, then train and evaluate.

    Everything is configured through the module-level ``args``.

    NOTE: ``args.train_batch_size`` is divided by
    ``args.gradient_accumulation_steps`` in place, so calling ``main()`` again
    with the same ``args`` object divides it again.
    """
    mkdir(args.output_dir)

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    writer = SummaryWriter(os.path.join(args.output_dir, 'events'))

    try:
        # device = torch.device("cuda")
        n_gpu = torch.cuda.device_count()

        # Seed every random number generator in use for reproducibility.
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

        save_code_log_path = args.output_dir

        logging.basicConfig(format='%(message)s', datefmt='%m/%d/%Y %H:%M', level=logging.INFO,
                            handlers=[logging.FileHandler("{0}/{1}.log".format(save_code_log_path, 'output')),
                                      logging.StreamHandler()])
        logger.info(args)
        logger.info("Command is: %s" % ' '.join(sys.argv))
        logger.info("Device: {}, n_GPU: {}".format(device, n_gpu))
        logger.info("Datasets are loaded from {}\nOutputs will be saved to {}\n".format(args.data_dir, args.output_dir))

        processor = DataProcessor()
        tokenizer = RobertaTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

        load_dir = args.load_dir if args.load_dir else args.bert_model
        logger.info('Model is loaded from %s' % load_dir)
        config = RobertaConfig.from_json_file(os.path.join(args.bert_model,'config.json'))
        model = RobertaMoEForSequenceClassification(config, num_public_layers=12, num_experts=3,num_labels=3, num_gate_layer=2)
        model.load_roberta(args.bert_model)
        if args.load_dir:
            # map_location lets a checkpoint saved on a GPU be restored on a
            # CPU-only machine (and vice versa).
            checkpoint_path = os.path.join(load_dir, 'pytorch_model.bin')
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))
            print('parameters loaded successfully.')
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model,device_ids=[0, 1])

        # NOTE: the --do_train guard below is disabled, so training always runs.
        #if args.do_train:
        run_train(device, processor, tokenizer, model, writer, phase="train")

        if args.do_eval:
            run_eval(device, processor, tokenizer, model, writer, global_step=0, tensorboard=False,
                     phase="dev")
            run_eval(device, processor, tokenizer, model, writer, global_step=0, tensorboard=False,
                     phase="test")

        if args.do_test:
            run_eval(device, processor, tokenizer, model, writer, global_step=0, tensorboard=False,
                     phase="test")

        if args.do_train_eval:
            run_eval(device, processor, tokenizer, model, writer, global_step=0, tensorboard=False,
                     phase="train")
    finally:
        # Flush and close the event file even when training or evaluation fails.
        writer.close()

# __name__ == '__main__'

In [32]:
if __name__ == "__main__":
    # Command-line configuration. Numeric options declare type=int so values
    # given on the command line are parsed as integers instead of staying
    # strings (the integer defaults are unaffected).
    parser = argparse.ArgumentParser()
    #print(os.getcwd())
    parser.add_argument("--do_train", action='store_true')
    parser.add_argument("--do_eval", action='store_true')
    parser.add_argument("--do_test", action='store_true')
    parser.add_argument("--do_train_eval", action='store_true')
    parser.add_argument("--add_unk", action='store_true')
    parser.add_argument("--load_dir", help="load model checkpoints")

    # Accept the "-f <connection file>" argument the Jupyter kernel launcher
    # passes; without it parse_args() fails with
    # "ipykernel_launcher.py: error: unrecognized arguments: -f".
    parser.add_argument('-f')
    from pathlib import Path
    #parser.add_argument("--data_dir", help="path to data", default='../data/healthver')
    parser.add_argument("--data_dir", help="path to data", default=os.path.abspath('../../healthver/data'))
    parser.add_argument("--train_set", default="healthver_train")
    parser.add_argument("--dev_set", default="healthver_dev")
    parser.add_argument("--test_set", default="healthver_test")
    parser.add_argument("--output_dir", default='./outputs_healthver')
    parser.add_argument("--cache_dir", default="./roberta", type=str, help="store downloaded pre-trained models")
    parser.add_argument('--period', type=int, default=1000)
    #parser.add_argument("--bert_model", default="../roberta_large", type=str)
    parser.add_argument("--bert_model", default=os.path.join(os.path.abspath('../../roberta-large')), type=str)
    parser.add_argument("--do_lower_case", default=True, help="Set this flag if you are using an uncased model.")
    parser.add_argument("--task_name", default="LPA", type=str)
    parser.add_argument('--response_tag', type=str, help='tag', default='response')
    parser.add_argument("--max_seq_length", type=int, default=512)
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--eval_batch_size", type=int, default=32)
    parser.add_argument('--debug_mode', action='store_true')
    parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
    #parser.add_argument("--num_train_epochs", default=20)
    parser.add_argument("--num_train_epochs", type=int, default=10)
    parser.add_argument("--lmd",default=0.1, type=float, help="the ratio of guide loss in the ttl loss")
    parser.add_argument("--warmup_proportion", default=0.3, type=float, help="0.1 = 10%% of training.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--seed', type=int, default=42, help="random seed")

In [33]:
# Parse the command line into the module-level `args` namespace that
# main(), run_train(), run_eval() and save_model() read.
args = parser.parse_args()

# Show the effective configuration.
from pprint import pprint
pprint(args)
    

Namespace(do_train=False, do_eval=False, do_test=False, do_train_eval=False, add_unk=False, load_dir=None, f='/data/home/acw722/.local/share/jupyter/runtime/kernel-193c23fd-93f6-437f-8df9-943e97a78e14.json', data_dir='/data/home/acw722/STAGE0/healthver/data', train_set='healthver_train', dev_set='healthver_dev', test_set='healthver_test', output_dir='./outputs_healthver', cache_dir='./roberta', period=1000, bert_model='/data/home/acw722/STAGE0/roberta-large', do_lower_case=True, task_name='LPA', response_tag='response', max_seq_length=512, train_batch_size=32, eval_batch_size=32, debug_mode=False, learning_rate=2e-05, num_train_epochs=10, lmd=0.1, warmup_proportion=0.3, gradient_accumulation_steps=1, seed=42)


# main() 파헤치기

In [None]:
# Run the script's entry point. `main` is defined elsewhere in the notebook
# (not visible in this chunk); the output captured below shows it printing the
# parsed arguments and then entering run_train().
# NOTE(review): the captured log reports "Device: cpu, n_GPU: 0" and roughly
# 2 minutes per step -- confirm that running on CPU is intended.
main()

Namespace(do_train=False, do_eval=False, do_test=False, do_train_eval=False, add_unk=False, load_dir=None, f='/data/home/acw722/.local/share/jupyter/runtime/kernel-193c23fd-93f6-437f-8df9-943e97a78e14.json', data_dir='/data/home/acw722/STAGE0/healthver/data', train_set='healthver_train', dev_set='healthver_dev', test_set='healthver_test', output_dir='./outputs_healthver', cache_dir='./roberta', period=1000, bert_model='/data/home/acw722/STAGE0/roberta-large', do_lower_case=True, task_name='LPA', response_tag='response', max_seq_length=512, train_batch_size=32, eval_batch_size=32, debug_mode=False, learning_rate=2e-05, num_train_epochs=10, lmd=0.1, warmup_proportion=0.3, gradient_accumulation_steps=1, seed=42)
Command is: /data/home/acw722/.conda/envs/condaenv/lib/python3.11/site-packages/ipykernel_launcher.py -f /data/home/acw722/.local/share/jupyter/runtime/kernel-193c23fd-93f6-437f-8df9-943e97a78e14.json
Device: cpu, n_GPU: 0
Datasets are loaded from /data/home/acw722/STAGE0/healthve

roberta.pooler.dense.weight
roberta.pooler.dense.bias
lm_head.bias
lm_head.dense.weight
lm_head.dense.bias
lm_head.layer_norm.weight
lm_head.layer_norm.bias
lm_head.decoder.weight
roberta loaded successfully.
*********************** in get_dataLoader() ***********************
[DAN] : dataset_dict {'train': 'healthver_train', 'dev': 'healthver_dev', 'test': 'healthver_test'}
[DAN] : label_list [0, 1, 2]


100%|██████████| 10590/10590 [00:00<00:00, 106527.44it/s]
convert to features:   0%|          | 0/10590 [00:00<?, ?it/s]*** Example ***
tokens: <s> Results Ġon Ġthe Ġuse Ġof Ġhydro xy chlor oqu ine Ġas Ġa Ġtreatment Ġfor ĠCov id - 19 Ġhave Ġshown Ġno Ġsignificant Ġdifferences Ġin Ġhealth Ġoutcomes Ġbetween Ġthe Ġcontrol Ġgroup Ġand Ġpatients Ġwho Ġreceived Ġthe Ġexperimental Ġdrug . Ġ </s> does Ġhydro xy chlor oqu ine Ġtreat ĠCO VID - 19 ? HC Q Ġshould Ġbe Ġprescribed Ġas Ġa Ġpart Ġof Ġtreatment Ġfor Ġcritically Ġill ĠCO VID - 19 Ġpatients , Ġwith Ġpossible Ġoutcome Ġof Ġsaving Ġlives . </s>
input_ids: 0 41981 15 5 304 9 13575 32027 39220 32689 833 25 10 1416 13 19150 808 12 1646 33 2343 117 1233 5550 11 474 7762 227 5 797 333 8 1484 54 829 5 14073 1262 4 1437 2 26692 13575 32027 39220 32689 833 3951 6247 43814 12 1646 116 13459 1864 197 28 14255 25 10 233 9 1416 13 11960 4812 6247 43814 12 1646 1484 6 19 678 4258 9 6549 1074 4 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

10590 10590
<__main__.InputExample object at 0x2b4b2a2744d0> <__main__.InputFeatures object at 0x2b4b238e2cd0>
[DAN] : examples <__main__.InputExample object at 0x2b4b2a2744d0>
[DAN] : features <__main__.InputFeatures object at 0x2b4b238e2cd0>
[DAN] : batch_size 32
[DAN] : epoch_num 10
[DAN] : all_input_ids tensor([    0, 41981,    15,     5,   304,     9, 13575, 32027, 39220, 32689,
          833,    25,    10,  1416,    13, 19150,   808,    12,  1646,    33,
         2343,   117,  1233,  5550,    11,   474,  7762,   227,     5,   797,
          333,     8,  1484,    54,   829,     5, 14073,  1262,     4,  1437,
            2, 26692, 13575, 32027, 39220, 32689,   833,  3951,  6247, 43814,
           12,  1646,   116, 13459,  1864,   197,    28, 14255,    25,    10,
          233,     9,  1416,    13, 11960,  4812,  6247, 43814,    12,  1646,
         1484,     6,    19,   678,  4258,     9,  6549,  1074,     4,     2,
            1,     1,     1,     1,     1,     1,     1,     1,    

Training:   0%|          | 0/10 [00:00<?, ?it/s]

[DAN] ep in run_train() :  0



0it [00:00, ?it/s][A

[DAN] step: in run_train()  0
[DAN] batch: in run_train()  [tensor([[    0, 29100, 29693,  ...,     1,     1,     1],
        [    0,   133,   795,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        ...,
        [    0,  6323,  7947,  ...,     1,     1,     1],
        [    0, 26251, 13827,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 2, 2, 0, 1, 0, 2, 0, 2, 0, 1, 2, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0,
        1, 0, 0, 1, 0, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at /home/conda/feedstock_root/build_artifacts/pytorch-recipe_1680607356962/work/torch/csrc/utils/python_arg_parser.cpp:1485.)
  next_m.mul_(beta1).add_(1 - beta1, grad)

1it [02:10, 130.15s/it][A

[DAN] step: in run_train()  1
[DAN] batch: in run_train()  [tensor([[    0,  1121,    10,  ...,     1,     1,     1],
        [    0,  5320,  5224,  ...,     1,     1,     1],
        [    0, 41981,    15,  ...,     1,     1,     1],
        ...,
        [    0,   627,   304,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 1, 2, 0, 1, 0, 1, 2, 0, 0, 2, 1, 0, 0, 2, 2, 0, 2, 2, 2, 2, 1, 1,
        2, 0, 0, 0, 2, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


2it [04:16, 127.69s/it][A

[DAN] step: in run_train()  2
[DAN] batch: in run_train()  [tensor([[    0, 30420, 46963,  ...,     1,     1,     1],
        [    0, 42866,   467,  ...,     1,     1,     1],
        [    0, 41262,   652,  ...,     1,     1,     1],
        ...,
        [    0,  3573,   534,  ...,     1,     1,     1],
        [    0, 16883, 17379,  ...,     1,     1,     1],
        [    0, 10232, 18957,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 2, 2, 2, 1, 1, 0, 0, 1, 0, 1, 2, 1, 0, 1, 1, 2, 2, 2, 0, 2, 2, 0,
        0, 2, 2, 2, 1, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


3it [06:21, 126.69s/it][A

[DAN] step: in run_train()  3
[DAN] batch: in run_train()  [tensor([[    0, 42866,   467,  ...,     1,     1,     1],
        [    0, 13738,  2258,  ...,     1,     1,     1],
        [    0,   133,  1484,  ...,     1,     1,     1],
        ...,
        [    0,  4892,  1054,  ...,     1,     1,     1],
        [    0,  3063,    12,  ...,     1,     1,     1],
        [    0, 18276, 20676,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 0, 2,
        1, 2, 2, 2, 2, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


4it [08:27, 126.34s/it][A

[DAN] step: in run_train()  4
[DAN] batch: in run_train()  [tensor([[    0, 34892, 17683,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 28565, 11281,  ...,     1,     1,     1],
        ...,
        [    0,  3972,  1524,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,  4763,    54,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 2, 0, 1, 1, 1, 2, 2, 0, 0, 1, 1, 1, 2, 0, 2, 1, 2, 1, 0, 2, 0,
        0, 2, 2, 2, 2, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


5it [10:32, 125.92s/it][A

[DAN] step: in run_train()  5
[DAN] batch: in run_train()  [tensor([[    0,   133,  2812,  ...,     1,     1,     1],
        [    0,   179, 43486,  ...,     1,     1,     1],
        [    0, 40179,  1538,  ...,     1,     1,     1],
        ...,
        [    0,   133,  9161,  ...,     1,     1,     1],
        [    0,  6209,    89,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 1, 0, 2, 0, 1, 1,
        2, 0, 0, 0, 2, 2, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


6it [12:38, 125.91s/it][A

[DAN] step: in run_train()  6
[DAN] batch: in run_train()  [tensor([[    0, 25101,   239,  ...,     1,     1,     1],
        [    0,   771,  4526,  ...,     1,     1,     1],
        [    0, 10836,    47,  ...,     1,     1,     1],
        ...,
        [    0, 31988, 44197,  ...,     1,     1,     1],
        [    0, 10836, 33104,  ...,     1,     1,     1],
        [    0, 40555,   625,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 0, 2, 1, 2, 2, 2, 2, 0, 1, 0, 2,
        1, 0, 0, 2, 0, 1, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


7it [14:43, 125.73s/it][A

[DAN] step: in run_train()  7
[DAN] batch: in run_train()  [tensor([[    0,   133,   275,  ...,     1,     1,     1],
        [    0,   565, 29340,  ...,     1,     1,     1],
        [    0,   250, 45837,  ...,     1,     1,     1],
        ...,
        [    0,   858,   892,  ...,     1,     1,     1],
        [    0, 46577,  7018,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 1, 1, 1, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 1, 1, 0, 2, 2, 2,
        1, 2, 2, 0, 1, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


8it [16:49, 125.77s/it][A

[DAN] step: in run_train()  8
[DAN] batch: in run_train()  [tensor([[    0,  1779,   951,  ...,     1,     1,     1],
        [    0,   104, 19625,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        ...,
        [    0, 37879,   429,  ...,     1,     1,     1],
        [    0,  7779, 11474,  ...,     1,     1,     1],
        [    0, 16767,  2603,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 2, 2, 2, 0, 2, 1, 2, 2, 1, 0, 2, 1, 2, 0, 1, 2, 1, 0, 2, 0, 1,
        0, 1, 2, 2, 2, 2, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


9it [18:55, 125.70s/it][A

[DAN] step: in run_train()  9
[DAN] batch: in run_train()  [tensor([[    0, 10836, 13256,  ...,     1,     1,     1],
        [    0,   627,   304,  ...,     1,     1,     1],
        [    0, 16767,  2603,  ...,     1,     1,     1],
        ...,
        [    0,   970,    33,  ...,     1,     1,     1],
        [    0, 11475,     5,  ...,     1,     1,     1],
        [    0,   133,  1136,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 1, 2,
        1, 2, 2, 1, 0, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


10it [21:01, 125.79s/it][A

[DAN] step: in run_train()  10
[DAN] batch: in run_train()  [tensor([[    0,   243,    16,  ...,     1,     1,     1],
        [    0,   347, 20126,  ...,     1,     1,     1],
        [    0,   627, 11825,  ...,     1,     1,     1],
        ...,
        [    0,   627,  4356,  ...,     1,     1,     1],
        [    0,   627,   304,  ...,     1,     1,     1],
        [    0,   104, 19625,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 1, 2, 1, 1, 2, 2, 0, 0, 2, 1, 2, 2, 1, 2,
        2, 2, 0, 0, 2, 2, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


11it [23:06, 125.63s/it][A

[DAN] step: in run_train()  11
[DAN] batch: in run_train()  [tensor([[    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 33120,  5842,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1],
        ...,
        [    0,  5970,    54,  ...,     1,     1,     1],
        [    0,   673, 13034,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 0, 0, 2, 0, 0, 1, 2, 2, 2, 1, 2, 0, 0, 0, 1, 2, 2, 2, 1, 1, 2, 0,
        0, 2, 2, 0, 2, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


12it [25:12, 125.68s/it][A

[DAN] step: in run_train()  12
[DAN] batch: in run_train()  [tensor([[    0,  1620,    10,  ...,     1,     1,     1],
        [    0, 29038, 42603,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        ...,
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 26799, 13941,  ...,     1,     1,     1],
        [    0,  6323,  6247,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 2, 1, 2, 0, 1,
        0, 2, 0, 0, 2, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


13it [27:17, 125.52s/it][A

[DAN] step: in run_train()  13
[DAN] batch: in run_train()  [tensor([[    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0,  1708,  2498,  ...,     1,     1,     1],
        ...,
        [    0,    29, 19625,  ...,     1,     1,     1],
        [    0, 37287, 16483,  ...,     1,     1,     1],
        [    0,   250, 45837,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 2, 1, 2, 1, 2, 2, 2, 0, 2, 0, 2, 2, 1, 2, 0, 0, 1, 0, 2, 0, 2,
        2, 2, 0, 0, 2, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


14it [29:23, 125.61s/it][A

[DAN] step: in run_train()  14
[DAN] batch: in run_train()  [tensor([[    0,   133,   623,  ...,     1,     1,     1],
        [    0,   627,   304,  ...,     1,     1,     1],
        [    0,  3084,     6,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 22649,   474,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 2, 2, 0, 0, 2, 2, 2, 0, 1, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 1, 2,
        0, 0, 2, 0, 2, 0, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


15it [31:28, 125.58s/it][A

[DAN] step: in run_train()  15
[DAN] batch: in run_train()  [tensor([[    0,  6323,  1134,  ...,     1,     1,     1],
        [    0,  5771,   627,  ...,     1,     1,     1],
        [    0, 15097,    32,  ...,     1,     1,     1],
        ...,
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0, 46905,   424,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 2, 2, 2, 0, 2, 1, 0, 0, 1, 0, 2, 0, 2, 0, 0, 1, 2, 1, 2, 0,
        0, 1, 0, 0, 0, 0, 1, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


16it [33:34, 125.69s/it][A

[DAN] step: in run_train()  16
[DAN] batch: in run_train()  [tensor([[    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   133,  6594,  ...,     1,     1,     1],
        [    0, 39807,   523,  ...,     1,     1,     1],
        ...,
        [    0, 12979, 44197,  ...,     1,     1,     1],
        [    0, 11428,  6157,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 1, 2, 2, 0, 2, 1, 0, 2,
        0, 2, 0, 0, 2, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


17it [35:40, 125.60s/it][A

[DAN] step: in run_train()  17
[DAN] batch: in run_train()  [tensor([[    0, 15228,   261,  ...,     1,     1,     1],
        [    0, 41981,    15,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1],
        ...,
        [    0, 25826,    90,  ...,     1,     1,     1],
        [    0,   133,  1049,  ...,     1,     1,     1],
        [    0,   170,   218,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 1, 2, 0, 2, 1, 0, 2, 1, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
        1, 0, 0, 1, 0, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


18it [37:46, 125.72s/it][A

[DAN] step: in run_train()  18
[DAN] batch: in run_train()  [tensor([[    0,   627,  4356,  ...,     1,     1,     1],
        [    0, 14043, 18957,  ...,     1,     1,     1],
        [    0,   170,   109,  ...,     1,     1,     1],
        ...,
        [    0,   970,    18,  ...,     1,     1,     1],
        [    0, 16883,  8456,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 2, 0, 2, 0, 1, 2, 1, 2, 1, 2, 0, 2, 0, 0, 2, 1, 1, 1, 0, 1, 0,
        2, 2, 0, 2, 0, 0, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


19it [39:51, 125.66s/it][A

[DAN] step: in run_train()  19
[DAN] batch: in run_train()  [tensor([[    0,  1993,    82,  ...,     1,     1,     1],
        [    0, 29723,    21,  ...,     1,     1,     1],
        [    0,  3972,   912,  ...,     1,     1,     1],
        ...,
        [    0, 10836, 13575,  ...,     1,     1,     1],
        [    0,   627, 17379,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 1, 2, 2, 0, 1, 0, 0, 1, 1, 0, 0, 0, 2, 2, 2, 2, 0, 2, 2, 0, 0,
        2, 2, 2, 2, 2, 0, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


20it [41:57, 125.68s/it][A

[DAN] step: in run_train()  20
[DAN] batch: in run_train()  [tensor([[    0, 30383,    34,  ...,     1,     1,     1],
        [    0, 40450,   303,  ...,     1,     1,     1],
        [    0,  3084,     6,  ...,     1,     1,     1],
        ...,
        [    0,   448, 40981,  ...,     1,     1,     1],
        [    0,  4763,    19,  ...,     1,     1,     1],
        [    0,   627,  4356,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 2, 0, 2, 0, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0, 2, 2, 0, 2, 2, 1, 2,
        1, 2, 1, 0, 1, 1, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


21it [44:03, 125.68s/it][A

[DAN] step: in run_train()  21
[DAN] batch: in run_train()  [tensor([[    0,  1106,    47,  ...,     1,     1,     1],
        [    0,   133,  3508,  ...,     1,     1,     1],
        [    0, 18522, 19899,  ...,     1,     1,     1],
        ...,
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0, 28970, 10017,  ...,     1,     1,     1],
        [    0, 37167,  1473,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 0, 0, 1, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 1,
        2, 0, 1, 2, 2, 0, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


276it [9:37:27, 129.10s/it][A

[DAN] step: in run_train()  276
[DAN] batch: in run_train()  [tensor([[    0, 37668, 20650,  ...,     1,     1,     1],
        [    0, 16991, 17379,  ...,     1,     1,     1],
        [    0,  9058,  1131,  ...,     1,     1,     1],
        ...,
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0, 10105,    82,  ...,     1,     1,     1],
        [    0,   133,   810,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 2, 1, 1, 0, 2, 0, 1, 2, 2, 0, 1, 0, 2, 2, 1, 0, 2, 2, 1, 0, 0,
        2, 2, 2, 1, 0, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


277it [9:39:37, 129.41s/it][A

[DAN] step: in run_train()  277
[DAN] batch: in run_train()  [tensor([[    0,   241, 37597,  ...,     1,     1,     1],
        [    0, 26251, 13827,  ...,     1,     1,     1],
        [    0,  1594,   110,  ...,     1,     1,     1],
        ...,
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,  2387, 11445,  ...,     1,     1,     1],
        [    0, 37167,    10,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 2, 2, 2, 0, 0, 1, 2, 1, 2, 2, 2, 2, 2, 0, 0, 1, 0, 2, 0, 1, 2, 2,
        1, 2, 0, 0, 1, 1, 1, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


278it [9:41:46, 129.30s/it][A

[DAN] step: in run_train()  278
[DAN] batch: in run_train()  [tensor([[    0,   133,  1484,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   616,   144,  ...,     1,     1,     1],
        ...,
        [    0, 10787,   910,  ...,     1,     1,     1],
        [    0,   100,   300,  ...,     1,     1,     1],
        [    0,  1620,    10,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 2, 0, 0, 2, 1, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 1, 2, 2, 0, 0, 1,
        1, 2, 1, 2, 2, 2, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


279it [9:43:56, 129.55s/it][A

[DAN] step: in run_train()  279
[DAN] batch: in run_train()  [tensor([[    0,   627,  6793,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 16883,  8456,  ...,     1,     1,     1],
        ...,
        [    0, 25826,    90,  ...,     1,     1,     1],
        [    0,   448, 41054,  ...,     1,     1,     1],
        [    0, 27669, 19150,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 2, 0, 2, 0, 2, 2, 0, 2, 1, 0, 1, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2,
        2, 1, 1, 0, 0, 2, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


280it [9:46:05, 129.38s/it][A

[DAN] step: in run_train()  280
[DAN] batch: in run_train()  [tensor([[    0, 41205,    16,  ...,     1,     1,     1],
        [    0, 34892, 17683,  ...,     1,     1,     1],
        [    0,   347,  1417,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 27004,    15,  ...,     1,     1,     1],
        [    0,  2709,  1246,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 1, 0, 2, 0, 1, 2, 2, 0, 1,
        0, 2, 1, 0, 2, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


281it [9:48:15, 129.49s/it][A

[DAN] step: in run_train()  281
[DAN] batch: in run_train()  [tensor([[    0,   705, 44780,  ...,     1,     1,     1],
        [    0,  4763,    19,  ...,     1,     1,     1],
        [    0,   250,   367,  ...,     1,     1,     1],
        ...,
        [    0, 27847, 38001,  ...,     1,     1,     1],
        [    0,  1121,    10,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 0, 2, 0, 2, 2, 0, 1, 1, 2, 2, 2, 0, 1, 1, 1, 2, 2, 1, 0, 2, 1, 0,
        2, 0, 0, 1, 2, 0, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


282it [9:50:24, 129.50s/it][A

[DAN] step: in run_train()  282
[DAN] batch: in run_train()  [tensor([[    0,  1708,   255,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0, 46905,   424,  ...,     1,     1,     1],
        ...,
        [    0,  1106,  7704,  ...,     1,     1,     1],
        [    0,  5771,  6398,  ...,     1,     1,     1],
        [    0, 17906, 35140,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 2, 2, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 2, 2, 2, 1, 0, 0, 0, 0, 2,
        0, 1, 0, 0, 0, 1, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


283it [9:52:34, 129.48s/it][A

[DAN] step: in run_train()  283
[DAN] batch: in run_train()  [tensor([[    0, 23329,  4458,  ...,     1,     1,     1],
        [    0, 28565, 11281,  ...,     1,     1,     1],
        [    0, 26251, 13827,  ...,     1,     1,     1],
        ...,
        [    0, 15852,  9077,  ...,     1,     1,     1],
        [    0,   970,    18,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 2, 1, 2, 2, 1, 2, 2, 0, 2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 0, 2, 0, 2,
        2, 0, 1, 1, 1, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


284it [9:54:43, 129.40s/it][A

[DAN] step: in run_train()  284
[DAN] batch: in run_train()  [tensor([[    0,  7605,    10,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0,   133,  2812,  ...,     1,     1,     1],
        ...,
        [    0,  1620,    10,  ...,     1,     1,     1],
        [    0,   250, 45837,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 0, 1, 1, 0, 1, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 1, 2, 2, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


285it [9:56:53, 129.50s/it][A

[DAN] step: in run_train()  285
[DAN] batch: in run_train()  [tensor([[    0, 44537, 46963,  ...,     1,     1,     1],
        [    0, 46905,   424,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        ...,
        [    0, 10836,    47,  ...,     1,     1,     1],
        [    0,   133,   623,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 0, 0, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 0, 1, 2, 2,
        2, 1, 2, 1, 2, 2, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


286it [9:59:02, 129.47s/it][A

[DAN] step: in run_train()  286
[DAN] batch: in run_train()  [tensor([[    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 22649,   474,  ...,     1,     1,     1],
        [    0,  1694,    33,  ...,     1,     1,     1],
        ...,
        [    0, 12465,    82,  ...,     1,     1,     1],
        [    0,   133,  1283,  ...,     1,     1,     1],
        [    0, 10975,  6335,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 0, 2, 0, 2, 0, 2, 0, 1, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 0, 1, 2,
        0, 1, 2, 1, 0, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


287it [10:01:12, 129.48s/it][A

[DAN] step: in run_train()  287
[DAN] batch: in run_train()  [tensor([[    0,   705, 44780,  ...,     1,     1,     1],
        [    0,  3084,     6,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        ...,
        [    0,  8338,    82,  ...,     1,     1,     1],
        [    0,  4030,   892,  ...,     1,     1,     1],
        [    0,   616,   144,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 0, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 2,
        2, 1, 2, 2, 1, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


288it [10:03:21, 129.53s/it][A

[DAN] step: in run_train()  288
[DAN] batch: in run_train()  [tensor([[    0,   771, 10852,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        ...,
        [    0, 42866,   467,  ...,     1,     1,     1],
        [    0,  4539,  3841,  ...,     1,     1,     1],
        [    0,  4763,    54,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 2, 2, 2, 1, 1, 2, 0, 2, 2, 1, 0, 2, 2, 1, 2, 0, 0, 0, 1, 1, 2, 0,
        0, 2, 1, 0, 1, 2, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


289it [10:05:31, 129.48s/it][A

[DAN] step: in run_train()  289
[DAN] batch: in run_train()  [tensor([[    0,  8170,  3240,  ...,     1,     1,     1],
        [    0, 34892, 17683,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        ...,
        [    0, 41981,    15,  ...,     1,     1,     1],
        [    0,   438,  1417,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 2, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2,
        0, 2, 2, 0, 1, 1, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


290it [10:07:41, 129.67s/it][A

[DAN] step: in run_train()  290
[DAN] batch: in run_train()  [tensor([[    0, 23675,    16,  ...,     1,     1,     1],
        [    0, 26840,   604,  ...,     1,     1,     1],
        [    0,  2895, 17960,  ...,     1,     1,     1],
        ...,
        [    0,   405,   189,  ...,     1,     1,     1],
        [    0,   700, 34377,  ...,     1,     1,     1],
        [    0,   627, 11825,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 0, 2, 2, 2, 1, 2, 0, 1, 1, 1, 0, 1, 2, 0, 2, 0, 1, 2, 2, 2, 0,
        0, 1, 1, 2, 0, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


291it [10:09:50, 129.49s/it][A

[DAN] step: in run_train()  291
[DAN] batch: in run_train()  [tensor([[    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   179, 25068,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0,   176,   475,  ...,     1,     1,     1],
        [    0,  1121, 17706,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 0, 2, 2, 2, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1, 2, 0, 1, 2, 2, 2, 0,
        1, 0, 2, 1, 1, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


292it [10:12:00, 129.65s/it][A

[DAN] step: in run_train()  292
[DAN] batch: in run_train()  [tensor([[    0,  5593, 10067,  ...,     1,     1,     1],
        [    0, 27847,  7406,  ...,     1,     1,     1],
        [    0,   243,    18,  ...,     1,     1,     1],
        ...,
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   133,   623,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 2, 2, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 2, 1, 0, 1, 1, 1, 2, 2,
        0, 2, 2, 2, 0, 1, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


293it [10:14:09, 129.49s/it][A

[DAN] step: in run_train()  293
[DAN] batch: in run_train()  [tensor([[    0, 10787,  6357,  ...,     1,     1,     1],
        [    0, 26840,    82,  ...,     1,     1,     1],
        [    0, 30420, 46963,  ...,     1,     1,     1],
        ...,
        [    0, 14043, 18957,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 0, 0, 1, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 0,
        1, 2, 2, 0, 0, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


294it [10:16:19, 129.75s/it][A

[DAN] step: in run_train()  294
[DAN] batch: in run_train()  [tensor([[    0,   133,   892,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0, 35490,    10,  ...,     1,     1,     1],
        ...,
        [    0, 10836,  2498,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,  4027,  3218,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 1, 2, 1, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0, 2, 0, 1, 2,
        2, 2, 0, 1, 2, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


295it [10:18:28, 129.46s/it][A

[DAN] step: in run_train()  295
[DAN] batch: in run_train()  [tensor([[    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0,   170,   218,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,  3750,    42,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 0, 0, 0, 1, 1, 2, 2, 1, 2, 0, 0, 0, 2, 1, 2, 2, 0, 2, 1, 2, 2,
        2, 1, 2, 2, 0, 1, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


296it [10:20:38, 129.65s/it][A

[DAN] step: in run_train()  296
[DAN] batch: in run_train()  [tensor([[    0, 42660,  1780,  ...,     1,     1,     1],
        [    0,   627,    86,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        ...,
        [    0, 33266,  3257,  ...,     1,     1,     1],
        [    0, 10724, 21543,  ...,     1,     1,     1],
        [    0, 24514, 20016,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 0, 2, 0, 2, 1, 0, 2, 1, 2, 0, 2, 2, 2, 0, 2, 1, 2, 0, 1, 0,
        0, 0, 2, 2, 2, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


297it [10:22:47, 129.53s/it][A

[DAN] step: in run_train()  297
[DAN] batch: in run_train()  [tensor([[    0,   970,    16,  ...,     1,     1,     1],
        [    0, 33596,  1484,  ...,     1,     1,     1],
        [    0, 35416,    89,  ...,     1,     1,     1],
        ...,
        [    0,  8170,  3240,  ...,     1,     1,     1],
        [    0, 31231,    47,  ...,     1,     1,     1],
        [    0, 18276, 20676,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 1, 1, 2, 2, 2, 1, 0, 1, 2, 1, 0, 0, 2, 0, 1, 2, 2, 0, 2, 2, 0,
        0, 2, 2, 0, 0, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


298it [10:24:57, 129.65s/it][A

[DAN] step: in run_train()  298
[DAN] batch: in run_train()  [tensor([[    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 10836,  6247,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        ...,
        [    0,  1620,    55,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 2, 0, 0, 2, 0, 2, 2, 2, 1, 2, 0, 1, 0, 2, 2, 0, 2, 2, 0, 2, 2, 1,
        2, 1, 0, 2, 0, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


299it [10:27:07, 129.54s/it][A

[DAN] step: in run_train()  299
[DAN] batch: in run_train()  [tensor([[    0, 19827,  6441,  ...,     1,     1,     1],
        [    0,   133,  2812,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0, 23996, 14805,  ...,     1,     1,     1],
        [    0, 29723,    21,  ...,     1,     1,     1],
        [    0, 37071,   121,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 1, 0, 1, 0, 0, 1, 1, 0, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0,
        1, 2, 0, 1, 2, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


325it [11:23:13, 129.43s/it][A

[DAN] step: in run_train()  325
[DAN] batch: in run_train()  [tensor([[    0,   705, 44780,  ...,     1,     1,     1],
        [    0,   133,  1907,  ...,     1,     1,     1],
        [    0, 26145,   594,  ...,     1,     1,     1],
        ...,
        [    0, 11428,  6157,  ...,     1,     1,     1],
        [    0, 46069, 28677,  ...,     1,     1,     1],
        [    0, 12979, 44197,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 1, 1, 0, 0, 2, 0, 0, 1, 0, 2, 1, 0, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2,
        0, 1, 2, 0, 1, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
    


2it [04:18, 129.00s/it][A

[DAN] step: in run_train()  2
[DAN] batch: in run_train()  [tensor([[    0, 10836,    47,  ...,     1,     1,     1],
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0,   347,  1417,  ...,     1,     1,     1],
        ...,
        [    0, 31097,  5194,  ...,     1,     1,     1],
        [    0,   133,  6247,  ...,     1,     1,     1],
        [    0, 11108, 11497,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 1, 0, 2, 2, 1, 2, 0, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 1, 0, 2, 1,
        2, 2, 1, 0, 0, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


3it [06:28, 129.46s/it][A

[DAN] step: in run_train()  3
[DAN] batch: in run_train()  [tensor([[    0,   133,  2812,  ...,     1,     1,     1],
        [    0,   510,  2580,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        ...,
        [    0, 10787, 19619,  ...,     1,     1,     1],
        [    0, 27326, 27415,  ...,     1,     1,     1],
        [    0, 37167,    10,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 2, 0, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 2, 1, 2, 2, 1, 2, 2, 0, 2,
        0, 0, 1, 1, 1, 0, 1, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


4it [08:37, 129.25s/it][A

[DAN] step: in run_train()  4
[DAN] batch: in run_train()  [tensor([[    0, 29182,    92,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1],
        ...,
        [    0, 29038, 42603,  ...,     1,     1,     1],
        [    0, 46069, 28677,  ...,     1,     1,     1],
        [    0,   133,  3038,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 0, 0, 2, 1, 2, 0, 2, 1, 0, 2, 0, 1, 1, 1, 1, 1, 0, 0, 0, 2, 1,
        0, 0, 0, 2, 1, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


5it [10:47, 129.49s/it][A

[DAN] step: in run_train()  5
[DAN] batch: in run_train()  [tensor([[    0,   133,  1484,  ...,     1,     1,     1],
        [    0,   133, 12833,  ...,     1,     1,     1],
        [    0,  1594,    47,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 25826,    90,  ...,     1,     1,     1],
        [    0, 39762,    12,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 1, 0, 2, 1, 2, 0, 0, 2, 0, 2,
        1, 0, 2, 2, 0, 0, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


6it [12:56, 129.35s/it][A

[DAN] step: in run_train()  6
[DAN] batch: in run_train()  [tensor([[    0,   133,  6793,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 10232, 18957,  ...,     1,     1,     1],
        ...,
        [    0,   627,   727,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 37879,   189,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 1, 2, 0, 1, 1, 0, 1, 0, 0, 0, 2, 0, 2, 1, 0, 0, 0, 2, 2, 1, 0, 0,
        0, 0, 0, 1, 0, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


7it [15:06, 129.54s/it][A

[DAN] step: in run_train()  7
[DAN] batch: in run_train()  [tensor([[    0,  1620,  2569,  ...,     1,     1,     1],
        [    0, 23996, 14805,  ...,     1,     1,     1],
        [    0,  6323,  7947,  ...,     1,     1,     1],
        ...,
        [    0, 25042,  1536,  ...,     1,     1,     1],
        [    0,  9690, 31172,  ...,     1,     1,     1],
        [    0, 16883, 14251,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 0, 0, 2, 1, 1, 2, 1, 0, 2, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 1, 2,
        0, 0, 1, 0, 2, 1, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


8it [17:15, 129.43s/it][A

[DAN] step: in run_train()  8
[DAN] batch: in run_train()  [tensor([[    0,  6070, 47182,  ...,     1,     1,     1],
        [    0,   970,    33,  ...,     1,     1,     1],
        [    0, 14229,     5,  ...,     1,     1,     1],
        ...,
        [    0, 10836,   408,  ...,     1,     1,     1],
        [    0,   347,  1417,  ...,     1,     1,     1],
        [    0, 46069, 28677,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 1, 0, 1, 2, 2,
        1, 1, 1, 0, 2, 0, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


9it [19:24, 129.46s/it][A

[DAN] step: in run_train()  9
[DAN] batch: in run_train()  [tensor([[    0, 42866,   467,  ...,     1,     1,     1],
        [    0,  1708,  8890,  ...,     1,     1,     1],
        [    0, 37167,    10,  ...,     1,     1,     1],
        ...,
        [    0, 46994, 47629,  ...,     1,     1,     1],
        [    0,  4763,   447,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 2, 0, 0, 1, 2, 1, 2, 1, 2, 0, 0, 0, 2, 0, 2, 1, 0, 1, 2, 2, 2,
        0, 2, 1, 0, 0, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
      


10it [21:34, 129.47s/it][A

[DAN] step: in run_train()  10
[DAN] batch: in run_train()  [tensor([[    0, 28285,  1484,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1],
        [    0,  3972,  1524,  ...,     1,     1,     1],
        ...,
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   510,  2580,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 0, 2, 0, 1, 1, 1, 2, 1, 1, 0, 1, 0, 2, 1, 2, 2, 2, 1, 2, 2, 1, 2,
        0, 0, 2, 2, 1, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


11it [23:43, 129.41s/it][A

[DAN] step: in run_train()  11
[DAN] batch: in run_train()  [tensor([[    0, 16991, 37662,  ...,     1,     1,     1],
        [    0,  1620,    55,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        ...,
        [    0,   133, 13387,  ...,     1,     1,     1],
        [    0,    29, 19625,  ...,     1,     1,     1],
        [    0, 16767,  2603,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 1, 1, 2, 0, 2, 1, 2, 1, 1, 2, 0, 2, 0, 0,
        0, 0, 2, 2, 2, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


12it [25:53, 129.50s/it][A

[DAN] step: in run_train()  12
[DAN] batch: in run_train()  [tensor([[    0, 21518,   762,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1],
        ...,
        [    0, 46905,   424,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 1, 1, 1, 0, 2, 1, 0, 2, 0, 1, 2, 2, 0, 2, 1, 2, 0, 0, 2, 2, 2, 2,
        1, 1, 1, 2, 2, 1, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


13it [28:02, 129.41s/it][A

[DAN] step: in run_train()  13
[DAN] batch: in run_train()  [tensor([[    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 12979, 44197,  ...,     1,     1,     1],
        [    0, 37172,     7,  ...,     1,     1,     1],
        ...,
        [    0,   771, 10852,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   565, 29340,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 0, 2, 2, 0, 0, 0, 1, 0, 0, 2, 0, 2, 2, 0, 1, 2, 2, 2, 1, 0, 1, 0,
        2, 2, 2, 2, 2, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


14it [30:12, 129.61s/it][A

[DAN] step: in run_train()  14
[DAN] batch: in run_train()  [tensor([[    0, 15228,   261,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0,   858,   892,  ...,     1,     1,     1],
        ...,
        [    0, 12861,   810,  ...,     1,     1,     1],
        [    0,   133,  6247,  ...,     1,     1,     1],
        [    0, 10836, 28793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 1, 0, 2, 0, 0, 2, 0, 2, 1, 0, 2, 0, 2, 0, 1, 0, 0, 1, 0, 2, 2, 2,
        1, 1, 2, 1, 1, 2, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


15it [32:21, 129.38s/it][A

[DAN] step: in run_train()  15
[DAN] batch: in run_train()  [tensor([[    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 19751, 43022,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        ...,
        [    0, 13863,    89,  ...,     1,     1,     1],
        [    0, 17906, 35140,  ...,     1,     1,     1],
        [    0, 37071,   121,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 0, 0, 0, 2, 2, 0, 0, 2, 0, 1, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 0, 2,
        0, 2, 2, 0, 0, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


16it [34:31, 129.59s/it][A

[DAN] step: in run_train()  16
[DAN] batch: in run_train()  [tensor([[    0,  3908,     5,  ...,     1,     1,     1],
        [    0,   250,  4946,  ...,     1,     1,     1],
        [    0, 17143,    11,  ...,     1,     1,     1],
        ...,
        [    0,  8585,    16,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0, 14696, 12081,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 0, 1, 1, 2, 0, 2, 0, 1, 1, 1, 2, 0, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2,
        1, 2, 1, 2, 0, 1, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


17it [36:40, 129.34s/it][A

[DAN] step: in run_train()  17
[DAN] batch: in run_train()  [tensor([[    0,   104, 22210,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1],
        [    0,   133,   778,  ...,     1,     1,     1],
        ...,
        [    0, 35193,  5298,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 0, 1, 1, 0, 2, 0, 2, 0, 0, 1, 1, 0,
        2, 0, 0, 1, 2, 1, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


18it [38:50, 129.56s/it][A

[DAN] step: in run_train()  18
[DAN] batch: in run_train()  [tensor([[    0,  1121,    41,  ...,     1,     1,     1],
        [    0,  6209, 40221,  ...,     1,     1,     1],
        [    0,  7199,   129,  ...,     1,     1,     1],
        ...,
        [    0, 15097,    32,  ...,     1,     1,     1],
        [    0,  5771,     5,  ...,     1,     1,     1],
        [    0,  1264,   678,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 0, 1, 2, 1, 2, 2, 0, 0, 2, 0, 1, 2, 0,
        0, 1, 1, 2, 0, 2, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


19it [40:59, 129.41s/it][A

[DAN] step: in run_train()  19
[DAN] batch: in run_train()  [tensor([[    0,  1121,    10,  ...,     1,     1,     1],
        [    0,   133, 39500,  ...,     1,     1,     1],
        [    0,   133,   623,  ...,     1,     1,     1],
        ...,
        [    0, 23996, 34144,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1],
        [    0,   495, 17821,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 2, 2, 2, 1, 2, 0, 1, 2, 0, 2, 1, 1, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0,
        2, 0, 0, 0, 2, 0, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


20it [43:09, 129.55s/it][A

[DAN] step: in run_train()  20
[DAN] batch: in run_train()  [tensor([[    0, 44537, 46963,  ...,     1,     1,     1],
        [    0,  6209,    89,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1],
        ...,
        [    0, 19827,  6441,  ...,     1,     1,     1],
        [    0,   104, 22210,  ...,     1,     1,     1],
        [    0, 10836, 47379,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0,
        2, 2, 1, 0, 2, 0, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


21it [45:18, 129.37s/it][A

[DAN] step: in run_train()  21
[DAN] batch: in run_train()  [tensor([[    0,   250,  4069,  ...,     1,     1,     1],
        [    0, 27735,  5842,  ...,     1,     1,     1],
        [    0, 35490, 17379,  ...,     1,     1,     1],
        ...,
        [    0,   417,  3463,  ...,     1,     1,     1],
        [    0,  6209,    24,  ...,     1,     1,     1],
        [    0, 14175, 12102,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 1, 0, 0, 0, 2, 1, 2, 1, 0, 1, 0, 2, 1, 0, 0, 2, 0, 0, 1, 1, 2, 0,
        2, 0, 0, 0, 0, 0, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


22it [47:27, 129.47s/it][A

[DAN] step: in run_train()  22
[DAN] batch: in run_train()  [tensor([[    0,   627, 47268,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        ...,
        [    0,  9981, 12478,  ...,     1,     1,     1],
        [    0,   597,  6294,  ...,     1,     1,     1],
        [    0, 26369,  2964,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 0, 2, 2, 0, 1, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 0, 2, 2, 1, 1, 2,
        0, 2, 0, 2, 0, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


23it [49:36, 129.21s/it][A

[DAN] step: in run_train()  23
[DAN] batch: in run_train()  [tensor([[    0,   133,  6793,  ...,     1,     1,     1],
        [    0, 18276, 20676,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1],
        ...,
        [    0, 16767,  2603,  ...,     1,     1,     1],
        [    0,   970,   855,  ...,     1,     1,     1],
        [    0, 25826,    90,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 1, 2, 1, 0, 1, 0, 0, 2, 0, 0, 2, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1,
        1, 0, 0, 0, 1, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


24it [51:46, 129.42s/it][A

[DAN] step: in run_train()  24
[DAN] batch: in run_train()  [tensor([[    0,   970,    33,  ...,     1,     1,     1],
        [    0,   771, 10852,  ...,     1,     1,     1],
        [    0, 31157, 10067,  ...,     1,     1,     1],
        ...,
        [    0, 18276, 20676,  ...,     1,     1,     1],
        [    0,  6323,  1668,  ...,     1,     1,     1],
        [    0,   243,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 1, 1, 1, 0, 0, 0, 0, 2, 1,
        1, 0, 2, 2, 2, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


25it [53:54, 129.16s/it][A

[DAN] step: in run_train()  25
[DAN] batch: in run_train()  [tensor([[    0,   133,  6793,  ...,     1,     1,     1],
        [    0, 33402,  6435,  ...,     1,     1,     1],
        [    0,   133,  3652,  ...,     1,     1,     1],
        ...,
        [    0,  6209, 40221,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 16767,  2603,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 0, 2, 1, 2, 0, 1, 1, 2, 2, 1, 0, 1, 1, 2, 0, 1, 1, 0, 2, 2, 1,
        0, 1, 1, 1, 1, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


26it [56:04, 129.41s/it][A

[DAN] step: in run_train()  26
[DAN] batch: in run_train()  [tensor([[    0, 15228,   261,  ...,     1,     1,     1],
        [    0,   133,  1283,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0,    90, 15975,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 10836,    47,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 2, 2, 1, 0, 0, 0, 1, 0, 0, 2, 0, 2, 0, 0,
        2, 2, 0, 1, 1, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


27it [58:13, 129.19s/it][A

[DAN] step: in run_train()  27
[DAN] batch: in run_train()  [tensor([[    0,   970,    16,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 18348,  1452,  ...,     1,     1,     1],
        ...,
        [    0,   597,  6294,  ...,     1,     1,     1],
        [    0, 37666,  3218,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 1, 1, 1, 2, 0, 1, 1, 0, 2, 2, 2,
        2, 2, 2, 1, 2, 1, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


28it [1:00:23, 129.24s/it][A

[DAN] step: in run_train()  28
[DAN] batch: in run_train()  [tensor([[    0, 40437, 26672,  ...,     1,     1,     1],
        [    0,    29,  2726,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1],
        ...,
        [    0, 23675,    16,  ...,     1,     1,     1],
        [    0,   725, 44206,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 1, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0,
        2, 1, 0, 0, 2, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


29it [1:02:31, 129.17s/it][A

[DAN] step: in run_train()  29
[DAN] batch: in run_train()  [tensor([[    0, 15228,   261,  ...,     1,     1,     1],
        [    0, 13841,    64,  ...,     1,     1,     1],
        [    0,  5970,    54,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 40437, 26672,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 1,
        2, 1, 2, 2, 1, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


30it [1:04:41, 129.18s/it][A

[DAN] step: in run_train()  30
[DAN] batch: in run_train()  [tensor([[    0,   243,    16,  ...,     1,     1,     1],
        [    0,   438,  2279,  ...,     1,     1,     1],
        [    0,  9058,  1131,  ...,     1,     1,     1],
        ...,
        [    0, 34892, 17683,  ...,     1,     1,     1],
        [    0,   133,  6247,  ...,     1,     1,     1],
        [    0,  1121, 17706,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 0, 2, 2, 2, 1, 2, 2, 2, 1, 0, 2, 1, 0, 1, 2, 2, 0, 0, 2, 2, 0,
        2, 0, 0, 2, 0, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


31it [1:06:50, 129.20s/it][A

[DAN] step: in run_train()  31
[DAN] batch: in run_train()  [tensor([[    0, 27735,  5842,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0,   133,  6247,  ...,     1,     1,     1],
        [    0, 37879,    16,  ...,     1,     1,     1],
        [    0, 29788,  1152,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 1, 2, 2, 0, 0, 1, 0, 2, 2, 1,
        0, 0, 0, 0, 2, 2, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


32it [1:08:59, 129.14s/it][A

[DAN] step: in run_train()  32
[DAN] batch: in run_train()  [tensor([[    0, 35416,    89,  ...,     1,     1,     1],
        [    0, 26145,   594,  ...,     1,     1,     1],
        [    0, 13755,  1614,  ...,     1,     1,     1],
        ...,
        [    0,  6323,   121,  ...,     1,     1,     1],
        [    0, 46069, 28677,  ...,     1,     1,     1],
        [    0, 26145,   594,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 1, 2, 0, 2, 2, 1, 0, 0, 1, 1, 2, 0, 0, 2, 0, 2, 1, 2, 0, 2, 2, 0,
        2, 1, 0, 2, 2, 2, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


33it [1:11:08, 129.23s/it][A

[DAN] step: in run_train()  33
[DAN] batch: in run_train()  [tensor([[    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 18348,  1452,  ...,     1,     1,     1],
        ...,
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,  1185,    64,  ...,     1,     1,     1],
        [    0,  2409,    89,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 2, 0, 0, 0, 0, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 1, 0,
        2, 1, 0, 2, 2, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


34it [1:13:17, 129.13s/it][A

[DAN] step: in run_train()  34
[DAN] batch: in run_train()  [tensor([[    0,  5016, 38636,  ...,     1,     1,     1],
        [    0,   771,  4526,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1],
        ...,
        [    0, 14043, 18957,  ...,     1,     1,     1],
        [    0,  1708,  8890,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 2, 2, 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 2, 1, 2, 2, 2, 0, 1, 0, 0,
        2, 2, 2, 2, 0, 2, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


35it [1:15:27, 129.34s/it][A

[DAN] step: in run_train()  35
[DAN] batch: in run_train()  [tensor([[    0, 33780,    82,  ...,     1,     1,     1],
        [    0,  1185,    64,  ...,     1,     1,     1],
        [    0, 10787,   910,  ...,     1,     1,     1],
        ...,
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1],
        [    0, 11970,  3606,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 0, 1, 2, 2, 2, 2, 1, 2, 2, 2, 0, 2, 1, 0, 0, 1, 2, 2, 1, 1, 0,
        2, 0, 0, 2, 0, 2, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


36it [1:17:36, 129.22s/it][A

[DAN] step: in run_train()  36
[DAN] batch: in run_train()  [tensor([[    0,   438,  2279,  ...,     1,     1,     1],
        [    0, 32251,    16,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        ...,
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   510,  3181,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 2, 2, 0, 2, 1, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1, 0, 0, 2, 1, 2,
        2, 1, 2, 2, 2, 1, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


37it [1:19:46, 129.38s/it][A

[DAN] step: in run_train()  37
[DAN] batch: in run_train()  [tensor([[    0, 39762,    12,  ...,     1,     1,     1],
        [    0, 31921,  8616,  ...,     1,     1,     1],
        [    0,  1708,  8890,  ...,     1,     1,     1],
        ...,
        [    0,  1708,  8890,  ...,     1,     1,     1],
        [    0, 37167,    10,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 2, 2, 2, 0, 1, 2, 1, 0, 1, 0, 2, 0, 0,
        1, 1, 1, 0, 2, 2, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


38it [1:21:54, 129.11s/it][A

[DAN] step: in run_train()  38
[DAN] batch: in run_train()  [tensor([[    0,   534,  6082,  ...,     1,     1,     1],
        [    0,   970,    32,  ...,     1,     1,     1],
        [    0,   642,  2580,  ...,     1,     1,     1],
        ...,
        [    0, 19751, 43022,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0, 10787,  6357,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 0, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 2,
        2, 1, 2, 0, 1, 1, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


39it [1:24:04, 129.31s/it][A

[DAN] step: in run_train()  39
[DAN] batch: in run_train()  [tensor([[    0,  1121,   171,  ...,     1,     1,     1],
        [    0, 31921,  8616,  ...,     1,     1,     1],
        [    0,   970,    18,  ...,     1,     1,     1],
        ...,
        [    0, 44537, 46963,  ...,     1,     1,     1],
        [    0,   133,  9161,  ...,     1,     1,     1],
        [    0, 16767,  2603,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0, 0, 1, 2, 2, 1, 2, 0, 0, 2, 1, 2, 0,
        1, 0, 2, 2, 0, 1, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


40it [1:26:13, 129.15s/it][A

[DAN] step: in run_train()  40
[DAN] batch: in run_train()  [tensor([[    0, 15228,   261,  ...,     1,     1,     1],
        [    0,  1121,   937,  ...,     1,     1,     1],
        [    0,  4027,  3218,  ...,     1,     1,     1],
        ...,
        [    0,  3084,  1262,  ...,     1,     1,     1],
        [    0,  1779,     5,  ...,     1,     1,     1],
        [    0,  5320,  5224,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 0, 0, 2, 0, 2, 0, 2, 1, 1, 2, 2, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
        2, 1, 0, 2, 2, 0, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


41it [1:28:23, 129.36s/it][A

[DAN] step: in run_train()  41
[DAN] batch: in run_train()  [tensor([[    0,   133, 45837,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,  4148,   902,  ...,     1,     1,     1],
        ...,
        [    0, 15852,   239,  ...,     1,     1,     1],
        [    0,   133, 20843,  ...,     1,     1,     1],
        [    0, 12861,   810,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 0, 2, 2, 1, 1, 1, 1, 2, 2, 2, 0, 0, 1, 2,
        2, 2, 0, 2, 2, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


42it [1:30:31, 129.07s/it][A

[DAN] step: in run_train()  42
[DAN] batch: in run_train()  [tensor([[    0, 14696, 12081,  ...,     1,     1,     1],
        [    0,   970,  1534,  ...,     1,     1,     1],
        [    0,   133,   275,  ...,     1,     1,     1],
        ...,
        [    0, 46905,   424,  ...,     1,     1,     1],
        [    0, 14043, 18957,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 1, 1, 0, 2, 2, 0, 2, 0, 2, 0, 1, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2,
        0, 2, 2, 0, 0, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


43it [1:32:41, 129.31s/it][A

[DAN] step: in run_train()  43
[DAN] batch: in run_train()  [tensor([[    0,  6209,    89,  ...,     1,     1,     1],
        [    0,  4539, 15610,  ...,     1,     1,     1],
        [    0,   495, 17821,  ...,     1,     1,     1],
        ...,
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0,  4993,    70,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 1, 1, 1, 2, 1, 2, 0, 1, 2, 0, 1, 0, 0, 1, 2, 2, 2, 0, 2, 2, 2, 2,
        0, 0, 2, 0, 2, 0, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


44it [1:34:50, 129.13s/it][A

[DAN] step: in run_train()  44
[DAN] batch: in run_train()  [tensor([[    0, 28565,  7018,  ...,     1,     1,     1],
        [    0,  3762,  2260,  ...,     1,     1,     1],
        [    0,  1121, 17706,  ...,     1,     1,     1],
        ...,
        [    0,   771,  8141,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1],
        [    0,  3573,   534,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 1, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 2, 0, 2, 0, 2, 2, 2, 1, 1, 2,
        0, 1, 0, 2, 2, 0, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


45it [1:37:00, 129.38s/it][A

[DAN] step: in run_train()  45
[DAN] batch: in run_train()  [tensor([[    0,  2362,  3218,  ...,     1,     1,     1],
        [    0,   368,   261,  ...,     1,     1,     1],
        [    0, 26145,  4467,  ...,     1,     1,     1],
        ...,
        [    0,  9690, 31172,  ...,     1,     1,     1],
        [    0,   438,  2279,  ...,     1,     1,     1],
        [    0, 23329,  4458,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 0, 1, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 1, 2, 1, 0, 1, 1, 0,
        1, 2, 1, 2, 0, 0, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


46it [1:39:08, 129.13s/it][A

[DAN] step: in run_train()  46
[DAN] batch: in run_train()  [tensor([[    0,   133,  6953,  ...,     1,     1,     1],
        [    0, 21518,   762,  ...,     1,     1,     1],
        [    0, 31336,  1484,  ...,     1,     1,     1],
        ...,
        [    0, 39914, 23862,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0,   133,  3551,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 2, 0, 2, 0, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1,
        1, 2, 2, 1, 2, 1, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


47it [1:41:18, 129.36s/it][A

[DAN] step: in run_train()  47
[DAN] batch: in run_train()  [tensor([[    0,  4539, 15610,  ...,     1,     1,     1],
        [    0, 23996, 34144,  ...,     1,     1,     1],
        [    0,   241,  1090,  ...,     1,     1,     1],
        ...,
        [    0,   616,   144,  ...,     1,     1,     1],
        [    0, 39914, 23862,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 2, 0, 1, 0, 0, 0, 1, 2, 0, 2, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 0,
        0, 2, 2, 2, 1, 2, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


48it [1:43:27, 129.18s/it][A

[DAN] step: in run_train()  48
[DAN] batch: in run_train()  [tensor([[    0,   133,  9161,  ...,     1,     1,     1],
        [    0,   438,  1417,  ...,     1,     1,     1],
        [    0, 27847,    83,  ...,     1,     1,     1],
        ...,
        [    0, 30420, 46963,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 0, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, 1, 0, 0, 1, 2, 0, 2, 1, 2, 2,
        2, 2, 1, 0, 0, 1, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


49it [1:45:37, 129.39s/it][A

[DAN] step: in run_train()  49
[DAN] batch: in run_train()  [tensor([[    0,   405,    16,  ...,     1,     1,     1],
        [    0,  2590, 13214,  ...,     1,     1,     1],
        [    0,   133,  3038,  ...,     1,     1,     1],
        ...,
        [    0,   448, 41054,  ...,     1,     1,     1],
        [    0, 40948,    12,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 1, 0, 2, 1, 2, 0, 2, 2, 0, 1, 0,
        1, 1, 0, 0, 0, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


50it [1:47:45, 129.21s/it][A

[DAN] step: in run_train()  50
[DAN] batch: in run_train()  [tensor([[    0,  1594,    47,  ...,     1,     1,     1],
        [    0,   347, 34298,  ...,     1,     1,     1],
        [    0,   104, 22210,  ...,     1,     1,     1],
        ...,
        [    0, 44537, 46963,  ...,     1,     1,     1],
        [    0,   241,  1090,  ...,     1,     1,     1],
        [    0, 14696, 12081,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 0, 0, 1, 2, 1, 2, 0, 2, 2, 0, 2, 1, 1,
        0, 0, 1, 1, 2, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


51it [1:49:55, 129.37s/it][A

[DAN] step: in run_train()  51
[DAN] batch: in run_train()  [tensor([[    0,  1121,     5,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        [    0, 10836,    10,  ...,     1,     1,     1],
        ...,
        [    0, 35469,    10,  ...,     1,     1,     1],
        [    0,   970,    16,  ...,     1,     1,     1],
        [    0,   846,   853,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 2, 0, 2, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 2, 0, 0, 1, 0, 2, 0, 0, 0,
        2, 1, 2, 2, 2, 1, 0, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


52it [1:52:04, 129.18s/it][A

[DAN] step: in run_train()  52
[DAN] batch: in run_train()  [tensor([[    0,   642,  2580,  ...,     1,     1,     1],
        [    0, 31988, 44197,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1],
        ...,
        [    0, 29038,  6166,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1],
        [    0,   179,     5,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 1, 1, 2, 1, 0, 0, 2, 2, 2, 2, 1, 0, 0, 0, 0, 2, 2, 1, 2, 1, 0,
        0, 2, 0, 2, 0, 1, 1, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


53it [1:54:14, 129.50s/it][A

[DAN] step: in run_train()  53
[DAN] batch: in run_train()  [tensor([[    0, 40555,   625,  ...,     1,     1,     1],
        [    0, 31812,  5737,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0,   510,  2580,  ...,     1,     1,     1],
        [    0,  9089, 42286,  ...,     1,     1,     1],
        [    0,   133,  5808,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 2, 0, 1, 2, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 0, 1, 0, 0, 2, 0,
        0, 1, 1, 2, 2, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


54it [1:56:23, 129.27s/it][A

[DAN] step: in run_train()  54
[DAN] batch: in run_train()  [tensor([[    0, 16767,  2603,  ...,     1,     1,     1],
        [    0,   133, 13387,  ...,     1,     1,     1],
        [    0, 10777,    12,  ...,     1,     1,     1],
        ...,
        [    0,   438,  1417,  ...,     1,     1,     1],
        [    0,   133,  6247,  ...,     1,     1,     1],
        [    0, 17485, 30156,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 1, 0, 1, 0, 2, 2, 0, 1, 1, 2, 1, 0, 0, 0, 2, 0, 0, 0, 2, 1, 2,
        2, 2, 2, 0, 2, 1, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


55it [1:58:32, 129.16s/it][A

[DAN] step: in run_train()  55
[DAN] batch: in run_train()  [tensor([[    0,   771, 10852,  ...,     1,     1,     1],
        [    0,  6323,    82,  ...,     1,     1,     1],
        [    0,   133,  6793,  ...,     1,     1,     1],
        ...,
        [    0, 26840,    82,  ...,     1,     1,     1],
        [    0,  7215,   261,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 1, 0, 1, 0, 0, 1, 2, 1, 2, 0, 1, 0, 0, 0, 2, 2, 1, 0, 2, 0, 2,
        0, 0, 0, 0, 1, 2, 1, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


56it [2:00:41, 129.07s/it][A

[DAN] step: in run_train()  56
[DAN] batch: in run_train()  [tensor([[    0,   605, 10852,  ...,     1,     1,     1],
        [    0,   448, 41054,  ...,     1,     1,     1],
        [    0,   133,  2608,  ...,     1,     1,     1],
        ...,
        [    0,   245,   534,  ...,     1,     1,     1],
        [    0, 38191,  1650,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 2, 1, 2, 2, 1, 2, 1, 0, 2, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 1, 1,
        0, 1, 0, 1, 1, 1, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


57it [2:02:50, 129.01s/it][A

[DAN] step: in run_train()  57
[DAN] batch: in run_train()  [tensor([[    0,   250,  8074,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 15852,    97,  ...,     1,     1,     1],
        ...,
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 16883, 34154,  ...,     1,     1,     1],
        [    0, 13738,  2258,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 1, 1, 0, 0, 2, 2, 2, 2, 0, 2, 1, 1, 2, 2, 2, 2, 0, 0, 1, 0, 1,
        2, 1, 2, 0, 1, 0, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


58it [2:04:59, 129.09s/it][A

[DAN] step: in run_train()  58
[DAN] batch: in run_train()  [tensor([[    0,   705, 44780,  ...,     1,     1,     1],
        [    0,  1594,   110,  ...,     1,     1,     1],
        [    0, 34892, 31755,  ...,     1,     1,     1],
        ...,
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,  5771,   579,  ...,     1,     1,     1],
        [    0,   133,   731,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 2, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 1, 1, 0, 1, 0, 2, 0, 0, 2, 1,
        0, 2, 0, 2, 0, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


59it [2:07:07, 128.94s/it][A

[DAN] step: in run_train()  59
[DAN] batch: in run_train()  [tensor([[    0,   133,   275,  ...,     1,     1,     1],
        [    0,   448, 41054,  ...,     1,     1,     1],
        [    0, 19027,  7018,  ...,     1,     1,     1],
        ...,
        [    0,  1121,    10,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 1, 0, 2, 0, 1, 2, 0, 0, 1, 2, 0,
        2, 2, 2, 2, 2, 2, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


60it [2:09:17, 129.06s/it][A

[DAN] step: in run_train()  60
[DAN] batch: in run_train()  [tensor([[    0,  4763,    19,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1],
        [    0, 13424,  3792,  ...,     1,     1,     1],
        ...,
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0,  1594,   110,  ...,     1,     1,     1],
        [    0, 20770, 27415,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 1, 2, 2, 1, 1, 2, 0, 0, 2, 1, 2, 2, 1, 2, 1, 0, 1, 2, 0, 2, 1, 1,
        2, 0, 1, 2, 1, 1, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


61it [2:11:25, 128.92s/it][A

[DAN] step: in run_train()  61
[DAN] batch: in run_train()  [tensor([[    0, 13755,    89,  ...,     1,     1,     1],
        [    0, 33353,  1925,  ...,     1,     1,     1],
        [    0,   104, 19625,  ...,     1,     1,     1],
        ...,
        [    0, 46577,  7018,  ...,     1,     1,     1],
        [    0, 12861,   810,  ...,     1,     1,     1],
        [    0, 40948,    12,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 2, 2, 1, 0, 2, 2, 1, 0, 0, 0, 2, 2,
        0, 2, 2, 2, 2, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


62it [2:13:35, 129.06s/it][A

[DAN] step: in run_train()  62
[DAN] batch: in run_train()  [tensor([[    0, 30420, 46963,  ...,     1,     1,     1],
        [    0,   170,   218,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0,   133,   810,  ...,     1,     1,     1],
        [    0, 31626,  5895,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 1, 0, 2, 2, 1, 1, 0, 1, 2, 2, 2, 0, 1, 2, 1, 0, 1, 2, 2, 0, 2,
        0, 2, 2, 2, 2, 2, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


63it [2:15:43, 128.85s/it][A

[DAN] step: in run_train()  63
[DAN] batch: in run_train()  [tensor([[    0,   133,  6793,  ...,     1,     1,     1],
        [    0,   417,  3463,  ...,     1,     1,     1],
        [    0, 43872,     6,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 10836, 47379,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 1, 1, 0, 2, 0, 0, 2, 2, 1, 1, 1, 0,
        2, 1, 1, 0, 0, 2, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


64it [2:17:53, 129.13s/it][A

[DAN] step: in run_train()  64
[DAN] batch: in run_train()  [tensor([[    0, 10462,     6,  ...,     1,     1,     1],
        [    0, 10724, 21543,  ...,     1,     1,     1],
        [    0,   133,   775,  ...,     1,     1,     1],
        ...,
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 40948,    12,  ...,     1,     1,     1],
        [    0, 10836,  6247,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 2, 2, 2, 1, 0, 0, 0, 2, 1, 1, 2, 2, 0, 2, 1, 1, 2, 2, 2, 0, 2, 1, 1,
        0, 0, 0, 2, 1, 0, 0, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


65it [2:20:01, 128.81s/it][A

[DAN] step: in run_train()  65
[DAN] batch: in run_train()  [tensor([[    0, 44537, 46963,  ...,     1,     1,     1],
        [    0,  1620,    10,  ...,     1,     1,     1],
        [    0,   329,  3976,  ...,     1,     1,     1],
        ...,
        [    0,  1121,   298,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 36746, 26324,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1, 0, 0,
        2, 2, 1, 0, 1, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


66it [2:22:11, 129.03s/it][A

[DAN] step: in run_train()  66
[DAN] batch: in run_train()  [tensor([[    0, 46994, 47629,  ...,     1,     1,     1],
        [    0,  4539,    12,  ...,     1,     1,     1],
        [    0, 10836, 10029,  ...,     1,     1,     1],
        ...,
        [    0,  1594,    51,  ...,     1,     1,     1],
        [    0, 44537, 46963,  ...,     1,     1,     1],
        [    0,   170,   218,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 2, 2, 2, 0, 2, 1, 0, 2, 2, 0, 2, 1, 1, 2, 1, 1, 1, 0, 1, 0, 0,
        0, 2, 0, 0, 1, 1, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


67it [2:24:19, 128.88s/it][A

[DAN] step: in run_train()  67
[DAN] batch: in run_train()  [tensor([[    0,  2264,    32,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   104, 22210,  ...,     1,     1,     1],
        ...,
        [    0,  4539, 15610,  ...,     1,     1,     1],
        [    0,   250,   652,  ...,     1,     1,     1],
        [    0, 28285,  1484,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 1, 0, 0,
        2, 2, 2, 2, 2, 0, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


68it [2:26:29, 129.14s/it][A

[DAN] step: in run_train()  68
[DAN] batch: in run_train()  [tensor([[    0,  4763,   189,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        [    0, 15228,   261,  ...,     1,     1,     1],
        ...,
        [    0,   250,  8074,  ...,     1,     1,     1],
        [    0, 33120,  5842,  ...,     1,     1,     1],
        [    0,  8585,    16,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 1, 1, 2, 0, 1, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 1, 0, 1, 0, 2, 0, 0,
        0, 0, 2, 0, 0, 2, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


69it [2:28:37, 128.89s/it][A

[DAN] step: in run_train()  69
[DAN] batch: in run_train()  [tensor([[    0, 40022,  1253,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0,   100,   348,  ...,     1,     1,     1],
        ...,
        [    0,  1708,    23,  ...,     1,     1,     1],
        [    0,  5771,     5,  ...,     1,     1,     1],
        [    0,  6323,  7947,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 2, 1, 1, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 1, 2, 1, 0,
        1, 2, 2, 0, 0, 0, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


70it [2:30:47, 129.16s/it][A

[DAN] step: in run_train()  70
[DAN] batch: in run_train()  [tensor([[    0, 39762,    12,  ...,     1,     1,     1],
        [    0, 16883, 14251,  ...,     1,     1,     1],
        [    0,   846, 44780,  ...,     1,     1,     1],
        ...,
        [    0, 14696, 12081,  ...,     1,     1,     1],
        [    0,  6335, 43814,  ...,     1,     1,     1],
        [    0, 44537, 46963,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 1, 2, 0, 0, 1, 2, 0,
        2, 0, 2, 0, 0, 2, 2, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


71it [2:32:55, 128.96s/it][A

[DAN] step: in run_train()  71
[DAN] batch: in run_train()  [tensor([[    0, 46000,    33,  ...,     1,     1,     1],
        [    0, 28565, 11281,  ...,     1,     1,     1],
        [    0, 36949, 17683,  ...,     1,     1,     1],
        ...,
        [    0, 44537, 46963,  ...,     1,     1,     1],
        [    0, 14696, 12081,  ...,     1,     1,     1],
        [    0,   250,   650,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([1, 1, 0, 1, 0, 1, 2, 1, 2, 0, 2, 2, 0, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 2,
        0, 2, 1, 2, 2, 0, 2, 0]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


72it [2:35:05, 129.17s/it][A

[DAN] step: in run_train()  72
[DAN] batch: in run_train()  [tensor([[    0, 32663,    14,  ...,     1,     1,     1],
        [    0,   771,  4526,  ...,     1,     1,     1],
        [    0,   250,   652,  ...,     1,     1,     1],
        ...,
        [    0, 10836,   525,  ...,     1,     1,     1],
        [    0,  9089, 42286,  ...,     1,     1,     1],
        [    0,  6276,  5895,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 0, 1, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 1,
        1, 1, 0, 2, 2, 1, 0, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


73it [2:37:14, 129.03s/it][A

[DAN] step: in run_train()  73
[DAN] batch: in run_train()  [tensor([[    0,  1185,   189,  ...,     1,     1,     1],
        [    0,   347,  1417,  ...,     1,     1,     1],
        [    0,   705, 44780,  ...,     1,     1,     1],
        ...,
        [    0, 37668, 20650,  ...,     1,     1,     1],
        [    0,   133, 34377,  ...,     1,     1,     1],
        [    0,  1106,    47,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 2, 1, 1, 2, 2, 2, 0, 0, 0, 2, 1, 0, 1, 0, 0, 2, 0, 2, 1, 2, 0, 2,
        0, 2, 2, 0, 1, 1, 1, 2]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     


74it [2:39:23, 129.12s/it][A

[DAN] step: in run_train()  74
[DAN] batch: in run_train()  [tensor([[    0,   250,  6247,  ...,     1,     1,     1],
        [    0, 28565,  7018,  ...,     1,     1,     1],
        [    0, 10836, 28793,  ...,     1,     1,     1],
        ...,
        [    0,   846, 44780,  ...,     1,     1,     1],
        [    0,   133, 11801,  ...,     1,     1,     1],
        [    0,   448, 40981,  ...,     1,     1,     1]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 1,
        0, 0, 1, 1, 1, 0, 2, 1]), tensor([[0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
        [0.3000, 0.3000, 0.3000],
     

In [None]:
args.output_dir

In [None]:
mkdir(args.output_dir)

In [None]:
# Replace the configured batch size with the per-step size: the original value
# integer-divided by the number of gradient accumulation steps.
# NOTE(review): presumably gradients are accumulated over that many steps so the
# effective batch size stays at the configured value — confirm in the training loop.
# The assignment overwrites args.train_batch_size in place, so re-running this
# cell divides the value again.
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
print(args.train_batch_size)

### 궁금증. train_batch_size(32로 설정)을 왜 gradient_accumulation_steps로 나눈 정수로 업데이트할까?

In [None]:
args.output_dir

In [None]:
writer = SummaryWriter(os.path.join(args.output_dir, 'events'))
writer

### 궁금증. outputs_healthver/events란 폴더 내 많은 이진파일들을 왜 저장할까?

In [None]:
cache_dir = args.cache_dir
cache_dir

### 궁금증. cache_dir에는 어떤 데이터를 저장할까?

In [None]:
n_gpu = torch.cuda.device_count()
n_gpu

In [None]:
# Seed Python's `random`, NumPy and PyTorch with the same value (args.seed)
# so that random draws are repeatable across runs.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

### 궁금증. manual_seed()는 무엇일까?

In [None]:
if n_gpu > 0:
    torch.cuda.manual_seed_all(args.seed)

In [None]:
save_code_log_path = args.output_dir
save_code_log_path

In [None]:
# Send log records both to <save_code_log_path>/output.log and to the console.
log_file_handler = logging.FileHandler("{0}/{1}.log".format(save_code_log_path, 'output'))
console_handler = logging.StreamHandler()
logging.basicConfig(
    format='%(message)s',
    datefmt='%m/%d/%Y %H:%M',
    level=logging.INFO,
    handlers=[log_file_handler, console_handler],
)

# Record the run configuration: parsed arguments, command line, device and paths.
logger.info(args)
command_line = ' '.join(sys.argv)
logger.info("Command is: %s" % command_line)
logger.info("Device: {}, n_GPU: {}".format(device, n_gpu))
logger.info("Datasets are loaded from {}\nOutputs will be saved to {}\n".format(args.data_dir, args.output_dir))

In [None]:
sys.argv

In [None]:
#processor = DataProcesser()

In [None]:
class DataProcessor(object):
    """Load claim-verification examples from CSV files.

    Every CSV row must provide the columns read in ``_create_examples``:
    ``id``, ``claim``, ``question``, ``evidence`` and ``label``.
    """

    def get_examples(self, data_dir, dataset=None):
        """Read ``<data_dir>/<dataset>.csv`` and return its rows as examples.

        Args:
            data_dir: Directory that contains the CSV file.
            dataset: File name without the ``.csv`` extension.

        Returns:
            A list of ``InputExample`` objects, one per CSV row.
        """
        logger.info('Get examples from: {}.csv'.format(dataset))
        csv_path = os.path.join(data_dir, "{}.csv".format(dataset))
        return self._create_examples(self._read_csv(csv_path))

    def get_labels(self):
        """Return ``(label_ids, number_of_labels)``."""
        label_ids = [0, 1, 2]
        return label_ids, len(label_ids)

    @classmethod
    def _read_csv(cls, input_file):
        """Return the rows of ``input_file`` as a list of ``pandas.Series``."""
        data = pd.read_csv(input_file)
        return [row for _, row in data.iterrows()]

    def _create_examples(self, lines, max_evidences=5):
        """Convert CSV rows into ``InputExample`` objects.

        Args:
            lines: Iterable of row mappings (as returned by ``_read_csv``).
            max_evidences: Currently unused; kept so existing callers that
                pass it keep working.

        Returns:
            A list of ``InputExample`` objects in the same order as ``lines``.

        Raises:
            KeyError: If a row's ``label`` is not a key of ``LABELS`` or a
                required column is missing.
        """
        examples = []
        for datapoint in tqdm(lines):
            # NOTE(review): question and evidence are concatenated without a
            # separator, so the last word of the question is glued to the
            # first word of the evidence. Confirm whether that is intended.
            text_b = datapoint['question'] + datapoint['evidence']
            # The TF-IDF based prior (get_priori with a TfIdfSimilarity
            # instance) is disabled; every example gets this constant prior.
            # NOTE(review): the values sum to 0.9, not 1.0, although the
            # training loop passes them to KLDivLoss as the target. Confirm.
            priori = [0.3, 0.3, 0.3]
            examples.append(InputExample(idx=datapoint['id'],
                                         text_a=datapoint['claim'],
                                         text_b=text_b,
                                         label=LABELS[datapoint['label']],
                                         priori=priori))
        return examples

In [None]:
processor = DataProcessor()
processor

In [None]:
print(args.bert_model)
print(args.do_lower_case)

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
tokenizer

In [None]:
print(args.load_dir)

In [None]:
# Prefer an explicit checkpoint directory; otherwise fall back to the
# pretrained model directory.
load_dir = args.load_dir or args.bert_model
logger.info('Model is loaded from %s' % load_dir)
# get_labels() returns a (label_ids, number_of_labels) tuple. Unpack it so
# label_list is the plain list of label ids rather than the whole tuple.
label_list, _ = processor.get_labels()

print(load_dir)
print(label_list)

In [None]:
config = RobertaConfig.from_json_file(os.path.join(args.bert_model,'config.json'))
config

In [None]:
# 7초
model = RobertaMoEForSequenceClassification(config, num_public_layers=12, num_experts=3, num_labels=3, num_gate_layer=2)
model

In [None]:
print(args.bert_model)

In [None]:
model.load_roberta(args.bert_model)

In [None]:
# Only restore fine-tuned weights when a checkpoint directory was given;
# otherwise the model keeps the weights loaded by load_roberta().
if args.load_dir:
    # map_location="cpu": the model is still on the CPU here (it is moved with
    # model.to(device) afterwards), and this lets a checkpoint that was saved
    # on a GPU machine load on a CPU-only one.
    checkpoint_path = os.path.join(load_dir, 'pytorch_model.bin')
    model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
    print('parameters loaded successfully.')

### 궁금증. 이 if문은 왜 적용되지 않았을까? if문 내부는 무엇을 동작할까?

In [None]:
model.to(device)

In [None]:
# Wrap the model in DataParallel when more than one GPU is visible.
# NOTE(review): device_ids is hard-coded to [0, 1], so with three or more GPUs
# only devices 0 and 1 are used. Pass list(range(n_gpu)) (or omit device_ids)
# to use every visible device.
if n_gpu > 1:
    model = torch.nn.DataParallel(model,device_ids=[0, 1])

### 궁금증. GPU가 3개면 id는 어떻게 할당될까?

In [None]:
#if args.do_train:
#    run_train(device, processor, tokenizer, model, writer, phase="train")

print(args.do_train)

# run_train() 파헤치기

In [None]:
# 선언 : def run_train(device, processor, tokenizer, model, writer, phase="train")
print(device)
print(processor)
print(tokenizer)
print(model)
print(writer)#, phase="train")

### get_dataLoader() 파헤치기

In [None]:
#tr_dataloader, tr_num_steps, tr_examples = get_dataLoader(args, processor, tokenizer, phase="train")
dataset_dict = {"train": args.train_set, "dev": args.dev_set, "test": args.test_set}
dataset_dict

In [None]:
#label_list, _ = processor.get_labels()
processor.get_labels()

In [None]:
label_list, _ = processor.get_labels()

In [None]:
args.data_dir

In [None]:
phase="train"
dataset_dict[phase]

In [None]:
examples = processor.get_examples(args.data_dir, dataset_dict[phase])
examples

In [None]:
print(len(examples))

# 학습데이터 샘플보기
### LABELS = {"Supports":0, "Refutes":1, "Neutral":2}

In [None]:
one_example = examples[0]
print(one_example.idx)
pprint(one_example.text_a) #response(claim)
pprint(one_example.text_b) #question + evidence
print(one_example.label)
print(one_example.priori)

In [None]:
one_example = examples[1]
print(one_example.idx)
pprint(one_example.text_a)
pprint(one_example.text_b)
print(one_example.label)
print(one_example.priori)

In [None]:
one_example = examples[2]
print(one_example.idx)
pprint(one_example.text_a) #R
pprint(one_example.text_b) #Q+E
print(one_example.label)
print(one_example.priori)

### convert_examples_to_features() 파헤치기

In [None]:
#이 함수는 get_dataLoader()에서 한 번만 사용됨
#선언 : def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
print(examples)
print(label_list)
print(args.max_seq_length)
print(tokenizer)

In [None]:
label_map = {label: i for i, label in enumerate(label_list)}
label_map

### examples(10590건) 중에서 첫행만 샘플로 확인

In [None]:
max_seq_length = args.max_seq_length
features = []

# Walk through the conversion on the very first example only.
samples = examples[:1]

for ex_index, example in enumerate(tqdm(samples, desc="convert to features")):
    logger.info("\n")
    logger.info(">>>>>ex_index: %d" % (ex_index))
    label_id = label_map[example.label]

    # text_a is the claim (response); text_b is question + evidence.
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)
    # Log the raw inputs before truncation so the untruncated tokens are visible.
    for logged_item in (label_id, example.text_a, tokens_a, example.text_b, tokens_b):
        logger.info(logged_item)

    # Three positions are reserved for the special tokens <s>, </s>, </s>.
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

    tokens = ["<s>"] + tokens_a + ["</s>"] + tokens_b + ["</s>"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Right-pad everything to max_seq_length. The ids are padded with 1
    # (presumably the tokenizer's <pad> id - TODO confirm); the mask with 0.
    # NOTE(review): segment_ids are padded with the same value 1, which is
    # also the id given to the text_b segment.
    pad_len = max_seq_length - len(input_ids)
    padding = [1] * pad_len
    input_mask = [1] * len(input_ids) + [0] * pad_len
    input_ids += padding
    segment_ids += padding
    for padded_sequence in (input_ids, input_mask, segment_ids):
        assert len(padded_sequence) == max_seq_length

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            priori=example.priori)
    features.append(feature)

    logger.info("*** Example ***")
    logger.info("tokens: %s" % " ".join(map(str, tokens)))
    logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
    logger.info("input_mask: %s" % " ".join(map(str, input_mask)))
    logger.info("segment_ids: %s" % " ".join(map(str, segment_ids)))
    logger.info("label: %s (id = %d)" % (example.label, label_id))



### 제대로 돌리기(convert examples into features)

In [None]:
max_seq_length = args.max_seq_length
features = []

# To try this on a CPU, shrink the example list to a single batch (32 examples).
examples = examples[:32]

for ex_index, example in enumerate(tqdm(examples, desc="convert to features")):
    # Detailed logging is emitted for the first sample only.
    is_first_sample = ex_index < 1
    label_id = label_map[example.label]

    # text_a is the claim (response); text_b is question + evidence.
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)
    if is_first_sample:
        for logged_item in (label_id, example.text_a, tokens_a, example.text_b, tokens_b):
            logger.info(logged_item)

    # Three positions are reserved for the special tokens <s>, </s>, </s>.
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

    tokens = ["<s>"] + tokens_a + ["</s>"] + tokens_b + ["</s>"]
    segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Right-pad everything to max_seq_length. The ids are padded with 1
    # (presumably the tokenizer's <pad> id - TODO confirm); the mask with 0.
    # NOTE(review): segment_ids are padded with the same value 1, which is
    # also the id given to the text_b segment.
    pad_len = max_seq_length - len(input_ids)
    padding = [1] * pad_len
    input_mask = [1] * len(input_ids) + [0] * pad_len
    input_ids += padding
    segment_ids += padding
    for padded_sequence in (input_ids, input_mask, segment_ids):
        assert len(padded_sequence) == max_seq_length

    feature = InputFeatures(input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label_id,
                            priori=example.priori)
    features.append(feature)

    if is_first_sample:
        logger.info("*** Example ***")
        logger.info("tokens: %s" % " ".join(map(str, tokens)))
        logger.info("input_ids: %s" % " ".join(map(str, input_ids)))
        logger.info("input_mask: %s" % " ".join(map(str, input_mask)))
        logger.info("segment_ids: %s" % " ".join(map(str, segment_ids)))
        logger.info("label: %s (id = %d)" % (example.label, label_id))

In [None]:
features

In [None]:
len(features)

### 어떻게 매핑(문자열 → 토큰 ID)되었는지 탐색

In [None]:
print(len(examples[0].text_a))
print(len(examples[0].text_b))
print(len(features[0].input_ids)) #max_length

In [None]:
print(features[0].input_ids)

### features를 다시 get_dataLoader()에게 반환

In [None]:
features

In [None]:
batch_size = args.train_batch_size if phase == "train" else args.eval_batch_size
batch_size

In [None]:
epoch_num = args.num_train_epochs if phase == "train" else 1
epoch_num

In [None]:
# Total number of optimizer updates: batches per epoch
# (len(examples) / batch_size), divided by the number of batches accumulated
# before each update, truncated to an int, then multiplied by the epoch count.
num_optimization_steps = int(len(examples) / batch_size / args.gradient_accumulation_steps) * epoch_num
num_optimization_steps

### 궁금증. 어떻게 최적화 횟수가 이렇게 계산될까? int(10590/32/??)*1

In [None]:
logger.info("Examples#: {}, Batch size: {}".format(len(examples), batch_size * args.gradient_accumulation_steps))
logger.info("Total num of steps#: {}, Total num of epoch#: {}".format(num_optimization_steps, epoch_num))


In [None]:
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
all_priori = torch.tensor([f.priori for f in features], dtype=torch.float)


In [None]:
print(all_input_ids.shape)
print(all_input_mask.shape)
print(all_segment_ids.shape)
print(all_label_ids.shape)
print(all_priori.shape)

In [None]:
all_priori

In [None]:
# Bundle the per-example tensors; each dataset item is the 4-tuple
# (input_ids, input_mask, label_id, priori).
# NOTE(review): all_segment_ids is built above but is not part of the dataset.
all_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_priori)
all_data

In [None]:
# Log which branch of the original if/else applies, then pick the sampler:
# shuffle only for the training phase when do_train_eval is off; otherwise
# iterate in order.
logger.info('if' if args.do_train_eval else 'else')
shuffle_examples = (not args.do_train_eval) and phase == "train"
sampler_class = RandomSampler if shuffle_examples else SequentialSampler
sampler = sampler_class(all_data)
sampler

##### sampler가 RandomSampler 객체이면, else 분기에서 phase == "train" 조건이 적용된 것으로 추측할 수 있음

In [None]:
dataloader = DataLoader(all_data, sampler=sampler, batch_size=batch_size)
dataloader

##### get_dataLoader()가 run_train()으로 최종반환하는 것 : return dataloader, num_optimization_steps, examples

In [None]:
print(dataloader) #dataloader 객체에 features가 다 저장되어있음

In [None]:
print(num_optimization_steps)

In [None]:
print(examples)

In [None]:
# run_train()에서 필요한 세 변수 정의
tr_dataloader, tr_num_steps, tr_examples = dataloader, num_optimization_steps, examples

### 다시 run_train()으로 (get_dataLoader() 종료 후)

In [None]:
model.train()

In [None]:
loss_fct = torch.nn.KLDivLoss(reduction='batchmean')
loss_fct

In [None]:
model.named_parameters()

In [None]:
param_optimizer = list(model.named_parameters())
param_optimizer

In [None]:
# Split the parameters into two groups: those whose name contains one of the
# `no_decay` substrings get weight_decay 0.0, all others get 0.01.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
decayed_params = []
undecayed_params = []
for param_name, param in param_optimizer:
    if any(marker in param_name for marker in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)
optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': 0.01},
    {'params': undecayed_params, 'weight_decay': 0.0},
]
optimizer_grouped_parameters

In [None]:
optimizer_grouped_parameters[0]

In [None]:
optimizer_grouped_parameters[1]

In [None]:
optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=tr_num_steps)
optimizer

In [None]:
optimizer.zero_grad()
optimizer

In [None]:
global_step = 0
best_acc = 0.0
n_gpu = torch.cuda.device_count()
n_gpu

### run_train() 학습 for문 진입

In [None]:
args.num_train_epochs

In [None]:
len(tr_dataloader)

##### examples 수를 배치사이즈인 32로 맞춘 후, tr_dataloader의 길이(배치 수)가 1인 경우로 만들어서 for문 안을 파헤치기

In [None]:
# When True, leave the batch loop after the first batch of each epoch so a
# single training step can be inspected. Set to False to run every batch,
# including the periodic dev evaluation at the end of the loop body.
INSPECT_FIRST_STEP_ONLY = True


def _log_tensor(header, tensor):
    """Log ``header``, then the tensor's shape, then the tensor itself."""
    logger.info(header)
    logger.info(tensor.shape)
    logger.info(tensor)


for ep in trange(args.num_train_epochs, desc="Training"):
    # Wrap the dataloader itself (not the enumerate object) so tqdm can read
    # its length and display real progress.
    for step, batch in enumerate(tqdm(tr_dataloader)):
        logger.info(">>>>> step : ")
        logger.info(step)
        logger.info(">>>>> length of items in batch : ")
        logger.info(len(batch))

        # Each batch is the 4-tuple stored in the TensorDataset.
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, label_ids, priori = batch
        _log_tensor(">>>>> input_ids : ", input_ids)
        _log_tensor(">>>>> input_mask : ", input_mask)
        _log_tensor(">>>>> label_ids : ", label_ids)
        _log_tensor(">>>>> priori : ", priori)

        logits, loss, final_out_logits, origin_gates = model(input_ids=input_ids, attention_mask=input_mask, labels=label_ids)
        logger.info(">>>>> logits : ")
        logger.info(len(logits))
        logger.info(logits[0].shape)
        logger.info(logits)
        _log_tensor(">>>>> loss : ", loss)
        _log_tensor(">>>>> final_out_logits : ", final_out_logits)
        _log_tensor(">>>>> origin_gates : ", origin_gates)

        # Guide loss: KL divergence between the log-softmax of the gate
        # outputs and the per-example prior, weighted by args.lmd.
        guide_loss = loss_fct(torch.nn.functional.log_softmax(origin_gates, dim=1), priori)
        loss += args.lmd * guide_loss
        logger.info(">>>>> guide_loss : ")
        logger.info(guide_loss)
        logger.info(">>>>> loss : ")
        logger.info(loss)

        if n_gpu > 1:
            # Under DataParallel, reduce the loss to a scalar.
            loss = loss.mean()
        if args.gradient_accumulation_steps > 1:
            # Scale so the accumulated gradient matches one full-size batch.
            loss = loss / args.gradient_accumulation_steps
        # Recording to TensorboardX is disabled in this walkthrough:
        # writer.add_scalar('{}/loss'.format(phase), loss.item(), global_step)

        logger.info(">>>>> loss mean : ")
        logger.info(loss)

        loss.backward()

        logger.info(">>>>> loss after loss.backward(): ")
        logger.info(loss)

        del loss

        logger.info(">>>>> step: ")
        logger.info(step)
        logger.info(">>>>> args.gradient_accumulation_steps: ")
        logger.info(args.gradient_accumulation_steps)
        # Update the weights only once every gradient_accumulation_steps batches.
        if (step + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if INSPECT_FIRST_STEP_ONLY:
            # This exits the batch loop only; the epoch loop still continues.
            break

        model.eval()
        torch.set_grad_enabled(False)

        # Evaluate on the dev set right after an optimizer update whenever
        # global_step is a multiple of args.period, and on the very first step.
        if args.do_eval and (((step + 1) % args.gradient_accumulation_steps == 0 and global_step % args.period == 0) or (ep==0 and step==0)):
            # Unwrap DataParallel (if present) before saving.
            model_to_save = model.module if hasattr(model, 'module') else model

            dev_acc, dev_recall = run_eval(device, processor, tokenizer, model, writer, global_step, tensorboard=True,
                                phase="dev")
            if dev_acc > best_acc:
                best_acc = dev_acc
                logger.info(">> Save model. Best acc: {:.4}. Epoch {}".format(best_acc, ep))
                save_model(model_to_save)  # save model
                logger.info(">> Now the best acc is {:.4}\n, recall is {:.4}".format(dev_acc, dev_recall))

        model.train()
        torch.set_grad_enabled(True)

In [None]:
# run_train() 반환 값: return global_step
global_step