In [1]:
from __future__ import absolute_import, division, print_function


import pdb
import argparse
import glob
import logging

import os
import pickle
import random

import numpy as np
import torch
import torch.nn.init as init
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from collections import defaultdict

# from azure.cosmosdb.table.tableservice import TableService
# from azure.cosmosdb.table.models import Entity
from datetime import datetime



from func import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
                                  BertConfig, BertForLatentConnector, BertTokenizer,
                                  GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer,
                                  OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                                  RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)

from utils import (weight_init, calc_iwnll, calc_rec, calc_mi, calc_au, BucketingDataLoader, TextDataset_Split, TextDataset_2Tokenizers, frange_cycle_linear, frange_cycle_zero_linear)


from modules import VAE


# logging.getLogger("azure").setLevel(logging.WARNING)
# logging.getLogger("TableService").setLevel(logging.WARNING)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# Registry mapping a lower-case model-type key to its
# (config class, model class, tokenizer class) triple.
# load_checkpoint() indexes it with args.encoder_model_type and
# args.decoder_model_type, so those options must be one of these keys.
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForLatentConnector, BertTokenizer),
    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
}

    

def load_and_cache_examples(args, tokenizer, evaluate=False):
    """Construct the dataset object for the training or evaluation file.

    A list of tokenizers selects ``TextDataset_2Tokenizers``; anything else
    selects ``TextDataset_Split``.  ``evaluate`` chooses
    ``args.eval_data_file`` over ``args.train_data_file``.
    """
    # Pick the dataset class first, then resolve the path it should read.
    dataset_cls = TextDataset_2Tokenizers if isinstance(tokenizer, list) else TextDataset_Split
    source_path = args.eval_data_file if evaluate else args.train_data_file
    return dataset_cls(tokenizer, args, file_path=source_path, block_size=args.block_size)

def build_dataload_and_cache_examples(args, tokenizer, evaluate=False):
    """Build a ``BucketingDataLoader`` over the training or evaluation file.

    Args:
        args: options namespace.  Reads ``per_gpu_train_batch_size`` /
            ``per_gpu_eval_batch_size``, ``n_gpu``, ``train_data_file`` /
            ``eval_data_file`` and ``max_seq_length``; writes
            ``args.batch_size`` as a side effect.
        tokenizer: must be a list of tokenizers (the callers in this file
            pass ``[encoder_tokenizer, decoder_tokenizer]``).
        evaluate: when true, use the evaluation file and eval batch size.

    Returns:
        The constructed ``BucketingDataLoader``.

    Raises:
        NotImplementedError: if ``tokenizer`` is not a list.  The original
            code fell through to ``return dataloader`` with the name unbound,
            which surfaced as a confusing ``UnboundLocalError``.
    """
    if not isinstance(tokenizer, list):
        raise NotImplementedError(
            "build_dataload_and_cache_examples only supports a list of tokenizers, got {}".format(
                type(tokenizer).__name__))

    if evaluate:
        args.batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
        file_path = args.eval_data_file
    else:
        args.batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        file_path = args.train_data_file

    # NOTE(review): shuffle=True is also used for evaluation, as in the
    # original code — confirm this is intended.
    return BucketingDataLoader(file_path, args.batch_size, args.max_seq_length, tokenizer, args,
                               bucket=100, shuffle=True)




def set_seed(args):
    """Seed the Python, NumPy and PyTorch RNGs from ``args.seed``.

    When ``args.n_gpu > 0`` the CUDA generators of all devices are seeded too.
    """
    seed_value = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    if not args.n_gpu > 0:
        return
    torch.cuda.manual_seed_all(seed_value)


def mask_tokens(inputs, tokenizer, args):
    """Prepare masked inputs/labels for masked language modeling.

    Each position is selected with probability ``args.mlm_probability``.
    Of the selected positions, 80% are replaced by the tokenizer's mask
    token, 10% by a random token id and 10% are left unchanged.

    Args:
        inputs: integer tensor of token ids.  It is modified **in place**.
        tokenizer: object exposing ``mask_token``, ``convert_tokens_to_ids``
            and ``len()`` (vocabulary size).
        args: namespace providing ``mlm_probability``.

    Returns:
        ``(inputs, labels)`` where ``labels`` holds the original token id at
        every selected position and ``-1`` everywhere else, so that a loss
        ignoring ``-1`` is only computed on the selected tokens.
    """
    labels = inputs.clone()
    shape = labels.shape

    def _bernoulli_mask(prob):
        # Boolean masks: indexing with uint8 tensors is deprecated in PyTorch.
        return torch.bernoulli(torch.full(shape, prob, dtype=torch.float)).bool()

    # Sample the positions that take part in masked-LM training.
    masked_indices = _bernoulli_mask(args.mlm_probability)
    # Only compute loss on the selected tokens: everything else is ignored.
    # (The original assigned -1 to the *selected* positions, i.e. inverted.)
    labels[~masked_indices] = -1

    # 80% of the time, replace the selected token with the mask token.
    indices_replaced = _bernoulli_mask(0.8) & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time (half of the remaining 20%), use a random token id.
    indices_random = _bernoulli_mask(0.5) & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The remaining 10% of selected tokens are kept unchanged.
    return inputs, labels

def weights_init_rondom(model):
    """Re-initialise the encoder tensors of ``model`` from a standard normal.

    Every floating-point entry of the state dict whose key contains
    ``'encoder'`` is overwritten in place with N(0, 1) samples; all other
    entries are left untouched.

    Args:
        model: a ``torch.nn.Module``; if it has a ``module`` attribute
            (DataParallel / DistributedDataParallel wrapper) the wrapped
            module is used.
    """
    model = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    for key, tensor in model.state_dict().items():
        # state_dict tensors share storage with the live parameters, so the
        # in-place init below updates the model.  Integer buffers cannot be
        # sampled from a normal distribution and are skipped.
        if 'encoder' in key and tensor.is_floating_point():
            init.normal_(tensor.data)

def _save_pretrained_with_args(model_to_save, output_dir, args_filename, args, retry):
    """Write one sub-model and the training args into ``output_dir``.

    With ``retry`` set, a failed attempt is logged and repeated until it
    succeeds (the best-effort behaviour the Philly branch always had);
    otherwise the exception propagates.
    """
    attempt = 0
    while True:
        attempt += 1
        try:
            model_to_save.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, args_filename))
            return
        except Exception:
            # ``Exception`` (not a bare except) so Ctrl-C can still stop the loop.
            if not retry:
                raise
            logger.warning("Saving checkpoint to %s failed (attempt %d); retrying",
                           output_dir, attempt, exc_info=True)


def save_checkpoint(model_vae, optimizer, global_step, args):
    """Save the encoder and decoder of ``model_vae`` as separate checkpoints.

    The sub-models are written with ``save_pretrained()`` into
    ``<args.output_dir>/checkpoint-encoder-<global_step>`` and
    ``<args.output_dir>/checkpoint-decoder-<global_step>``, each together with
    the training arguments (``training_encoder_args.bin`` /
    ``training_decoder_args.bin``).

    Args:
        model_vae: model exposing ``encoder`` and ``decoder``; a
            DataParallel/DDP wrapper (anything with ``module``) is unwrapped.
        optimizer: unused; kept for interface compatibility.
        global_step: step number used in the directory names.
        args: options namespace; reads ``output_dir``, ``local_rank`` and
            ``use_philly`` and is itself serialised with ``torch.save``.
    """
    # Take care of distributed/parallel training.
    model_core = model_vae.module if hasattr(model_vae, 'module') else model_vae

    output_encoder_dir = os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(global_step))
    output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))

    # Only the main process creates the output directories.
    if args.local_rank in [-1, 0]:
        os.makedirs(output_encoder_dir, exist_ok=True)
        os.makedirs(output_decoder_dir, exist_ok=True)

    logger.info("Saving encoder model checkpoint to %s", output_encoder_dir)
    logger.info("Saving decoder model checkpoint to %s", output_decoder_dir)

    # Good practice: save the training arguments together with the model.
    # The decoder args file is now named consistently in both branches
    # (the non-Philly branch used to write 'training_encoder_args.bin').
    _save_pretrained_with_args(model_core.encoder, output_encoder_dir,
                               'training_encoder_args.bin', args, retry=args.use_philly)
    _save_pretrained_with_args(model_core.decoder, output_decoder_dir,
                               'training_decoder_args.bin', args, retry=args.use_philly)

    # save the full model and optmizer into a checkpoint
    # model_to_save = model_vae.module if hasattr(model_vae, 'module') else model_vae  # Take care of distributed/parallel training

    # checkpoint = {
    # 'iter': global_step,
    # 'model_state_dict': model_to_save.state_dict(),
    # 'optimizer_state_dict': optimizer.state_dict(),
    # 'beta': model_to_save.args.beta,
    # 'args': args
    # }

    # output_full_dir = os.path.join(args.output_dir, 'checkpoint-full-{}'.format(global_step))
    # if not os.path.exists(output_full_dir) and args.local_rank in [-1, 0]:
    #     os.makedirs(output_full_dir)

    # logger.info("Start saving full model checkpoint to %s", output_full_dir)
    # if args.use_philly:
    #     save_solid = False
    #     n_save_attempts = 0
    #     while not save_solid:
    #         try:
    #             n_save_attempts += 1
    #             logger.info(f"Saving full checkpoint: {n_save_attempts} attempts made")
    #             torch.save(checkpoint, os.path.join(output_full_dir, 'training.bin'))
    #             logger.info("Saving full checkpoint to %s,", output_full_dir)
    #             save_solid = True
    #         except:
    #             pass
    # else:
    #     torch.save(checkpoint, os.path.join(output_full_dir, 'training.bin'))
    #     logger.info("Saving full checkpoint to %s", output_full_dir)



def load_checkpoint(args, loading_step=None):
    """Load the encoder, decoder, tokenizers and full training checkpoint.

    Args:
        args: options namespace.  ``encoder_model_type`` and
            ``decoder_model_type`` are lower-cased in place and
            ``block_size`` is clamped in place to what both tokenizers
            support.  Also reads ``checkpoint_dir``, ``gloabl_step_eval``,
            ``latent_size``, ``device``, ``do_lower_case`` and the tokenizer
            / model name options.
        loading_step: checkpoint step to load.  When ``None`` the step in
            ``args.gloabl_step_eval`` is used.  (The original ignored this
            parameter: both branches read ``args.gloabl_step_eval``.)

    Returns:
        ``(model_encoder, tokenizer_encoder, model_decoder,
        tokenizer_decoder, checkpoint)`` where ``checkpoint`` is whatever
        ``torch.load`` returns for ``checkpoint-full-<step>/training.bin``.
        The original returned ``None`` and discarded everything it loaded.
    """
    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    global_step = loading_step if loading_step is not None else args.gloabl_step_eval

    output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
    output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
    output_full_dir = os.path.join(args.checkpoint_dir, 'checkpoint-full-{}'.format(global_step))

    checkpoints = [[output_encoder_dir, output_decoder_dir]]
    logger.info("Evaluate the following checkpoints: %s", checkpoints)

    # Load a trained Encoder model and vocabulary
    _, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
    model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
    tokenizer_encoder = encoder_tokenizer_class.from_pretrained(
        args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path,
        do_lower_case=args.do_lower_case)
    model_encoder.to(args.device)
    if args.block_size <= 0:
        # Our input block size will be the max possible for the model
        args.block_size = tokenizer_encoder.max_len_single_sentence
    args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

    # Load a trained Decoder model and vocabulary
    _, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
    model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(
        args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path,
        do_lower_case=args.do_lower_case)
    model_decoder.to(args.device)
    if args.block_size <= 0:
        # Our input block size will be the max possible for the model
        args.block_size = tokenizer_decoder.max_len_single_sentence
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

    # Load full model
    checkpoint = torch.load(os.path.join(output_full_dir, 'training.bin'))

    return model_encoder, tokenizer_encoder, model_decoder, tokenizer_decoder, checkpoint




def train(args, train_dataloader, model_vae, encoder_tokenizer, decoder_tokenizer, table_name):
    """Train the VAE with a per-iteration KL weight (beta) schedule.

    Args:
        args: options namespace.  ``args.train_batch_size`` is written, and
            ``args.num_train_epochs`` is overwritten when
            ``args.max_steps > 0``.
        train_dataloader: iterable of
            ``(tokenized_text0, tokenized_text1, tokenized_text_lengths)``
            batches supporting ``len()`` and exposing ``num_examples``.
        model_vae: model called as ``model_vae(inputs, labels)`` and
            returning ``(loss_rec, loss_kl, loss)``; it carries an ``args``
            namespace whose ``beta`` and ``fb_mode`` are updated every step.
        encoder_tokenizer: tokenizer used for masking when ``args.mlm`` is set
            and forwarded to ``evaluate``.
        decoder_tokenizer: forwarded to ``evaluate``.
        table_name: forwarded to ``evaluate``.

    Returns:
        ``(global_step, average training loss per optimisation step,
        optimizer)``.  The average is computed over at least one step so a
        run without any optimisation step does not divide by zero.
    """
    is_main_process = args.local_rank in [-1, 0]
    if is_main_process:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model_vae.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model_vae.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model_vae, optimizer = amp.initialize(model_vae, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_vae = torch.nn.DataParallel(model_vae, device_ids=range(args.n_gpu)).to(args.device)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_vae = torch.nn.parallel.DistributedDataParallel(model_vae, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # ``args`` (beta, fb_mode) lives on the underlying VAE; the
    # DataParallel/DDP wrappers do not forward arbitrary attributes.
    vae_core = model_vae.module if hasattr(model_vae, 'module') else model_vae

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_dataloader.num_examples)
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model_vae.zero_grad()

    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=not is_main_process)

    # One beta value per training iteration (batch), across all epochs.
    n_iter = int(args.num_train_epochs) * len(train_dataloader)
    beta_t_list = frange_cycle_zero_linear(n_iter, start=0.0, stop=args.beta,  n_cycle=1, ratio_increase=args.ratio_increase, ratio_zero=args.ratio_zero)

    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not is_main_process)
        iters_per_epoch = len(epoch_iterator)
        for step, batch in enumerate(epoch_iterator):
            global_iter = step + epoch * iters_per_epoch

            tokenized_text0, tokenized_text1, tokenized_text_lengths = batch

            # Encoder input (optionally masked); the reconstruction target is
            # always the decoder-tokenized text, whatever mask_tokens returns.
            if args.mlm:
                inputs, _ = mask_tokens(tokenized_text0, encoder_tokenizer, args)
            else:
                inputs = tokenized_text0
            inputs = inputs.to(args.device)
            labels = tokenized_text1.to(args.device)

            model_vae.train()

            beta_t = beta_t_list[global_iter]
            vae_core.args.beta = beta_t

            # fb_mode 0 while beta is exactly zero, 1 afterwards; 2 overrides
            # both when deterministic connections are requested.
            vae_core.args.fb_mode = 0 if beta_t == 0.0 else 1
            if args.use_deterministic_connect:
                vae_core.args.fb_mode = 2

            loss_rec, loss_kl, loss = model_vae(inputs, labels)

            if args.n_gpu > 1:
                loss_rec = loss_rec.mean()  # mean() to average on multi-gpu parallel training
                loss_kl = loss_kl.mean()
                loss = loss.mean()

            if args.use_philly:
                print("PROGRESS: {}%".format(round(100 * global_iter / (int(args.num_train_epochs) * iters_per_epoch), 4)))
                print("EVALERR: {}%".format(loss_rec))
            loss = torch.mean(loss)
            loss_rec = torch.mean(loss_rec)
            loss_kl = torch.mean(loss_kl)

            epoch_iterator.set_description(
                (
                    f'iter: {global_iter }; loss: {loss.item():.3f}; '
                    f'loss_rec: {loss_rec.item():.3f}; loss_kl: {loss_kl.item():.3f}; '
                    f'beta: {vae_core.args.beta:.3f}'
                )
            )

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_vae.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_vae.zero_grad()
                global_step += 1

                if is_main_process and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        # evaluate() requires table_name; the original call omitted it.
                        results = evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer, table_name)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if is_main_process and args.save_steps > 0 and global_step % args.save_steps == 0:
                    save_checkpoint(model_vae, optimizer, global_step, args)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if is_main_process:
        tb_writer.close()

    # Guard against a run that never reached an optimisation step.
    return global_step, tr_loss / max(global_step, 1), optimizer


def evaluate(args, model_vae, encoder_tokenizer, decoder_tokenizer, table_name, prefix="", subset="test"):
    """Compute VAE evaluation metrics on the evaluation file.

    Builds the evaluation data loader, then computes mutual information
    (``calc_mi``), active units (``calc_au``) and perplexity / ELBO / NLL / KL
    (``calc_iwnll`` with ``ns=100``).  The results are logged and written to
    ``<args.output_dir>/eval_results.txt``.

    Args:
        args: options namespace.  ``args.per_gpu_eval_batch_size`` is forced
            to 1 and ``args.eval_batch_size`` is written, both in place.
        model_vae: the VAE; a DataParallel/DDP wrapper is unwrapped and the
            model is switched to eval mode.
        encoder_tokenizer, decoder_tokenizer: passed to the data loader.
        table_name: unused; kept for interface compatibility.
        prefix: label used in the log header.
        subset: only used in the log message; the evaluation file is always
            the one loaded.

    Returns:
        dict with keys ``perplexity``, ``elbo``, ``kl``, ``nll``, ``au``, ``mi``.
    """
    eval_output_dir = args.output_dir

    logger.info("***** Running evaluation on {} dataset *****".format(subset))

    # Only the main process creates the output directory.
    if args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    args.per_gpu_eval_batch_size = 1
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_dataloader = build_dataload_and_cache_examples(args, [encoder_tokenizer, decoder_tokenizer], evaluate=True)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model_vae.eval()

    model_vae = model_vae.module if hasattr(model_vae, 'module') else model_vae  # Take care of distributed/parallel training

    mi = calc_mi(model_vae, eval_dataloader, args)
    au = calc_au(model_vae, eval_dataloader, delta=0.01, args=args)[0]
    ppl, elbo, nll, kl = calc_iwnll(model_vae, eval_dataloader, args, ns=100)

    result = {
        "perplexity": ppl, "elbo": elbo, "kl": kl, "nll": nll, "au": au, "mi": mi
    }

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    # The Azure-table ``row`` dict that used to be built here was never
    # used (its insert call is commented out), so it has been removed.
    return result




def evaluate_rec(args, model_vae, encoder_tokenizer, decoder_tokenizer, table_name, prefix="", subset="test"):
    """Evaluate reconstruction quality on the test or training file.

    Runs ``calc_rec`` (``ns=1``) over the chosen subset, logs the results and
    writes them to ``<args.output_dir>/eval_results.txt``.

    Args:
        args: options namespace; ``args.eval_batch_size`` is written in place.
        model_vae: the VAE; a DataParallel/DDP wrapper is unwrapped and the
            model is switched to eval mode.
        encoder_tokenizer, decoder_tokenizer: passed to the data loader.
        table_name: unused; kept for interface compatibility.
        prefix: label used in the log headers.
        subset: ``'test'`` (evaluation file) or ``'train'`` (training file).

    Returns:
        dict with keys ``rec_w`` and ``rec_s`` (the two values returned by
        ``calc_rec``).

    Raises:
        ValueError: if ``subset`` is neither ``'test'`` nor ``'train'``.  The
            original left ``eval_dataloader`` unbound in that case and failed
            later with ``UnboundLocalError``.
    """
    if subset not in ('test', 'train'):
        raise ValueError("subset must be 'test' or 'train', got {!r}".format(subset))

    eval_output_dir = args.output_dir

    # Only the main process creates the output directory.
    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    eval_dataloader = build_dataload_and_cache_examples(
        args, [encoder_tokenizer, decoder_tokenizer], evaluate=(subset == 'test'))
    logger.info("***** Running evaluation on {} dataset *****".format(subset))

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)

    model_vae.eval()
    model_vae = model_vae.module if hasattr(model_vae, 'module') else model_vae  # Take care of distributed/parallel training
    nll_s, nll_w = calc_rec(model_vae, eval_dataloader, args, ns=1)

    result = {
        "rec_w": nll_w, "rec_s": nll_s
    }

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("%s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

[nltk_data] Downloading package punkt to /home/harry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2022-06-15 18:48:12.420156: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-15 18:48:12.420181: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
#     train_data_file = "path/to/train/data/file.txt"
#     checkpoint_dir = "checkpoint-508523"
#     output_dir = "path/to/finetuned/model/directory"
#     dataset = "EMNLP"
#     eval_data_file = "path/to/validation/data/file.txt"
#     length_weighted_loss = True
#     encoder_model_type = "bert"
#     encoder_model_name_or_path = "bert-base-cased"
#     decoder_model_type = "gpt2"
#     decoder_model_name_or_path = "gpt2"
#     latent_size = 32
#     use_pretrained_model = True
#     beta = 0
#     do_train = True
#     ratio_increase = 0.25
#     ratio_zero = 0.5
#     fb_mode = 1
#     per_gpu_train_batch_size = 5
#     dim_target_kl = 0.5
#     save_steps = 1000
#     logging_steps = 1000
#     gloabl_step_eval = 508523
#     block_size = 100
#     overwrite_output_dir = True
#     num_train_epochs = 1.0

In [None]:
#VAE Training
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--checkpoint_dir", default=None, type=str,
                        help="The directory where checkpoints are saved.")                        
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--dataset", default=None, type=str, help="The dataset.")

    ## Other parameters
    parser.add_argument("--eval_data_file", default=None, type=str,
                        help="An optional input evaluation data file to evaluate the perplexity on (a text file).")
    parser.add_argument("--ExpName", default="", type=str,
                        help="The experiment name used in Azure Table.")
    parser.add_argument("--save_bert_gpt_init", action='store_true',
                        help="Use Philly for computing.")
    parser.add_argument("--length_weighted_loss", action='store_true',
                        help="Use sentence length re-weight the reconstruction loss.")


    ## Encoder options
    parser.add_argument("--encoder_model_type", default="bert", type=str,
                        help="The encoder model architecture to be fine-tuned.")
    parser.add_argument("--encoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The encoder model checkpoint for weights initialization.")
    parser.add_argument("--encoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--encoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")

    ## Decoder options
    parser.add_argument("--decoder_model_type", default="gpt2", type=str,
                        help="The decoder model architecture to be fine-tuned.")
    parser.add_argument("--decoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The decoder model checkpoint for weights initialization.")
    parser.add_argument("--decoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--decoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")

    ## Variational auto-encoder
    parser.add_argument("--latent_size", default=32, type=int, help="Latent space dimension.")
    parser.add_argument("--use_deterministic_connect", action='store_true',
                        help="Use deterministic inference to generate latent codes, i.e., standard auto-encoders.")
    parser.add_argument("--use_pretrained_model", action='store_true',
                        help="Use pre-trained auto-encoder models as the initialization")
    parser.add_argument("--latent_as_gpt_memory", default=1, type=int, help="Latent vector as memery for GPT2 to attend.")
    parser.add_argument("--latent_as_gpt_emb", default=1, type=int, help="Latent vector as embeddings for GPT2.")
    
    ## Objective functions
    parser.add_argument("--mlm", action='store_true',
                        help="Train with masked-language modeling loss instead of language modeling.")
    parser.add_argument("--mlm_probability", type=float, default=0.15,
                        help="Ratio of tokens to mask for masked language modeling loss")
    parser.add_argument("--beta", type=float, default=1.0,
                        help="The weighting hyper-parameter of the KL term in VAE")


    parser.add_argument("--cache_dir", default="", type=str,
                        help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="Optional input sequence length before tokenization. The sequence will be dropped if it is longer the max_seq_length")
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_eval_rec", action='store_true',
                        help="Whether to run eval reconstruction on a set of models.")   
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")


    # Training Schedules
    parser.add_argument("--ratio_increase", default=0.25, type=float,
                        help="Learning schedule, the percentage for the annealing stage.") 
    parser.add_argument("--ratio_zero", default=0.25, type=float,
                        help="Learning schedule, the percentage for the pure auto-encoding stage.")     
    parser.add_argument("--fb_mode", default=0, type=int,
                        help="free bit training mode.")   
    parser.add_argument("--dim_target_kl", default=3.0, type=float,
                        help="dim_target_kl free bit training mode.")                            
    parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=1, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=1.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--use_philly", action='store_true',
                        help="Use Philly for computing.")
    parser.add_argument("--use_pretrained_vae", action='store_true',
                        help="Use use_pretrained_vae as initialization, where beta value is specified in the folder")
    parser.add_argument("--use_random_weight", action='store_true',
                        help="Use random weights as initialization")


    ## IO: Logging and Saving
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gloabl_step_eval', type=int, default=661,
                        help="Evaluate the results at the given global step")

    # Precision & Distributed Training 
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
    
    args = parser.parse_args("--output_dir=output_dir_yahoo --dataset EMNLP --encoder_model_type=bert \
                             --encoder_model_name_or_path=bert-base-cased --decoder_model_type=gpt2 --decoder_model_name_or_path=gpt2 \
                             --beta 0 --ratio_zero 0.5 --ratio_increase 0.25 --do_train --fb_mode 1 --dim_target_kl 0.5 \
                             --train_data_file=../../yahoo/unlabelled/train.txt --eval_data_file=../../yahoo/unlabelled/test.txt \
                             --num_train_epochs 1.0 --save_steps 10000 --logging_steps 1000 --overwrite_output_dir --per_gpu_train_batch_size=5 \
                             --block_size 100 --length_weighted_loss --use_pretrained_model --checkpoint_dir=checkpoint-508523 --latent_size 32 \
                             --gloabl_step_eval 508523".split())

    if args.decoder_model_type in ["bert", "roberta"] and not args.mlm:
        raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
                         "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError("Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
                         "or remove the --do_eval argument.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
#     if args.local_rank == -1 or args.no_cuda:
#         device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
#         args.n_gpu = torch.cuda.device_count()
#     else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
#         torch.cuda.set_device(args.local_rank)
#         device = torch.device("cuda", args.local_rank)
#         torch.distributed.init_process_group(backend='nccl')
#         args.n_gpu = 1
    device = torch.device("cuda:1")
    args.device = device
    args.n_gpu = 1

    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    args.ExpName = 'Vae_' + args.dataset + '_Nz_' + str(args.latent_size)  + '_Beta_'  + str(args.beta) + '_Dkl_' + str(args.dim_target_kl) + '_Ra_' + str(args.ratio_increase) + '_R0_' + str(args.ratio_zero) 
    table_name = 'Vae' + args.dataset + 'Nz' + str(args.latent_size) 
    try: 
        ts.create_table(table_name)
    except:
        pass


    # Set seed
    set_seed(args)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training download model & vocab

    # Load Optimus pre-trained model and tokenizer
    if args.use_pretrained_model:
        args.encoder_model_type = args.encoder_model_type.lower()
        args.decoder_model_type = args.decoder_model_type.lower()

        global_step = args.gloabl_step_eval

        output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
        output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step)) 
        output_full_dir    = os.path.join(args.checkpoint_dir, 'checkpoint-full-{}'.format(global_step)) 

        checkpoints = [ [output_encoder_dir, output_decoder_dir] ]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        # Load a trained Encoder model and vocabulary
        encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
        model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
        tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)

        model_encoder.to(args.device)
        if args.block_size <= 0:
            args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
        args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

        # Load a trained Decoder model and vocabulary
        decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
        model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
        tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
        model_decoder.to(args.device)
        if args.block_size <= 0:
            args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
        args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

        # Load full model
        # checkpoint = torch.load(os.path.join(output_full_dir, 'training.bin')) # Paolo

        
    else:
        # Load BERT and GPT weights (As an alternaive, one may train a VAE for this small)

        ## Encoder 
        encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
        encoder_config = encoder_config_class.from_pretrained(args.encoder_config_name if args.encoder_config_name else args.encoder_model_name_or_path)
        tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)
        if args.block_size <= 0:
            args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
        args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)
        model_encoder = encoder_model_class.from_pretrained(args.encoder_model_name_or_path, from_tf=bool('.ckpt' in args.encoder_model_name_or_path), config=encoder_config, latent_size=args.latent_size)
        # model_encoder.to(args.device)

        ## Decoder 
        decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
        decoder_config = decoder_config_class.from_pretrained(args.decoder_config_name if args.decoder_config_name else args.decoder_model_name_or_path)
        tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
        if args.block_size <= 0:
            args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
        args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

        
        if args.latent_as_gpt_emb + args.latent_as_gpt_memory == 0:
            return # latent vector should pass into GPT to decode 
        else: 
            latent_as_gpt_emb = True if args.latent_as_gpt_emb == 1 else False
            latent_as_gpt_memory = True if args.latent_as_gpt_memory == 1 else False

        setattr(decoder_config, "latent_size", args.latent_size)
        model_decoder = decoder_model_class.from_pretrained(args.decoder_model_name_or_path, from_tf=bool('.ckpt' in args.decoder_model_name_or_path), config=decoder_config, latent_size=args.latent_size, latent_as_gpt_emb=latent_as_gpt_emb, latent_as_gpt_memory=latent_as_gpt_memory)
        
    # Save the init weights of BERT and GPT-2, so that we can load from local (Some infra requires so)
    if args.save_bert_gpt_init:
        encoder_path = os.path.join(args.output_dir, f"initial-models-tokenization-enoder-{args.latent_size}")
        if not os.path.exists(encoder_path): os.makedirs(encoder_path)
        model_encoder.save_pretrained(encoder_path)
        tokenizer_encoder.save_pretrained(encoder_path)

        decoder_path = os.path.join(args.output_dir, f"initial-models-tokenization-decoder-{args.latent_size}")
        if not os.path.exists(decoder_path): os.makedirs(decoder_path)
        model_decoder.save_pretrained(decoder_path)
        tokenizer_decoder.save_pretrained(decoder_path)

        return 



    # Chunyuan: Add Padding token to GPT2
    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens to GPT2')
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    assert tokenizer_decoder.pad_token == '<PAD>'

    # model_decoder.to(args.device)

    model_vae = VAE(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, args)

    # pdb.set_trace()
    if args.use_random_weight:
        model_vae.apply(weights_init_rondom)

    # if args.use_pretrained_model: # Paolo
    #     model_vae.load_state_dict(checkpoint['model_state_dict']) # Paolo
    #     logger.info("Pre-trained Optimus is successfully loaded") # Paolo
    # model_vae.to(args.device) # Paolo

    # on_gpu = next(model_vae.parameters()).is_cuda

    

    if args.local_rank == 0:
        torch.distributed.barrier()  # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)
    
    ##############################
    # Training
    global_step= 0
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            torch.distributed.barrier()  # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache

        train_dataloader = build_dataload_and_cache_examples(args, [tokenizer_encoder, tokenizer_decoder], evaluate=False)

        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss, optimizer = train(args, train_dataloader, model_vae, tokenizer_encoder, tokenizer_decoder, table_name)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        save_checkpoint(model_vae, optimizer, global_step, args)

        
    ##############################
    # Evaluate the metrics of VAE models, including PPL, MI, AU
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if global_step == 0:
            global_step = args.gloabl_step_eval

        output_encoder_dir = os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(global_step))
        output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
        output_full_dir    = os.path.join(args.output_dir, 'checkpoint-full-{}'.format(global_step))
        checkpoint_dir = [output_encoder_dir, output_decoder_dir, output_full_dir]

        logger.info("Evaluate the following checkpoint: %s", checkpoint_dir[-1])
        global_step = checkpoint_dir[-1].split('-')[-1] if len(checkpoint_dir) > 1 else ""

        checkpoint = torch.load(os.path.join(output_full_dir, 'training.bin'))
        model_vae.load_state_dict(checkpoint['model_state_dict'])
        logger.info(f"Pre-trained Optimus is successfully loaded: {output_full_dir}")
        model_vae.to(args.device)

        result = evaluate(args, model_vae, tokenizer_encoder, tokenizer_decoder, table_name, prefix=global_step, subset='test')
        result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_vae_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(results.keys()):
                logger.info("%s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))
        logger.info(f"The testing results are successfully saved: {output_eval_file}")

    ##############################
    # Evaluate the reconstruction loss for each checkpoint;
    # This is used in studying two different latent vector injection schemes
    results = {}
    if args.do_eval_rec and args.local_rank in [-1, 0]:
        if global_step == 0:
            global_step = args.gloabl_step_eval
            # eval_steps = range(500, 13500, 500)
            # eval_steps = range(1000, 2000, 500)
            eval_steps = range(2000, 32000, 2000)

        checkpoints = []
        for e in eval_steps:
            output_encoder_dir = os.path.join(args.output_dir, 'checkpoint-encoder-{}'.format(e))
            output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(e))
            checkpoints.append([output_encoder_dir, output_decoder_dir])

        

        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint[0].split('-')[-1] if len(checkpoints) > 1 else ""

            model_encoder = encoder_model_class.from_pretrained(checkpoint[0], latent_size=args.latent_size)
            model_encoder.to(args.device)     
     
            model_decoder = decoder_model_class.from_pretrained(checkpoint[1])
            model_decoder.to(args.device)

            model_vae = VAE(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, args).to(args.device)

            result = evaluate_rec(args, model_vae, tokenizer_encoder, tokenizer_decoder, table_name, prefix=global_step, subset='test')
            result = dict((k + '_test_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

            result = evaluate_rec(args, model_vae, tokenizer_encoder, tokenizer_decoder, table_name, prefix=global_step, subset='train')
            result = dict((k + '_train_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)
            
            # pdb.set_trace()

        output_eval_file = os.path.join(args.output_dir, "eval_rec_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(results.keys()):
                logger.info("%s = %s", key, str(results[key]))
                writer.write("%s = %s\n" % (key, str(results[key])))
        logger.info(f"The testing results are successfully saved: {output_eval_file}")


    return results


# Script entry point: invoke main() only when this file is executed directly,
# not when it is imported as a module.
# NOTE(review): main() passes a hard-coded argument string to
# parser.parse_args(...), so flags given on the command line are ignored.
if __name__ == "__main__":
    main()

06/15/2022 16:55:45 - INFO - __main__ -   Evaluate the following checkpoints: [['checkpoint-508523/checkpoint-encoder-508523', 'checkpoint-508523/checkpoint-decoder-508523']]
06/15/2022 16:55:45 - INFO - func.configuration_utils -   loading configuration file checkpoint-508523/checkpoint-encoder-508523/config.json
06/15/2022 16:55:45 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

06/15/2022 16:55:45 - INFO - func.modeling_utils -   loading weights 

We have added 3 tokens to GPT2


06/15/2022 16:55:56 - INFO - __main__ -   ***** Running training *****
06/15/2022 16:55:56 - INFO - __main__ -     Num examples = 120000
06/15/2022 16:55:56 - INFO - __main__ -     Num Epochs = 1
06/15/2022 16:55:56 - INFO - __main__ -     Instantaneous batch size per GPU = 5
06/15/2022 16:55:56 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 5
06/15/2022 16:55:56 - INFO - __main__ -     Gradient Accumulation steps = 1
06/15/2022 16:55:56 - INFO - __main__ -     Total optimization steps = 24000
Epoch:   0%|                                                                                                                                                                                                | 0/1 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  exp_avg.mul_(beta1).add_(

iter: 31; loss: 3.121; loss_rec: 38.634; loss_kl: 87.077; beta: 0.000:   0%|▏                                                                                                                  | 32/24000 [00:03<46:52,  8.52it/s][A
iter: 32; loss: 3.493; loss_rec: 45.039; loss_kl: 87.984; beta: 0.000:   0%|▏                                                                                                                  | 32/24000 [00:04<46:52,  8.52it/s][A
iter: 32; loss: 3.493; loss_rec: 45.039; loss_kl: 87.984; beta: 0.000:   0%|▏                                                                                                                  | 33/24000 [00:04<48:12,  8.29it/s][A
iter: 33; loss: 3.401; loss_rec: 28.150; loss_kl: 76.979; beta: 0.000:   0%|▏                                                                                                                  | 33/24000 [00:04<48:12,  8.29it/s][A
iter: 33; loss: 3.401; loss_rec: 28.150; loss_kl: 76.979; beta: 0.000:   0%|▏   

iter: 66; loss: 3.299; loss_rec: 23.775; loss_kl: 71.795; beta: 0.000:   0%|▎                                                                                                                  | 67/24000 [00:08<46:21,  8.61it/s][A
iter: 67; loss: 2.355; loss_rec: 26.627; loss_kl: 88.385; beta: 0.000:   0%|▎                                                                                                                  | 67/24000 [00:08<46:21,  8.61it/s][A
iter: 67; loss: 2.355; loss_rec: 26.627; loss_kl: 88.385; beta: 0.000:   0%|▎                                                                                                                  | 68/24000 [00:08<47:02,  8.48it/s][A
iter: 68; loss: 3.916; loss_rec: 52.442; loss_kl: 81.017; beta: 0.000:   0%|▎                                                                                                                  | 68/24000 [00:08<47:02,  8.48it/s][A
iter: 68; loss: 3.916; loss_rec: 52.442; loss_kl: 81.017; beta: 0.000:   0%|▎   

iter: 101; loss: 2.768; loss_rec: 33.299; loss_kl: 94.198; beta: 0.000:   0%|▍                                                                                                                | 102/24000 [00:12<45:44,  8.71it/s][A
iter: 102; loss: 3.038; loss_rec: 31.748; loss_kl: 87.281; beta: 0.000:   0%|▍                                                                                                                | 102/24000 [00:12<45:44,  8.71it/s][A
iter: 102; loss: 3.038; loss_rec: 31.748; loss_kl: 87.281; beta: 0.000:   0%|▍                                                                                                                | 103/24000 [00:12<46:48,  8.51it/s][A
iter: 103; loss: 3.720; loss_rec: 42.275; loss_kl: 89.917; beta: 0.000:   0%|▍                                                                                                                | 103/24000 [00:12<46:48,  8.51it/s][A
iter: 103; loss: 3.720; loss_rec: 42.275; loss_kl: 89.917; beta: 0.000:   0%|▍  

iter: 136; loss: 2.905; loss_rec: 46.013; loss_kl: 95.615; beta: 0.000:   1%|▋                                                                                                                | 137/24000 [00:16<47:49,  8.31it/s][A
iter: 137; loss: 3.146; loss_rec: 48.991; loss_kl: 99.525; beta: 0.000:   1%|▋                                                                                                                | 137/24000 [00:16<47:49,  8.31it/s][A
iter: 137; loss: 3.146; loss_rec: 48.991; loss_kl: 99.525; beta: 0.000:   1%|▋                                                                                                                | 138/24000 [00:16<47:58,  8.29it/s][A
iter: 138; loss: 3.537; loss_rec: 49.607; loss_kl: 95.762; beta: 0.000:   1%|▋                                                                                                                | 138/24000 [00:16<47:58,  8.29it/s][A
iter: 138; loss: 3.537; loss_rec: 49.607; loss_kl: 95.762; beta: 0.000:   1%|▋  

iter: 171; loss: 3.414; loss_rec: 47.117; loss_kl: 99.512; beta: 0.000:   1%|▊                                                                                                                | 172/24000 [00:20<48:21,  8.21it/s][A
iter: 172; loss: 2.986; loss_rec: 22.591; loss_kl: 97.353; beta: 0.000:   1%|▊                                                                                                                | 172/24000 [00:20<48:21,  8.21it/s][A
iter: 172; loss: 2.986; loss_rec: 22.591; loss_kl: 97.353; beta: 0.000:   1%|▊                                                                                                                | 173/24000 [00:20<47:52,  8.30it/s][A
iter: 173; loss: 3.111; loss_rec: 25.795; loss_kl: 96.830; beta: 0.000:   1%|▊                                                                                                                | 173/24000 [00:20<47:52,  8.30it/s][A
iter: 173; loss: 3.111; loss_rec: 25.795; loss_kl: 96.830; beta: 0.000:   1%|▊  

iter: 206; loss: 3.582; loss_rec: 57.704; loss_kl: 99.802; beta: 0.000:   1%|▉                                                                                                                | 207/24000 [00:24<48:42,  8.14it/s][A
iter: 207; loss: 2.493; loss_rec: 29.921; loss_kl: 93.891; beta: 0.000:   1%|▉                                                                                                                | 207/24000 [00:24<48:42,  8.14it/s][A
iter: 207; loss: 2.493; loss_rec: 29.921; loss_kl: 93.891; beta: 0.000:   1%|▉                                                                                                                | 208/24000 [00:24<47:40,  8.32it/s][A
iter: 208; loss: 3.604; loss_rec: 46.746; loss_kl: 95.326; beta: 0.000:   1%|▉                                                                                                                | 208/24000 [00:24<47:40,  8.32it/s][A
iter: 208; loss: 3.604; loss_rec: 46.746; loss_kl: 95.326; beta: 0.000:   1%|▉  

iter: 241; loss: 3.305; loss_rec: 32.131; loss_kl: 83.986; beta: 0.000:   1%|█▏                                                                                                               | 242/24000 [00:28<47:55,  8.26it/s][A
iter: 242; loss: 2.816; loss_rec: 32.286; loss_kl: 93.662; beta: 0.000:   1%|█▏                                                                                                               | 242/24000 [00:28<47:55,  8.26it/s][A
iter: 242; loss: 2.816; loss_rec: 32.286; loss_kl: 93.662; beta: 0.000:   1%|█▏                                                                                                               | 243/24000 [00:28<47:25,  8.35it/s][A
iter: 243; loss: 1.919; loss_rec: 14.977; loss_kl: 75.311; beta: 0.000:   1%|█▏                                                                                                               | 243/24000 [00:29<47:25,  8.35it/s][A
iter: 243; loss: 1.919; loss_rec: 14.977; loss_kl: 75.311; beta: 0.000:   1%|█▏ 

iter: 276; loss: 3.187; loss_rec: 45.377; loss_kl: 96.500; beta: 0.000:   1%|█▎                                                                                                               | 277/24000 [00:33<47:10,  8.38it/s][A
iter: 277; loss: 3.712; loss_rec: 52.768; loss_kl: 95.217; beta: 0.000:   1%|█▎                                                                                                               | 277/24000 [00:33<47:10,  8.38it/s][A
iter: 277; loss: 3.712; loss_rec: 52.768; loss_kl: 95.217; beta: 0.000:   1%|█▎                                                                                                               | 278/24000 [00:33<47:02,  8.41it/s][A
iter: 278; loss: 1.834; loss_rec: 12.115; loss_kl: 79.633; beta: 0.000:   1%|█▎                                                                                                               | 278/24000 [00:33<47:02,  8.41it/s][A
iter: 278; loss: 1.834; loss_rec: 12.115; loss_kl: 79.633; beta: 0.000:   1%|█▎ 

iter: 311; loss: 2.106; loss_rec: 17.432; loss_kl: 79.123; beta: 0.000:   1%|█▍                                                                                                               | 312/24000 [00:37<47:36,  8.29it/s][A
iter: 312; loss: 3.297; loss_rec: 33.887; loss_kl: 93.140; beta: 0.000:   1%|█▍                                                                                                               | 312/24000 [00:37<47:36,  8.29it/s][A
iter: 312; loss: 3.297; loss_rec: 33.887; loss_kl: 93.140; beta: 0.000:   1%|█▍                                                                                                               | 313/24000 [00:37<47:22,  8.33it/s][A
iter: 313; loss: 3.510; loss_rec: 48.151; loss_kl: 87.798; beta: 0.000:   1%|█▍                                                                                                               | 313/24000 [00:37<47:22,  8.33it/s][A
iter: 313; loss: 3.510; loss_rec: 48.151; loss_kl: 87.798; beta: 0.000:   1%|█▍ 

iter: 346; loss: 3.286; loss_rec: 35.572; loss_kl: 92.461; beta: 0.000:   1%|█▋                                                                                                               | 347/24000 [00:41<46:36,  8.46it/s][A
iter: 347; loss: 2.688; loss_rec: 24.317; loss_kl: 81.398; beta: 0.000:   1%|█▋                                                                                                               | 347/24000 [00:41<46:36,  8.46it/s][A
iter: 347; loss: 2.688; loss_rec: 24.317; loss_kl: 81.398; beta: 0.000:   1%|█▋                                                                                                               | 348/24000 [00:41<45:59,  8.57it/s][A
iter: 348; loss: 2.352; loss_rec: 18.817; loss_kl: 82.987; beta: 0.000:   1%|█▋                                                                                                               | 348/24000 [00:41<45:59,  8.57it/s][A
iter: 348; loss: 2.352; loss_rec: 18.817; loss_kl: 82.987; beta: 0.000:   1%|█▋ 

iter: 381; loss: 3.039; loss_rec: 25.900; loss_kl: 93.420; beta: 0.000:   2%|█▊                                                                                                               | 382/24000 [00:45<46:25,  8.48it/s][A
iter: 382; loss: 3.225; loss_rec: 34.521; loss_kl: 94.741; beta: 0.000:   2%|█▊                                                                                                               | 382/24000 [00:45<46:25,  8.48it/s][A
iter: 382; loss: 3.225; loss_rec: 34.521; loss_kl: 94.741; beta: 0.000:   2%|█▊                                                                                                               | 383/24000 [00:45<46:34,  8.45it/s][A
iter: 383; loss: 2.711; loss_rec: 33.280; loss_kl: 92.368; beta: 0.000:   2%|█▊                                                                                                               | 383/24000 [00:45<46:34,  8.45it/s][A
iter: 383; loss: 2.711; loss_rec: 33.280; loss_kl: 92.368; beta: 0.000:   2%|█▊ 

iter: 416; loss: 3.012; loss_rec: 47.609; loss_kl: 96.557; beta: 0.000:   2%|█▉                                                                                                               | 417/24000 [00:49<46:47,  8.40it/s][A
iter: 417; loss: 3.422; loss_rec: 42.163; loss_kl: 94.821; beta: 0.000:   2%|█▉                                                                                                               | 417/24000 [00:49<46:47,  8.40it/s][A
iter: 417; loss: 3.422; loss_rec: 42.163; loss_kl: 94.821; beta: 0.000:   2%|█▉                                                                                                               | 418/24000 [00:49<47:03,  8.35it/s][A
iter: 418; loss: 2.884; loss_rec: 28.581; loss_kl: 86.830; beta: 0.000:   2%|█▉                                                                                                               | 418/24000 [00:49<47:03,  8.35it/s][A
iter: 418; loss: 2.884; loss_rec: 28.581; loss_kl: 86.830; beta: 0.000:   2%|█▉ 

iter: 451; loss: 3.609; loss_rec: 41.946; loss_kl: 96.873; beta: 0.000:   2%|██▏                                                                                                              | 452/24000 [00:53<46:38,  8.41it/s][A
iter: 452; loss: 3.699; loss_rec: 50.920; loss_kl: 95.515; beta: 0.000:   2%|██▏                                                                                                              | 452/24000 [00:53<46:38,  8.41it/s][A
iter: 452; loss: 3.699; loss_rec: 50.920; loss_kl: 95.515; beta: 0.000:   2%|██▏                                                                                                              | 453/24000 [00:53<46:46,  8.39it/s][A
iter: 453; loss: 3.240; loss_rec: 44.110; loss_kl: 95.362; beta: 0.000:   2%|██▏                                                                                                              | 453/24000 [00:53<46:46,  8.39it/s][A
iter: 453; loss: 3.240; loss_rec: 44.110; loss_kl: 95.362; beta: 0.000:   2%|██▏

iter: 486; loss: 2.494; loss_rec: 31.672; loss_kl: 101.757; beta: 0.000:   2%|██▎                                                                                                             | 487/24000 [00:57<46:15,  8.47it/s][A
iter: 487; loss: 2.552; loss_rec: 22.791; loss_kl: 88.544; beta: 0.000:   2%|██▎                                                                                                              | 487/24000 [00:57<46:15,  8.47it/s][A
iter: 487; loss: 2.552; loss_rec: 22.791; loss_kl: 88.544; beta: 0.000:   2%|██▎                                                                                                              | 488/24000 [00:58<45:42,  8.57it/s][A
iter: 488; loss: 2.537; loss_rec: 29.637; loss_kl: 101.320; beta: 0.000:   2%|██▎                                                                                                             | 488/24000 [00:58<45:42,  8.57it/s][A
iter: 488; loss: 2.537; loss_rec: 29.637; loss_kl: 101.320; beta: 0.000:   2%|██

iter: 521; loss: 3.116; loss_rec: 48.557; loss_kl: 97.386; beta: 0.000:   2%|██▍                                                                                                              | 522/24000 [01:02<46:33,  8.40it/s][A
iter: 522; loss: 2.707; loss_rec: 20.074; loss_kl: 82.252; beta: 0.000:   2%|██▍                                                                                                              | 522/24000 [01:02<46:33,  8.40it/s][A
iter: 522; loss: 2.707; loss_rec: 20.074; loss_kl: 82.252; beta: 0.000:   2%|██▍                                                                                                              | 523/24000 [01:02<46:40,  8.38it/s][A
iter: 523; loss: 3.547; loss_rec: 47.049; loss_kl: 95.319; beta: 0.000:   2%|██▍                                                                                                              | 523/24000 [01:02<46:40,  8.38it/s][A
iter: 523; loss: 3.547; loss_rec: 47.049; loss_kl: 95.319; beta: 0.000:   2%|██▍

iter: 556; loss: 3.984; loss_rec: 54.417; loss_kl: 94.872; beta: 0.000:   2%|██▌                                                                                                              | 557/24000 [01:06<45:48,  8.53it/s][A
iter: 557; loss: 1.762; loss_rec: 8.898; loss_kl: 79.722; beta: 0.000:   2%|██▋                                                                                                               | 557/24000 [01:06<45:48,  8.53it/s][A
iter: 557; loss: 1.762; loss_rec: 8.898; loss_kl: 79.722; beta: 0.000:   2%|██▋                                                                                                               | 558/24000 [01:06<46:29,  8.40it/s][A
iter: 558; loss: 3.392; loss_rec: 32.819; loss_kl: 91.218; beta: 0.000:   2%|██▋                                                                                                              | 558/24000 [01:06<46:29,  8.40it/s][A
iter: 558; loss: 3.392; loss_rec: 32.819; loss_kl: 91.218; beta: 0.000:   2%|██▋

iter: 591; loss: 2.585; loss_rec: 12.716; loss_kl: 64.558; beta: 0.000:   2%|██▊                                                                                                              | 592/24000 [01:10<46:52,  8.32it/s][A
iter: 592; loss: 3.062; loss_rec: 32.046; loss_kl: 82.882; beta: 0.000:   2%|██▊                                                                                                              | 592/24000 [01:10<46:52,  8.32it/s][A
iter: 592; loss: 3.062; loss_rec: 32.046; loss_kl: 82.882; beta: 0.000:   2%|██▊                                                                                                              | 593/24000 [01:10<46:49,  8.33it/s][A
iter: 593; loss: 2.613; loss_rec: 16.714; loss_kl: 80.261; beta: 0.000:   2%|██▊                                                                                                              | 593/24000 [01:10<46:49,  8.33it/s][A
iter: 593; loss: 2.613; loss_rec: 16.714; loss_kl: 80.261; beta: 0.000:   2%|██▊

iter: 626; loss: 2.407; loss_rec: 28.870; loss_kl: 93.556; beta: 0.000:   3%|██▉                                                                                                              | 627/24000 [01:14<46:19,  8.41it/s][A
iter: 627; loss: 1.896; loss_rec: 11.377; loss_kl: 98.315; beta: 0.000:   3%|██▉                                                                                                              | 627/24000 [01:14<46:19,  8.41it/s][A
iter: 627; loss: 1.896; loss_rec: 11.377; loss_kl: 98.315; beta: 0.000:   3%|██▉                                                                                                              | 628/24000 [01:14<46:23,  8.40it/s][A
iter: 628; loss: 3.529; loss_rec: 35.932; loss_kl: 94.674; beta: 0.000:   3%|██▉                                                                                                              | 628/24000 [01:14<46:23,  8.40it/s][A
iter: 628; loss: 3.529; loss_rec: 35.932; loss_kl: 94.674; beta: 0.000:   3%|██▉

iter: 661; loss: 2.597; loss_rec: 31.168; loss_kl: 105.209; beta: 0.000:   3%|███                                                                                                             | 662/24000 [01:18<46:44,  8.32it/s][A
iter: 662; loss: 3.249; loss_rec: 38.241; loss_kl: 97.605; beta: 0.000:   3%|███                                                                                                              | 662/24000 [01:18<46:44,  8.32it/s][A
iter: 662; loss: 3.249; loss_rec: 38.241; loss_kl: 97.605; beta: 0.000:   3%|███                                                                                                              | 663/24000 [01:18<46:51,  8.30it/s][A
iter: 663; loss: 3.226; loss_rec: 34.716; loss_kl: 94.344; beta: 0.000:   3%|███                                                                                                              | 663/24000 [01:18<46:51,  8.30it/s][A
iter: 663; loss: 3.226; loss_rec: 34.716; loss_kl: 94.344; beta: 0.000:   3%|███

iter: 696; loss: 2.558; loss_rec: 34.746; loss_kl: 90.529; beta: 0.000:   3%|███▎                                                                                                             | 697/24000 [01:22<46:52,  8.28it/s][A
iter: 697; loss: 2.261; loss_rec: 22.608; loss_kl: 95.563; beta: 0.000:   3%|███▎                                                                                                             | 697/24000 [01:22<46:52,  8.28it/s][A
iter: 697; loss: 2.261; loss_rec: 22.608; loss_kl: 95.563; beta: 0.000:   3%|███▎                                                                                                             | 698/24000 [01:23<46:45,  8.31it/s][A
iter: 698; loss: 2.897; loss_rec: 31.125; loss_kl: 95.317; beta: 0.000:   3%|███▎                                                                                                             | 698/24000 [01:23<46:45,  8.31it/s][A
iter: 698; loss: 2.897; loss_rec: 31.125; loss_kl: 95.317; beta: 0.000:   3%|███

iter: 731; loss: 2.741; loss_rec: 21.388; loss_kl: 106.131; beta: 0.000:   3%|███▍                                                                                                            | 732/24000 [01:27<46:29,  8.34it/s][A
iter: 732; loss: 3.483; loss_rec: 40.997; loss_kl: 105.495; beta: 0.000:   3%|███▍                                                                                                            | 732/24000 [01:27<46:29,  8.34it/s][A
iter: 732; loss: 3.483; loss_rec: 40.997; loss_kl: 105.495; beta: 0.000:   3%|███▍                                                                                                            | 733/24000 [01:27<46:37,  8.32it/s][A
iter: 733; loss: 3.495; loss_rec: 50.060; loss_kl: 104.732; beta: 0.000:   3%|███▍                                                                                                            | 733/24000 [01:27<46:37,  8.32it/s][A
iter: 733; loss: 3.495; loss_rec: 50.060; loss_kl: 104.732; beta: 0.000:   3%|██

iter: 766; loss: 2.519; loss_rec: 28.507; loss_kl: 107.265; beta: 0.000:   3%|███▌                                                                                                            | 767/24000 [01:31<45:58,  8.42it/s][A
iter: 767; loss: 3.458; loss_rec: 51.483; loss_kl: 107.314; beta: 0.000:   3%|███▌                                                                                                            | 767/24000 [01:31<45:58,  8.42it/s][A
iter: 767; loss: 3.458; loss_rec: 51.483; loss_kl: 107.314; beta: 0.000:   3%|███▌                                                                                                            | 768/24000 [01:31<45:39,  8.48it/s][A
iter: 768; loss: 2.705; loss_rec: 24.519; loss_kl: 104.859; beta: 0.000:   3%|███▌                                                                                                            | 768/24000 [01:31<45:39,  8.48it/s][A
iter: 768; loss: 2.705; loss_rec: 24.519; loss_kl: 104.859; beta: 0.000:   3%|██

iter: 801; loss: 2.810; loss_rec: 28.100; loss_kl: 101.799; beta: 0.000:   3%|███▋                                                                                                            | 802/24000 [01:35<46:33,  8.30it/s][A
iter: 802; loss: 3.025; loss_rec: 26.934; loss_kl: 97.850; beta: 0.000:   3%|███▊                                                                                                             | 802/24000 [01:35<46:33,  8.30it/s][A
iter: 802; loss: 3.025; loss_rec: 26.934; loss_kl: 97.850; beta: 0.000:   3%|███▊                                                                                                             | 803/24000 [01:35<46:35,  8.30it/s][A
iter: 803; loss: 2.292; loss_rec: 13.425; loss_kl: 81.607; beta: 0.000:   3%|███▊                                                                                                             | 803/24000 [01:35<46:35,  8.30it/s][A
iter: 803; loss: 2.292; loss_rec: 13.425; loss_kl: 81.607; beta: 0.000:   3%|███

iter: 836; loss: 3.048; loss_rec: 48.713; loss_kl: 93.905; beta: 0.000:   3%|███▉                                                                                                             | 837/24000 [01:39<45:34,  8.47it/s][A
iter: 837; loss: 2.477; loss_rec: 16.034; loss_kl: 100.835; beta: 0.000:   3%|███▉                                                                                                            | 837/24000 [01:39<45:34,  8.47it/s][A
iter: 837; loss: 2.477; loss_rec: 16.034; loss_kl: 100.835; beta: 0.000:   3%|███▉                                                                                                            | 838/24000 [01:39<45:04,  8.56it/s][A
iter: 838; loss: 3.761; loss_rec: 58.880; loss_kl: 95.260; beta: 0.000:   3%|███▉                                                                                                             | 838/24000 [01:39<45:04,  8.56it/s][A
iter: 838; loss: 3.761; loss_rec: 58.880; loss_kl: 95.260; beta: 0.000:   3%|███

iter: 871; loss: 2.753; loss_rec: 40.036; loss_kl: 111.094; beta: 0.000:   4%|████                                                                                                            | 872/24000 [01:43<44:50,  8.60it/s][A
iter: 872; loss: 4.237; loss_rec: 57.379; loss_kl: 97.636; beta: 0.000:   4%|████                                                                                                             | 872/24000 [01:43<44:50,  8.60it/s][A
iter: 872; loss: 4.237; loss_rec: 57.379; loss_kl: 97.636; beta: 0.000:   4%|████                                                                                                             | 873/24000 [01:44<45:27,  8.48it/s][A
iter: 873; loss: 2.748; loss_rec: 25.875; loss_kl: 89.384; beta: 0.000:   4%|████                                                                                                             | 873/24000 [01:44<45:27,  8.48it/s][A
iter: 873; loss: 2.748; loss_rec: 25.875; loss_kl: 89.384; beta: 0.000:   4%|███

iter: 906; loss: 2.675; loss_rec: 37.210; loss_kl: 101.718; beta: 0.000:   4%|████▏                                                                                                           | 907/24000 [01:48<45:02,  8.55it/s][A
iter: 907; loss: 3.199; loss_rec: 30.781; loss_kl: 104.896; beta: 0.000:   4%|████▏                                                                                                           | 907/24000 [01:48<45:02,  8.55it/s][A
iter: 907; loss: 3.199; loss_rec: 30.781; loss_kl: 104.896; beta: 0.000:   4%|████▏                                                                                                           | 908/24000 [01:48<45:39,  8.43it/s][A
iter: 908; loss: 2.543; loss_rec: 17.804; loss_kl: 93.647; beta: 0.000:   4%|████▎                                                                                                            | 908/24000 [01:48<45:39,  8.43it/s][A
iter: 908; loss: 2.543; loss_rec: 17.804; loss_kl: 93.647; beta: 0.000:   4%|███

iter: 941; loss: 4.271; loss_rec: 56.287; loss_kl: 93.343; beta: 0.000:   4%|████▍                                                                                                            | 942/24000 [01:52<45:47,  8.39it/s][A
iter: 942; loss: 2.837; loss_rec: 44.659; loss_kl: 105.969; beta: 0.000:   4%|████▍                                                                                                           | 942/24000 [01:52<45:47,  8.39it/s][A
iter: 942; loss: 2.837; loss_rec: 44.659; loss_kl: 105.969; beta: 0.000:   4%|████▍                                                                                                           | 943/24000 [01:52<45:57,  8.36it/s][A
iter: 943; loss: 3.477; loss_rec: 30.732; loss_kl: 103.837; beta: 0.000:   4%|████▍                                                                                                           | 943/24000 [01:52<45:57,  8.36it/s][A
iter: 943; loss: 3.477; loss_rec: 30.732; loss_kl: 103.837; beta: 0.000:   4%|██

iter: 976; loss: 1.877; loss_rec: 18.671; loss_kl: 94.876; beta: 0.000:   4%|████▌                                                                                                            | 977/24000 [01:56<45:35,  8.42it/s][A
iter: 977; loss: 2.727; loss_rec: 29.994; loss_kl: 105.707; beta: 0.000:   4%|████▌                                                                                                           | 977/24000 [01:56<45:35,  8.42it/s][A
iter: 977; loss: 2.727; loss_rec: 29.994; loss_kl: 105.707; beta: 0.000:   4%|████▌                                                                                                           | 978/24000 [01:56<45:43,  8.39it/s][A
iter: 978; loss: 2.698; loss_rec: 29.041; loss_kl: 103.571; beta: 0.000:   4%|████▌                                                                                                           | 978/24000 [01:56<45:43,  8.39it/s][A
iter: 978; loss: 2.698; loss_rec: 29.041; loss_kl: 103.571; beta: 0.000:   4%|██

iter: 1011; loss: 2.187; loss_rec: 11.829; loss_kl: 83.893; beta: 0.000:   4%|████▋                                                                                                          | 1011/24000 [02:00<45:17,  8.46it/s][A
iter: 1011; loss: 2.187; loss_rec: 11.829; loss_kl: 83.893; beta: 0.000:   4%|████▋                                                                                                          | 1012/24000 [02:00<44:42,  8.57it/s][A
iter: 1012; loss: 1.669; loss_rec: 8.644; loss_kl: 85.139; beta: 0.000:   4%|████▋                                                                                                           | 1012/24000 [02:00<44:42,  8.57it/s][A
iter: 1012; loss: 1.669; loss_rec: 8.644; loss_kl: 85.139; beta: 0.000:   4%|████▋                                                                                                           | 1013/24000 [02:00<45:15,  8.46it/s][A
iter: 1013; loss: 3.308; loss_rec: 33.048; loss_kl: 102.026; beta: 0.000:   4%|█

iter: 1046; loss: 3.391; loss_rec: 48.817; loss_kl: 108.306; beta: 0.000:   4%|████▊                                                                                                         | 1046/24000 [02:04<45:50,  8.35it/s][A
iter: 1046; loss: 3.391; loss_rec: 48.817; loss_kl: 108.306; beta: 0.000:   4%|████▊                                                                                                         | 1047/24000 [02:04<45:57,  8.33it/s][A
iter: 1047; loss: 2.830; loss_rec: 43.721; loss_kl: 109.814; beta: 0.000:   4%|████▊                                                                                                         | 1047/24000 [02:04<45:57,  8.33it/s][A
iter: 1047; loss: 2.830; loss_rec: 43.721; loss_kl: 109.814; beta: 0.000:   4%|████▊                                                                                                         | 1048/24000 [02:04<46:11,  8.28it/s][A
iter: 1048; loss: 2.581; loss_rec: 36.139; loss_kl: 104.492; beta: 0.000:   4%|█

iter: 1081; loss: 3.171; loss_rec: 44.311; loss_kl: 112.193; beta: 0.000:   5%|████▉                                                                                                         | 1081/24000 [02:08<44:51,  8.51it/s][A
iter: 1081; loss: 3.171; loss_rec: 44.311; loss_kl: 112.193; beta: 0.000:   5%|████▉                                                                                                         | 1082/24000 [02:08<45:10,  8.45it/s][A
iter: 1082; loss: 3.246; loss_rec: 41.004; loss_kl: 111.446; beta: 0.000:   5%|████▉                                                                                                         | 1082/24000 [02:08<45:10,  8.45it/s][A
iter: 1082; loss: 3.246; loss_rec: 41.004; loss_kl: 111.446; beta: 0.000:   5%|████▉                                                                                                         | 1083/24000 [02:08<44:59,  8.49it/s][A
iter: 1083; loss: 1.785; loss_rec: 15.627; loss_kl: 90.075; beta: 0.000:   5%|██

iter: 1116; loss: 1.902; loss_rec: 14.834; loss_kl: 87.487; beta: 0.000:   5%|█████▏                                                                                                         | 1116/24000 [02:12<45:10,  8.44it/s][A
iter: 1116; loss: 1.902; loss_rec: 14.834; loss_kl: 87.487; beta: 0.000:   5%|█████▏                                                                                                         | 1117/24000 [02:12<44:48,  8.51it/s][A
iter: 1117; loss: 2.595; loss_rec: 20.317; loss_kl: 96.489; beta: 0.000:   5%|█████▏                                                                                                         | 1117/24000 [02:13<44:48,  8.51it/s][A
iter: 1117; loss: 2.595; loss_rec: 20.317; loss_kl: 96.489; beta: 0.000:   5%|█████▏                                                                                                         | 1118/24000 [02:13<45:03,  8.46it/s][A
iter: 1118; loss: 2.898; loss_rec: 28.054; loss_kl: 83.505; beta: 0.000:   5%|██

iter: 1151; loss: 2.987; loss_rec: 43.070; loss_kl: 103.435; beta: 0.000:   5%|█████▎                                                                                                        | 1151/24000 [02:17<45:23,  8.39it/s][A
iter: 1151; loss: 2.987; loss_rec: 43.070; loss_kl: 103.435; beta: 0.000:   5%|█████▎                                                                                                        | 1152/24000 [02:17<45:30,  8.37it/s][A
iter: 1152; loss: 1.607; loss_rec: 9.640; loss_kl: 82.558; beta: 0.000:   5%|█████▍                                                                                                          | 1152/24000 [02:17<45:30,  8.37it/s][A
iter: 1152; loss: 1.607; loss_rec: 9.640; loss_kl: 82.558; beta: 0.000:   5%|█████▍                                                                                                          | 1153/24000 [02:17<45:22,  8.39it/s][A
iter: 1153; loss: 3.121; loss_rec: 37.284; loss_kl: 111.086; beta: 0.000:   5%|█

iter: 1186; loss: 3.051; loss_rec: 46.477; loss_kl: 110.838; beta: 0.000:   5%|█████▍                                                                                                        | 1186/24000 [02:21<44:49,  8.48it/s][A
iter: 1186; loss: 3.051; loss_rec: 46.477; loss_kl: 110.838; beta: 0.000:   5%|█████▍                                                                                                        | 1187/24000 [02:21<44:52,  8.47it/s][A
iter: 1187; loss: 2.439; loss_rec: 25.370; loss_kl: 99.833; beta: 0.000:   5%|█████▍                                                                                                         | 1187/24000 [02:21<44:52,  8.47it/s][A
iter: 1187; loss: 2.439; loss_rec: 25.370; loss_kl: 99.833; beta: 0.000:   5%|█████▍                                                                                                         | 1188/24000 [02:21<44:54,  8.47it/s][A
iter: 1188; loss: 2.799; loss_rec: 19.778; loss_kl: 94.997; beta: 0.000:   5%|██

iter: 1221; loss: 1.599; loss_rec: 9.594; loss_kl: 88.965; beta: 0.000:   5%|█████▋                                                                                                          | 1221/24000 [02:25<43:51,  8.66it/s][A
iter: 1221; loss: 1.599; loss_rec: 9.594; loss_kl: 88.965; beta: 0.000:   5%|█████▋                                                                                                          | 1222/24000 [02:25<43:28,  8.73it/s][A
iter: 1222; loss: 2.554; loss_rec: 22.406; loss_kl: 102.760; beta: 0.000:   5%|█████▌                                                                                                        | 1222/24000 [02:25<43:28,  8.73it/s][A
iter: 1222; loss: 2.554; loss_rec: 22.406; loss_kl: 102.760; beta: 0.000:   5%|█████▌                                                                                                        | 1223/24000 [02:25<42:53,  8.85it/s][A
iter: 1223; loss: 2.680; loss_rec: 32.878; loss_kl: 101.705; beta: 0.000:   5%|█

iter: 1256; loss: 3.314; loss_rec: 44.022; loss_kl: 98.917; beta: 0.000:   5%|█████▊                                                                                                         | 1256/24000 [02:29<42:39,  8.89it/s][A
iter: 1256; loss: 3.314; loss_rec: 44.022; loss_kl: 98.917; beta: 0.000:   5%|█████▊                                                                                                         | 1257/24000 [02:29<43:02,  8.81it/s][A
iter: 1257; loss: 2.288; loss_rec: 21.943; loss_kl: 91.127; beta: 0.000:   5%|█████▊                                                                                                         | 1257/24000 [02:29<43:02,  8.81it/s][A
iter: 1257; loss: 2.288; loss_rec: 21.943; loss_kl: 91.127; beta: 0.000:   5%|█████▊                                                                                                         | 1258/24000 [02:29<43:10,  8.78it/s][A
iter: 1258; loss: 3.182; loss_rec: 45.530; loss_kl: 96.261; beta: 0.000:   5%|██

iter: 1291; loss: 2.107; loss_rec: 15.261; loss_kl: 102.491; beta: 0.000:   5%|█████▉                                                                                                        | 1291/24000 [02:33<42:14,  8.96it/s][A
iter: 1291; loss: 2.107; loss_rec: 15.261; loss_kl: 102.491; beta: 0.000:   5%|█████▉                                                                                                        | 1292/24000 [02:33<42:48,  8.84it/s][A
iter: 1292; loss: 2.360; loss_rec: 16.521; loss_kl: 101.582; beta: 0.000:   5%|█████▉                                                                                                        | 1292/24000 [02:33<42:48,  8.84it/s][A
iter: 1292; loss: 2.360; loss_rec: 16.521; loss_kl: 101.582; beta: 0.000:   5%|█████▉                                                                                                        | 1293/24000 [02:33<43:23,  8.72it/s][A
iter: 1293; loss: 2.827; loss_rec: 39.632; loss_kl: 111.900; beta: 0.000:   5%|█

iter: 1326; loss: 2.172; loss_rec: 23.182; loss_kl: 108.075; beta: 0.000:   6%|██████                                                                                                        | 1326/24000 [02:37<45:22,  8.33it/s][A
iter: 1326; loss: 2.172; loss_rec: 23.182; loss_kl: 108.075; beta: 0.000:   6%|██████                                                                                                        | 1327/24000 [02:37<45:08,  8.37it/s][A
iter: 1327; loss: 1.427; loss_rec: 9.670; loss_kl: 102.785; beta: 0.000:   6%|██████▏                                                                                                        | 1327/24000 [02:37<45:08,  8.37it/s][A
iter: 1327; loss: 1.427; loss_rec: 9.670; loss_kl: 102.785; beta: 0.000:   6%|██████▏                                                                                                        | 1328/24000 [02:37<44:24,  8.51it/s][A
iter: 1328; loss: 3.225; loss_rec: 42.146; loss_kl: 112.284; beta: 0.000:   6%|█

iter: 1361; loss: 2.188; loss_rec: 25.357; loss_kl: 96.001; beta: 0.000:   6%|██████▎                                                                                                        | 1361/24000 [02:41<44:34,  8.46it/s][A
iter: 1361; loss: 2.188; loss_rec: 25.357; loss_kl: 96.001; beta: 0.000:   6%|██████▎                                                                                                        | 1362/24000 [02:41<44:24,  8.50it/s][A
iter: 1362; loss: 2.631; loss_rec: 17.824; loss_kl: 95.525; beta: 0.000:   6%|██████▎                                                                                                        | 1362/24000 [02:41<44:24,  8.50it/s][A
iter: 1362; loss: 2.631; loss_rec: 17.824; loss_kl: 95.525; beta: 0.000:   6%|██████▎                                                                                                        | 1363/24000 [02:41<44:40,  8.45it/s][A
iter: 1363; loss: 1.605; loss_rec: 10.900; loss_kl: 85.305; beta: 0.000:   6%|██

iter: 1396; loss: 3.362; loss_rec: 33.893; loss_kl: 109.518; beta: 0.000:   6%|██████▍                                                                                                       | 1396/24000 [02:45<44:20,  8.49it/s][A
iter: 1396; loss: 3.362; loss_rec: 33.893; loss_kl: 109.518; beta: 0.000:   6%|██████▍                                                                                                       | 1397/24000 [02:45<44:00,  8.56it/s][A
iter: 1397; loss: 3.436; loss_rec: 43.552; loss_kl: 105.017; beta: 0.000:   6%|██████▍                                                                                                       | 1397/24000 [02:45<44:00,  8.56it/s][A
iter: 1397; loss: 3.436; loss_rec: 43.552; loss_kl: 105.017; beta: 0.000:   6%|██████▍                                                                                                       | 1398/24000 [02:45<44:32,  8.46it/s][A
iter: 1398; loss: 3.121; loss_rec: 40.284; loss_kl: 107.827; beta: 0.000:   6%|█

iter: 1431; loss: 3.239; loss_rec: 20.982; loss_kl: 103.316; beta: 0.000:   6%|██████▌                                                                                                       | 1431/24000 [02:49<45:22,  8.29it/s][A
iter: 1431; loss: 3.239; loss_rec: 20.982; loss_kl: 103.316; beta: 0.000:   6%|██████▌                                                                                                       | 1432/24000 [02:49<45:30,  8.27it/s][A
iter: 1432; loss: 2.790; loss_rec: 33.369; loss_kl: 105.348; beta: 0.000:   6%|██████▌                                                                                                       | 1432/24000 [02:49<45:30,  8.27it/s][A
iter: 1432; loss: 2.790; loss_rec: 33.369; loss_kl: 105.348; beta: 0.000:   6%|██████▌                                                                                                       | 1433/24000 [02:49<45:20,  8.30it/s][A
iter: 1433; loss: 2.029; loss_rec: 17.988; loss_kl: 103.380; beta: 0.000:   6%|█

iter: 1466; loss: 2.994; loss_rec: 31.967; loss_kl: 107.043; beta: 0.000:   6%|██████▋                                                                                                       | 1466/24000 [02:53<44:20,  8.47it/s][A
iter: 1466; loss: 2.994; loss_rec: 31.967; loss_kl: 107.043; beta: 0.000:   6%|██████▋                                                                                                       | 1467/24000 [02:53<44:38,  8.41it/s][A
iter: 1467; loss: 2.657; loss_rec: 38.528; loss_kl: 116.138; beta: 0.000:   6%|██████▋                                                                                                       | 1467/24000 [02:53<44:38,  8.41it/s][A
iter: 1467; loss: 2.657; loss_rec: 38.528; loss_kl: 116.138; beta: 0.000:   6%|██████▋                                                                                                       | 1468/24000 [02:53<45:00,  8.34it/s][A
iter: 1468; loss: 2.361; loss_rec: 19.373; loss_kl: 104.979; beta: 0.000:   6%|█

iter: 1501; loss: 3.667; loss_rec: 56.365; loss_kl: 111.218; beta: 0.000:   6%|██████▉                                                                                                       | 1501/24000 [02:57<45:20,  8.27it/s][A
iter: 1501; loss: 3.667; loss_rec: 56.365; loss_kl: 111.218; beta: 0.000:   6%|██████▉                                                                                                       | 1502/24000 [02:57<45:20,  8.27it/s][A
iter: 1502; loss: 3.901; loss_rec: 27.216; loss_kl: 105.752; beta: 0.000:   6%|██████▉                                                                                                       | 1502/24000 [02:57<45:20,  8.27it/s][A
iter: 1502; loss: 3.901; loss_rec: 27.216; loss_kl: 105.752; beta: 0.000:   6%|██████▉                                                                                                       | 1503/24000 [02:58<44:08,  8.49it/s][A
iter: 1503; loss: 3.731; loss_rec: 48.834; loss_kl: 111.586; beta: 0.000:   6%|█

iter: 1536; loss: 2.853; loss_rec: 29.965; loss_kl: 99.965; beta: 0.000:   6%|███████                                                                                                        | 1536/24000 [03:01<42:49,  8.74it/s][A
iter: 1536; loss: 2.853; loss_rec: 29.965; loss_kl: 99.965; beta: 0.000:   6%|███████                                                                                                        | 1537/24000 [03:02<42:54,  8.73it/s][A
iter: 1537; loss: 3.390; loss_rec: 44.808; loss_kl: 114.297; beta: 0.000:   6%|███████                                                                                                       | 1537/24000 [03:02<42:54,  8.73it/s][A
iter: 1537; loss: 3.390; loss_rec: 44.808; loss_kl: 114.297; beta: 0.000:   6%|███████                                                                                                       | 1538/24000 [03:02<43:17,  8.65it/s][A
iter: 1538; loss: 2.529; loss_rec: 26.591; loss_kl: 95.302; beta: 0.000:   6%|██

iter: 1571; loss: 2.310; loss_rec: 26.056; loss_kl: 116.378; beta: 0.000:   7%|███████▏                                                                                                      | 1571/24000 [03:06<45:02,  8.30it/s][A
iter: 1571; loss: 2.310; loss_rec: 26.056; loss_kl: 116.378; beta: 0.000:   7%|███████▏                                                                                                      | 1572/24000 [03:06<44:48,  8.34it/s][A
iter: 1572; loss: 2.593; loss_rec: 34.581; loss_kl: 98.225; beta: 0.000:   7%|███████▎                                                                                                       | 1572/24000 [03:06<44:48,  8.34it/s][A
iter: 1572; loss: 2.593; loss_rec: 34.581; loss_kl: 98.225; beta: 0.000:   7%|███████▎                                                                                                       | 1573/24000 [03:06<45:07,  8.28it/s][A
iter: 1573; loss: 2.177; loss_rec: 20.775; loss_kl: 94.676; beta: 0.000:   7%|██

iter: 1606; loss: 2.913; loss_rec: 33.462; loss_kl: 116.158; beta: 0.000:   7%|███████▎                                                                                                      | 1606/24000 [03:10<44:37,  8.36it/s][A
iter: 1606; loss: 2.913; loss_rec: 33.462; loss_kl: 116.158; beta: 0.000:   7%|███████▎                                                                                                      | 1607/24000 [03:10<44:37,  8.36it/s][A
iter: 1607; loss: 2.106; loss_rec: 18.953; loss_kl: 101.953; beta: 0.000:   7%|███████▎                                                                                                      | 1607/24000 [03:10<44:37,  8.36it/s][A
iter: 1607; loss: 2.106; loss_rec: 18.953; loss_kl: 101.953; beta: 0.000:   7%|███████▎                                                                                                      | 1608/24000 [03:10<44:07,  8.46it/s][A
iter: 1608; loss: 2.317; loss_rec: 27.252; loss_kl: 109.622; beta: 0.000:   7%|█

iter: 1641; loss: 1.746; loss_rec: 13.966; loss_kl: 106.505; beta: 0.000:   7%|███████▌                                                                                                      | 1641/24000 [03:14<44:44,  8.33it/s][A
iter: 1641; loss: 1.746; loss_rec: 13.966; loss_kl: 106.505; beta: 0.000:   7%|███████▌                                                                                                      | 1642/24000 [03:14<43:59,  8.47it/s][A
iter: 1642; loss: 2.507; loss_rec: 18.535; loss_kl: 108.153; beta: 0.000:   7%|███████▌                                                                                                      | 1642/24000 [03:14<43:59,  8.47it/s][A
iter: 1642; loss: 2.507; loss_rec: 18.535; loss_kl: 108.153; beta: 0.000:   7%|███████▌                                                                                                      | 1643/24000 [03:14<43:47,  8.51it/s][A
iter: 1643; loss: 2.085; loss_rec: 20.554; loss_kl: 99.254; beta: 0.000:   7%|██

iter: 1676; loss: 2.512; loss_rec: 22.980; loss_kl: 102.291; beta: 0.000:   7%|███████▋                                                                                                      | 1676/24000 [03:18<44:45,  8.31it/s][A
iter: 1676; loss: 2.512; loss_rec: 22.980; loss_kl: 102.291; beta: 0.000:   7%|███████▋                                                                                                      | 1677/24000 [03:18<44:48,  8.30it/s][A
iter: 1677; loss: 3.178; loss_rec: 38.444; loss_kl: 106.736; beta: 0.000:   7%|███████▋                                                                                                      | 1677/24000 [03:18<44:48,  8.30it/s][A
iter: 1677; loss: 3.178; loss_rec: 38.444; loss_kl: 106.736; beta: 0.000:   7%|███████▋                                                                                                      | 1678/24000 [03:18<45:10,  8.23it/s][A
iter: 1678; loss: 2.440; loss_rec: 29.104; loss_kl: 103.719; beta: 0.000:   7%|█

iter: 1711; loss: 2.926; loss_rec: 26.004; loss_kl: 110.595; beta: 0.000:   7%|███████▊                                                                                                      | 1711/24000 [03:22<44:43,  8.31it/s][A
iter: 1711; loss: 2.926; loss_rec: 26.004; loss_kl: 110.595; beta: 0.000:   7%|███████▊                                                                                                      | 1712/24000 [03:22<44:34,  8.33it/s][A
iter: 1712; loss: 2.515; loss_rec: 28.766; loss_kl: 104.926; beta: 0.000:   7%|███████▊                                                                                                      | 1712/24000 [03:22<44:34,  8.33it/s][A
iter: 1712; loss: 2.515; loss_rec: 28.766; loss_kl: 104.926; beta: 0.000:   7%|███████▊                                                                                                      | 1713/24000 [03:23<44:56,  8.26it/s][A
iter: 1713; loss: 2.352; loss_rec: 21.391; loss_kl: 107.902; beta: 0.000:   7%|█

iter: 1746; loss: 4.166; loss_rec: 44.756; loss_kl: 113.991; beta: 0.000:   7%|████████                                                                                                      | 1746/24000 [03:27<44:08,  8.40it/s][A
iter: 1746; loss: 4.166; loss_rec: 44.756; loss_kl: 113.991; beta: 0.000:   7%|████████                                                                                                      | 1747/24000 [03:27<44:39,  8.30it/s][A
iter: 1747; loss: 2.679; loss_rec: 33.214; loss_kl: 101.431; beta: 0.000:   7%|████████                                                                                                      | 1747/24000 [03:27<44:39,  8.30it/s][A
iter: 1747; loss: 2.679; loss_rec: 33.214; loss_kl: 101.431; beta: 0.000:   7%|████████                                                                                                      | 1748/24000 [03:27<44:22,  8.36it/s][A
iter: 1748; loss: 2.393; loss_rec: 31.582; loss_kl: 113.832; beta: 0.000:   7%|█

iter: 1781; loss: 2.556; loss_rec: 28.736; loss_kl: 120.108; beta: 0.000:   7%|████████▏                                                                                                     | 1781/24000 [03:31<44:29,  8.32it/s][A
iter: 1781; loss: 2.556; loss_rec: 28.736; loss_kl: 120.108; beta: 0.000:   7%|████████▏                                                                                                     | 1782/24000 [03:31<44:31,  8.32it/s][A
iter: 1782; loss: 2.195; loss_rec: 12.766; loss_kl: 93.092; beta: 0.000:   7%|████████▏                                                                                                      | 1782/24000 [03:31<44:31,  8.32it/s][A
iter: 1782; loss: 2.195; loss_rec: 12.766; loss_kl: 93.092; beta: 0.000:   7%|████████▏                                                                                                      | 1783/24000 [03:31<44:31,  8.32it/s][A
iter: 1783; loss: 2.613; loss_rec: 32.565; loss_kl: 116.966; beta: 0.000:   7%|█

iter: 1816; loss: 2.378; loss_rec: 24.360; loss_kl: 109.137; beta: 0.000:   8%|████████▎                                                                                                     | 1816/24000 [03:35<44:32,  8.30it/s][A
iter: 1816; loss: 2.378; loss_rec: 24.360; loss_kl: 109.137; beta: 0.000:   8%|████████▎                                                                                                     | 1817/24000 [03:35<45:16,  8.17it/s][A
iter: 1817; loss: 2.588; loss_rec: 27.850; loss_kl: 107.503; beta: 0.000:   8%|████████▎                                                                                                     | 1817/24000 [03:35<45:16,  8.17it/s][A
iter: 1817; loss: 2.588; loss_rec: 27.850; loss_kl: 107.503; beta: 0.000:   8%|████████▎                                                                                                     | 1818/24000 [03:35<44:45,  8.26it/s][A
iter: 1818; loss: 2.704; loss_rec: 33.970; loss_kl: 115.792; beta: 0.000:   8%|█

iter: 1851; loss: 2.509; loss_rec: 27.095; loss_kl: 99.812; beta: 0.000:   8%|████████▌                                                                                                      | 1851/24000 [03:39<45:08,  8.18it/s][A
iter: 1851; loss: 2.509; loss_rec: 27.095; loss_kl: 99.812; beta: 0.000:   8%|████████▌                                                                                                      | 1852/24000 [03:39<44:33,  8.28it/s][A
iter: 1852; loss: 1.559; loss_rec: 21.189; loss_kl: 114.880; beta: 0.000:   8%|████████▍                                                                                                     | 1852/24000 [03:39<44:33,  8.28it/s][A
iter: 1852; loss: 1.559; loss_rec: 21.189; loss_kl: 114.880; beta: 0.000:   8%|████████▍                                                                                                     | 1853/24000 [03:39<44:35,  8.28it/s][A
iter: 1853; loss: 3.220; loss_rec: 36.597; loss_kl: 102.354; beta: 0.000:   8%|█

iter: 1886; loss: 2.741; loss_rec: 30.152; loss_kl: 104.537; beta: 0.000:   8%|████████▋                                                                                                     | 1886/24000 [03:43<44:38,  8.26it/s][A
iter: 1886; loss: 2.741; loss_rec: 30.152; loss_kl: 104.537; beta: 0.000:   8%|████████▋                                                                                                     | 1887/24000 [03:43<44:25,  8.30it/s][A
iter: 1887; loss: 2.208; loss_rec: 22.294; loss_kl: 111.668; beta: 0.000:   8%|████████▋                                                                                                     | 1887/24000 [03:44<44:25,  8.30it/s][A
iter: 1887; loss: 2.208; loss_rec: 22.294; loss_kl: 111.668; beta: 0.000:   8%|████████▋                                                                                                     | 1888/24000 [03:44<44:17,  8.32it/s][A
iter: 1888; loss: 1.955; loss_rec: 11.135; loss_kl: 98.826; beta: 0.000:   8%|██

iter: 1921; loss: 1.867; loss_rec: 12.600; loss_kl: 101.719; beta: 0.000:   8%|████████▊                                                                                                     | 1921/24000 [03:48<44:15,  8.31it/s][A
iter: 1921; loss: 1.867; loss_rec: 12.600; loss_kl: 101.719; beta: 0.000:   8%|████████▊                                                                                                     | 1922/24000 [03:48<44:10,  8.33it/s][A
iter: 1922; loss: 2.127; loss_rec: 24.669; loss_kl: 111.987; beta: 0.000:   8%|████████▊                                                                                                     | 1922/24000 [03:48<44:10,  8.33it/s][A
iter: 1922; loss: 2.127; loss_rec: 24.669; loss_kl: 111.987; beta: 0.000:   8%|████████▊                                                                                                     | 1923/24000 [03:48<44:11,  8.33it/s][A
iter: 1923; loss: 3.581; loss_rec: 43.828; loss_kl: 106.184; beta: 0.000:   8%|█

iter: 1956; loss: 2.568; loss_rec: 30.796; loss_kl: 114.274; beta: 0.000:   8%|████████▉                                                                                                     | 1956/24000 [03:52<44:33,  8.25it/s][A
iter: 1956; loss: 2.568; loss_rec: 30.796; loss_kl: 114.274; beta: 0.000:   8%|████████▉                                                                                                     | 1957/24000 [03:52<44:17,  8.29it/s][A
iter: 1957; loss: 2.583; loss_rec: 27.982; loss_kl: 111.591; beta: 0.000:   8%|████████▉                                                                                                     | 1957/24000 [03:52<44:17,  8.29it/s][A
iter: 1957; loss: 2.583; loss_rec: 27.982; loss_kl: 111.591; beta: 0.000:   8%|████████▉                                                                                                     | 1958/24000 [03:52<44:26,  8.27it/s][A
iter: 1958; loss: 2.557; loss_rec: 21.257; loss_kl: 113.326; beta: 0.000:   8%|█

iter: 1991; loss: 1.774; loss_rec: 15.962; loss_kl: 104.754; beta: 0.000:   8%|█████████▏                                                                                                    | 1991/24000 [03:56<44:14,  8.29it/s][A
iter: 1991; loss: 1.774; loss_rec: 15.962; loss_kl: 104.754; beta: 0.000:   8%|█████████▏                                                                                                    | 1992/24000 [03:56<44:15,  8.29it/s][A
iter: 1992; loss: 1.980; loss_rec: 22.794; loss_kl: 109.042; beta: 0.000:   8%|█████████▏                                                                                                    | 1992/24000 [03:56<44:15,  8.29it/s][A
iter: 1992; loss: 1.980; loss_rec: 22.794; loss_kl: 109.042; beta: 0.000:   8%|█████████▏                                                                                                    | 1993/24000 [03:56<44:15,  8.29it/s][A
iter: 1993; loss: 1.383; loss_rec: 9.679; loss_kl: 102.339; beta: 0.000:   8%|██

iter: 2026; loss: 2.761; loss_rec: 39.447; loss_kl: 116.404; beta: 0.000:   8%|█████████▎                                                                                                    | 2026/24000 [04:00<44:07,  8.30it/s][A
iter: 2026; loss: 2.761; loss_rec: 39.447; loss_kl: 116.404; beta: 0.000:   8%|█████████▎                                                                                                    | 2027/24000 [04:00<44:12,  8.28it/s][A
iter: 2027; loss: 2.882; loss_rec: 32.360; loss_kl: 114.681; beta: 0.000:   8%|█████████▎                                                                                                    | 2027/24000 [04:00<44:12,  8.28it/s][A
iter: 2027; loss: 2.882; loss_rec: 32.360; loss_kl: 114.681; beta: 0.000:   8%|█████████▎                                                                                                    | 2028/24000 [04:00<44:06,  8.30it/s][A
iter: 2028; loss: 2.920; loss_rec: 44.129; loss_kl: 106.166; beta: 0.000:   8%|█

iter: 2061; loss: 2.862; loss_rec: 37.338; loss_kl: 105.306; beta: 0.000:   9%|█████████▍                                                                                                    | 2061/24000 [04:04<44:10,  8.28it/s][A
iter: 2061; loss: 2.862; loss_rec: 37.338; loss_kl: 105.306; beta: 0.000:   9%|█████████▍                                                                                                    | 2062/24000 [04:05<44:13,  8.27it/s][A
iter: 2062; loss: 2.598; loss_rec: 33.772; loss_kl: 114.396; beta: 0.000:   9%|█████████▍                                                                                                    | 2062/24000 [04:05<44:13,  8.27it/s][A
iter: 2062; loss: 2.598; loss_rec: 33.772; loss_kl: 114.396; beta: 0.000:   9%|█████████▍                                                                                                    | 2063/24000 [04:05<44:22,  8.24it/s][A
iter: 2063; loss: 1.467; loss_rec: 7.335; loss_kl: 100.397; beta: 0.000:   9%|██

iter: 2096; loss: 3.018; loss_rec: 40.241; loss_kl: 114.461; beta: 0.000:   9%|█████████▌                                                                                                    | 2096/24000 [04:09<43:57,  8.31it/s][A
iter: 2096; loss: 3.018; loss_rec: 40.241; loss_kl: 114.461; beta: 0.000:   9%|█████████▌                                                                                                    | 2097/24000 [04:09<44:08,  8.27it/s][A
iter: 2097; loss: 3.199; loss_rec: 47.686; loss_kl: 114.428; beta: 0.000:   9%|█████████▌                                                                                                    | 2097/24000 [04:09<44:08,  8.27it/s][A
iter: 2097; loss: 3.199; loss_rec: 47.686; loss_kl: 114.428; beta: 0.000:   9%|█████████▌                                                                                                    | 2098/24000 [04:09<44:10,  8.26it/s][A
iter: 2098; loss: 2.267; loss_rec: 22.041; loss_kl: 113.888; beta: 0.000:   9%|█

iter: 2131; loss: 2.934; loss_rec: 34.178; loss_kl: 116.253; beta: 0.000:   9%|█████████▊                                                                                                    | 2131/24000 [04:13<44:37,  8.17it/s][A
iter: 2131; loss: 2.934; loss_rec: 34.178; loss_kl: 116.253; beta: 0.000:   9%|█████████▊                                                                                                    | 2132/24000 [04:13<44:56,  8.11it/s][A
iter: 2132; loss: 2.255; loss_rec: 23.881; loss_kl: 121.108; beta: 0.000:   9%|█████████▊                                                                                                    | 2132/24000 [04:13<44:56,  8.11it/s][A
iter: 2132; loss: 2.255; loss_rec: 23.881; loss_kl: 121.108; beta: 0.000:   9%|█████████▊                                                                                                    | 2133/24000 [04:13<44:30,  8.19it/s][A
iter: 2133; loss: 2.796; loss_rec: 41.247; loss_kl: 106.849; beta: 0.000:   9%|█

iter: 2166; loss: 2.491; loss_rec: 26.700; loss_kl: 116.420; beta: 0.000:   9%|█████████▉                                                                                                    | 2166/24000 [04:17<43:50,  8.30it/s][A
iter: 2166; loss: 2.491; loss_rec: 26.700; loss_kl: 116.420; beta: 0.000:   9%|█████████▉                                                                                                    | 2167/24000 [04:17<43:40,  8.33it/s][A
iter: 2167; loss: 3.292; loss_rec: 47.339; loss_kl: 115.803; beta: 0.000:   9%|█████████▉                                                                                                    | 2167/24000 [04:17<43:40,  8.33it/s][A
iter: 2167; loss: 3.292; loss_rec: 47.339; loss_kl: 115.803; beta: 0.000:   9%|█████████▉                                                                                                    | 2168/24000 [04:17<43:49,  8.30it/s][A
iter: 2168; loss: 3.454; loss_rec: 44.904; loss_kl: 111.962; beta: 0.000:   9%|█

iter: 2201; loss: 2.384; loss_rec: 37.161; loss_kl: 111.168; beta: 0.000:   9%|██████████                                                                                                    | 2201/24000 [04:21<43:42,  8.31it/s][A
iter: 2201; loss: 2.384; loss_rec: 37.161; loss_kl: 111.168; beta: 0.000:   9%|██████████                                                                                                    | 2202/24000 [04:21<43:49,  8.29it/s][A
iter: 2202; loss: 2.783; loss_rec: 42.241; loss_kl: 114.906; beta: 0.000:   9%|██████████                                                                                                    | 2202/24000 [04:21<43:49,  8.29it/s][A
iter: 2202; loss: 2.783; loss_rec: 42.241; loss_kl: 114.906; beta: 0.000:   9%|██████████                                                                                                    | 2203/24000 [04:22<43:54,  8.27it/s][A
iter: 2203; loss: 2.418; loss_rec: 21.041; loss_kl: 110.443; beta: 0.000:   9%|█

iter: 2236; loss: 2.793; loss_rec: 35.212; loss_kl: 108.948; beta: 0.000:   9%|██████████▏                                                                                                   | 2236/24000 [04:26<43:56,  8.25it/s][A
iter: 2236; loss: 2.793; loss_rec: 35.212; loss_kl: 108.948; beta: 0.000:   9%|██████████▎                                                                                                   | 2237/24000 [04:26<44:02,  8.24it/s][A
iter: 2237; loss: 2.006; loss_rec: 16.847; loss_kl: 116.773; beta: 0.000:   9%|██████████▎                                                                                                   | 2237/24000 [04:26<44:02,  8.24it/s][A
iter: 2237; loss: 2.006; loss_rec: 16.847; loss_kl: 116.773; beta: 0.000:   9%|██████████▎                                                                                                   | 2238/24000 [04:26<43:40,  8.30it/s][A
iter: 2238; loss: 1.377; loss_rec: 9.054; loss_kl: 89.224; beta: 0.000:   9%|███

iter: 2271; loss: 2.534; loss_rec: 40.537; loss_kl: 114.435; beta: 0.000:   9%|██████████▍                                                                                                   | 2271/24000 [04:30<43:40,  8.29it/s][A
iter: 2271; loss: 2.534; loss_rec: 40.537; loss_kl: 114.435; beta: 0.000:   9%|██████████▍                                                                                                   | 2272/24000 [04:30<43:51,  8.26it/s][A
iter: 2272; loss: 1.089; loss_rec: 6.535; loss_kl: 89.784; beta: 0.000:   9%|██████████▌                                                                                                     | 2272/24000 [04:30<43:51,  8.26it/s][A
iter: 2272; loss: 1.089; loss_rec: 6.535; loss_kl: 89.784; beta: 0.000:   9%|██████████▌                                                                                                     | 2273/24000 [04:30<43:38,  8.30it/s][A
iter: 2273; loss: 2.190; loss_rec: 13.932; loss_kl: 101.658; beta: 0.000:   9%|█

iter: 2306; loss: 1.180; loss_rec: 9.438; loss_kl: 101.935; beta: 0.000:  10%|██████████▋                                                                                                    | 2306/24000 [04:34<43:40,  8.28it/s][A
iter: 2306; loss: 1.180; loss_rec: 9.438; loss_kl: 101.935; beta: 0.000:  10%|██████████▋                                                                                                    | 2307/24000 [04:34<43:49,  8.25it/s][A
iter: 2307; loss: 1.788; loss_rec: 22.331; loss_kl: 109.949; beta: 0.000:  10%|██████████▌                                                                                                   | 2307/24000 [04:34<43:49,  8.25it/s][A
iter: 2307; loss: 1.788; loss_rec: 22.331; loss_kl: 109.949; beta: 0.000:  10%|██████████▌                                                                                                   | 2308/24000 [04:34<43:27,  8.32it/s][A
iter: 2308; loss: 2.185; loss_rec: 31.371; loss_kl: 110.123; beta: 0.000:  10%|█

iter: 2341; loss: 3.089; loss_rec: 47.815; loss_kl: 111.148; beta: 0.000:  10%|██████████▋                                                                                                   | 2341/24000 [04:38<43:26,  8.31it/s][A
iter: 2341; loss: 3.089; loss_rec: 47.815; loss_kl: 111.148; beta: 0.000:  10%|██████████▋                                                                                                   | 2342/24000 [04:38<43:32,  8.29it/s][A
iter: 2342; loss: 1.205; loss_rec: 9.872; loss_kl: 103.419; beta: 0.000:  10%|██████████▊                                                                                                    | 2342/24000 [04:38<43:32,  8.29it/s][A
iter: 2342; loss: 1.205; loss_rec: 9.872; loss_kl: 103.419; beta: 0.000:  10%|██████████▊                                                                                                    | 2343/24000 [04:38<43:24,  8.32it/s][A
iter: 2343; loss: 3.874; loss_rec: 56.698; loss_kl: 121.389; beta: 0.000:  10%|█

iter: 2376; loss: 2.080; loss_rec: 28.350; loss_kl: 105.765; beta: 0.000:  10%|██████████▉                                                                                                   | 2376/24000 [04:42<42:52,  8.41it/s][A
iter: 2376; loss: 2.080; loss_rec: 28.350; loss_kl: 105.765; beta: 0.000:  10%|██████████▉                                                                                                   | 2377/24000 [04:42<43:05,  8.36it/s][A
iter: 2377; loss: 2.623; loss_rec: 30.864; loss_kl: 110.302; beta: 0.000:  10%|██████████▉                                                                                                   | 2377/24000 [04:42<43:05,  8.36it/s][A
iter: 2377; loss: 2.623; loss_rec: 30.864; loss_kl: 110.302; beta: 0.000:  10%|██████████▉                                                                                                   | 2378/24000 [04:43<42:47,  8.42it/s][A
iter: 2378; loss: 1.932; loss_rec: 14.479; loss_kl: 103.963; beta: 0.000:  10%|█

iter: 2411; loss: 2.088; loss_rec: 14.126; loss_kl: 112.962; beta: 0.000:  10%|███████████                                                                                                   | 2411/24000 [04:47<43:03,  8.36it/s][A
iter: 2411; loss: 2.088; loss_rec: 14.126; loss_kl: 112.962; beta: 0.000:  10%|███████████                                                                                                   | 2412/24000 [04:47<43:02,  8.36it/s][A
iter: 2412; loss: 3.039; loss_rec: 48.628; loss_kl: 116.515; beta: 0.000:  10%|███████████                                                                                                   | 2412/24000 [04:47<43:02,  8.36it/s][A
iter: 2412; loss: 3.039; loss_rec: 48.628; loss_kl: 116.515; beta: 0.000:  10%|███████████                                                                                                   | 2413/24000 [04:47<42:33,  8.45it/s][A
iter: 2413; loss: 2.086; loss_rec: 19.863; loss_kl: 116.211; beta: 0.000:  10%|█

iter: 2446; loss: 2.835; loss_rec: 35.987; loss_kl: 123.803; beta: 0.000:  10%|███████████▏                                                                                                  | 2446/24000 [04:51<42:05,  8.53it/s][A
iter: 2446; loss: 2.835; loss_rec: 35.987; loss_kl: 123.803; beta: 0.000:  10%|███████████▏                                                                                                  | 2447/24000 [04:51<42:36,  8.43it/s][A
iter: 2447; loss: 1.441; loss_rec: 7.207; loss_kl: 101.187; beta: 0.000:  10%|███████████▎                                                                                                   | 2447/24000 [04:51<42:36,  8.43it/s][A
iter: 2447; loss: 1.441; loss_rec: 7.207; loss_kl: 101.187; beta: 0.000:  10%|███████████▎                                                                                                   | 2448/24000 [04:51<42:38,  8.42it/s][A
iter: 2448; loss: 1.977; loss_rec: 20.461; loss_kl: 110.468; beta: 0.000:  10%|█

iter: 2481; loss: 2.563; loss_rec: 24.651; loss_kl: 114.381; beta: 0.000:  10%|███████████▎                                                                                                  | 2481/24000 [04:55<41:27,  8.65it/s][A
iter: 2481; loss: 2.563; loss_rec: 24.651; loss_kl: 114.381; beta: 0.000:  10%|███████████▍                                                                                                  | 2482/24000 [04:55<41:25,  8.66it/s][A
iter: 2482; loss: 2.647; loss_rec: 32.465; loss_kl: 119.776; beta: 0.000:  10%|███████████▍                                                                                                  | 2482/24000 [04:55<41:25,  8.66it/s][A
iter: 2482; loss: 2.647; loss_rec: 32.465; loss_kl: 119.776; beta: 0.000:  10%|███████████▍                                                                                                  | 2483/24000 [04:55<42:07,  8.51it/s][A
iter: 2483; loss: 2.080; loss_rec: 21.102; loss_kl: 114.611; beta: 0.000:  10%|█

iter: 2516; loss: 2.256; loss_rec: 29.837; loss_kl: 113.695; beta: 0.000:  10%|███████████▌                                                                                                  | 2516/24000 [04:59<40:25,  8.86it/s][A
iter: 2516; loss: 2.256; loss_rec: 29.837; loss_kl: 113.695; beta: 0.000:  10%|███████████▌                                                                                                  | 2517/24000 [04:59<40:39,  8.80it/s][A
iter: 2517; loss: 2.336; loss_rec: 33.684; loss_kl: 116.758; beta: 0.000:  10%|███████████▌                                                                                                  | 2517/24000 [04:59<40:39,  8.80it/s][A
iter: 2517; loss: 2.336; loss_rec: 33.684; loss_kl: 116.758; beta: 0.000:  10%|███████████▌                                                                                                  | 2518/24000 [04:59<40:55,  8.75it/s][A
iter: 2518; loss: 3.482; loss_rec: 50.625; loss_kl: 113.533; beta: 0.000:  10%|█

iter: 2551; loss: 2.221; loss_rec: 34.950; loss_kl: 111.208; beta: 0.000:  11%|███████████▋                                                                                                  | 2551/24000 [05:03<43:09,  8.28it/s][A
iter: 2551; loss: 2.221; loss_rec: 34.950; loss_kl: 111.208; beta: 0.000:  11%|███████████▋                                                                                                  | 2552/24000 [05:03<43:15,  8.26it/s][A
iter: 2552; loss: 2.576; loss_rec: 24.473; loss_kl: 120.366; beta: 0.000:  11%|███████████▋                                                                                                  | 2552/24000 [05:03<43:15,  8.26it/s][A
iter: 2552; loss: 2.576; loss_rec: 24.473; loss_kl: 120.366; beta: 0.000:  11%|███████████▋                                                                                                  | 2553/24000 [05:03<43:01,  8.31it/s][A
iter: 2553; loss: 2.290; loss_rec: 18.094; loss_kl: 99.508; beta: 0.000:  11%|██

iter: 2586; loss: 3.316; loss_rec: 48.101; loss_kl: 114.608; beta: 0.000:  11%|███████████▊                                                                                                  | 2586/24000 [05:07<43:00,  8.30it/s][A
iter: 2586; loss: 3.316; loss_rec: 48.101; loss_kl: 114.608; beta: 0.000:  11%|███████████▊                                                                                                  | 2587/24000 [05:07<43:02,  8.29it/s][A
iter: 2587; loss: 2.855; loss_rec: 31.804; loss_kl: 118.371; beta: 0.000:  11%|███████████▊                                                                                                  | 2587/24000 [05:07<43:02,  8.29it/s][A
iter: 2587; loss: 2.855; loss_rec: 31.804; loss_kl: 118.371; beta: 0.000:  11%|███████████▊                                                                                                  | 2588/24000 [05:08<42:55,  8.31it/s][A
iter: 2588; loss: 2.421; loss_rec: 17.864; loss_kl: 102.700; beta: 0.000:  11%|█

iter: 2621; loss: 2.648; loss_rec: 26.982; loss_kl: 114.328; beta: 0.000:  11%|████████████                                                                                                  | 2621/24000 [05:12<42:59,  8.29it/s][A
iter: 2621; loss: 2.648; loss_rec: 26.982; loss_kl: 114.328; beta: 0.000:  11%|████████████                                                                                                  | 2622/24000 [05:12<42:55,  8.30it/s][A
iter: 2622; loss: 2.251; loss_rec: 22.001; loss_kl: 118.043; beta: 0.000:  11%|████████████                                                                                                  | 2622/24000 [05:12<42:55,  8.30it/s][A
iter: 2622; loss: 2.251; loss_rec: 22.001; loss_kl: 118.043; beta: 0.000:  11%|████████████                                                                                                  | 2623/24000 [05:12<42:53,  8.31it/s][A
iter: 2623; loss: 2.377; loss_rec: 36.186; loss_kl: 118.839; beta: 0.000:  11%|█

iter: 2656; loss: 2.957; loss_rec: 39.209; loss_kl: 117.356; beta: 0.000:  11%|████████████▏                                                                                                 | 2656/24000 [05:16<42:56,  8.28it/s][A
iter: 2656; loss: 2.957; loss_rec: 39.209; loss_kl: 117.356; beta: 0.000:  11%|████████████▏                                                                                                 | 2657/24000 [05:16<42:58,  8.28it/s][A
iter: 2657; loss: 1.909; loss_rec: 20.682; loss_kl: 109.950; beta: 0.000:  11%|████████████▏                                                                                                 | 2657/24000 [05:16<42:58,  8.28it/s][A
iter: 2657; loss: 1.909; loss_rec: 20.682; loss_kl: 109.950; beta: 0.000:  11%|████████████▏                                                                                                 | 2658/24000 [05:16<42:52,  8.30it/s][A
iter: 2658; loss: 3.045; loss_rec: 43.064; loss_kl: 122.402; beta: 0.000:  11%|█

iter: 2691; loss: 1.936; loss_rec: 12.860; loss_kl: 110.117; beta: 0.000:  11%|████████████▎                                                                                                 | 2691/24000 [05:20<42:54,  8.28it/s][A
iter: 2691; loss: 1.936; loss_rec: 12.860; loss_kl: 110.117; beta: 0.000:  11%|████████████▎                                                                                                 | 2692/24000 [05:20<42:38,  8.33it/s][A
iter: 2692; loss: 2.766; loss_rec: 34.433; loss_kl: 126.149; beta: 0.000:  11%|████████████▎                                                                                                 | 2692/24000 [05:20<42:38,  8.33it/s][A
iter: 2692; loss: 2.766; loss_rec: 34.433; loss_kl: 126.149; beta: 0.000:  11%|████████████▎                                                                                                 | 2693/24000 [05:20<42:45,  8.31it/s][A
iter: 2693; loss: 2.711; loss_rec: 36.207; loss_kl: 126.873; beta: 0.000:  11%|█

iter: 2726; loss: 2.410; loss_rec: 33.303; loss_kl: 110.377; beta: 0.000:  11%|████████████▍                                                                                                 | 2726/24000 [05:24<42:43,  8.30it/s][A
iter: 2726; loss: 2.410; loss_rec: 33.303; loss_kl: 110.377; beta: 0.000:  11%|████████████▍                                                                                                 | 2727/24000 [05:24<42:44,  8.29it/s][A
iter: 2727; loss: 3.210; loss_rec: 47.451; loss_kl: 116.941; beta: 0.000:  11%|████████████▍                                                                                                 | 2727/24000 [05:24<42:44,  8.29it/s][A
iter: 2727; loss: 3.210; loss_rec: 47.451; loss_kl: 116.941; beta: 0.000:  11%|████████████▌                                                                                                 | 2728/24000 [05:24<42:42,  8.30it/s][A
iter: 2728; loss: 3.272; loss_rec: 47.976; loss_kl: 112.782; beta: 0.000:  11%|█

iter: 2761; loss: 2.886; loss_rec: 38.910; loss_kl: 117.484; beta: 0.000:  12%|████████████▋                                                                                                 | 2761/24000 [05:28<42:43,  8.29it/s][A
iter: 2761; loss: 2.886; loss_rec: 38.910; loss_kl: 117.484; beta: 0.000:  12%|████████████▋                                                                                                 | 2762/24000 [05:29<42:41,  8.29it/s][A
iter: 2762; loss: 3.147; loss_rec: 24.965; loss_kl: 122.281; beta: 0.000:  12%|████████████▋                                                                                                 | 2762/24000 [05:29<42:41,  8.29it/s][A
iter: 2762; loss: 3.147; loss_rec: 24.965; loss_kl: 122.281; beta: 0.000:  12%|████████████▋                                                                                                 | 2763/24000 [05:29<42:38,  8.30it/s][A
iter: 2763; loss: 2.168; loss_rec: 23.879; loss_kl: 122.721; beta: 0.000:  12%|█

iter: 2796; loss: 3.478; loss_rec: 38.378; loss_kl: 110.762; beta: 0.000:  12%|████████████▊                                                                                                 | 2796/24000 [05:33<42:19,  8.35it/s][A
iter: 2796; loss: 3.478; loss_rec: 38.378; loss_kl: 110.762; beta: 0.000:  12%|████████████▊                                                                                                 | 2797/24000 [05:33<42:42,  8.27it/s][A
iter: 2797; loss: 1.956; loss_rec: 18.920; loss_kl: 111.852; beta: 0.000:  12%|████████████▊                                                                                                 | 2797/24000 [05:33<42:42,  8.27it/s][A
iter: 2797; loss: 1.956; loss_rec: 18.920; loss_kl: 111.852; beta: 0.000:  12%|████████████▊                                                                                                 | 2798/24000 [05:33<42:29,  8.32it/s][A
iter: 2798; loss: 2.346; loss_rec: 17.332; loss_kl: 105.840; beta: 0.000:  12%|█

iter: 2831; loss: 2.024; loss_rec: 18.946; loss_kl: 116.447; beta: 0.000:  12%|████████████▉                                                                                                 | 2831/24000 [05:37<42:38,  8.27it/s][A
iter: 2831; loss: 2.024; loss_rec: 18.946; loss_kl: 116.447; beta: 0.000:  12%|████████████▉                                                                                                 | 2832/24000 [05:37<42:25,  8.32it/s][A
iter: 2832; loss: 3.999; loss_rec: 38.652; loss_kl: 121.756; beta: 0.000:  12%|████████████▉                                                                                                 | 2832/24000 [05:37<42:25,  8.32it/s][A
iter: 2832; loss: 3.999; loss_rec: 38.652; loss_kl: 121.756; beta: 0.000:  12%|████████████▉                                                                                                 | 2833/24000 [05:37<42:59,  8.21it/s][A
iter: 2833; loss: 3.088; loss_rec: 32.846; loss_kl: 129.650; beta: 0.000:  12%|█

iter: 2866; loss: 3.937; loss_rec: 52.292; loss_kl: 121.990; beta: 0.000:  12%|█████████████▏                                                                                                | 2866/24000 [05:41<42:47,  8.23it/s][A
iter: 2866; loss: 3.937; loss_rec: 52.292; loss_kl: 121.990; beta: 0.000:  12%|█████████████▏                                                                                                | 2867/24000 [05:41<42:20,  8.32it/s][A
iter: 2867; loss: 2.268; loss_rec: 25.747; loss_kl: 112.165; beta: 0.000:  12%|█████████████▏                                                                                                | 2867/24000 [05:41<42:20,  8.32it/s][A
iter: 2867; loss: 2.268; loss_rec: 25.747; loss_kl: 112.165; beta: 0.000:  12%|█████████████▏                                                                                                | 2868/24000 [05:41<42:30,  8.29it/s][A
iter: 2868; loss: 2.428; loss_rec: 24.069; loss_kl: 114.693; beta: 0.000:  12%|█

iter: 2901; loss: 2.519; loss_rec: 19.604; loss_kl: 124.508; beta: 0.000:  12%|█████████████▎                                                                                                | 2901/24000 [05:45<42:29,  8.28it/s][A
iter: 2901; loss: 2.519; loss_rec: 19.604; loss_kl: 124.508; beta: 0.000:  12%|█████████████▎                                                                                                | 2902/24000 [05:45<42:19,  8.31it/s][A
iter: 2902; loss: 1.833; loss_rec: 16.404; loss_kl: 123.718; beta: 0.000:  12%|█████████████▎                                                                                                | 2902/24000 [05:45<42:19,  8.31it/s][A
iter: 2902; loss: 1.833; loss_rec: 16.404; loss_kl: 123.718; beta: 0.000:  12%|█████████████▎                                                                                                | 2903/24000 [05:46<42:26,  8.28it/s][A
iter: 2903; loss: 2.756; loss_rec: 40.955; loss_kl: 117.413; beta: 0.000:  12%|█

iter: 2936; loss: 2.011; loss_rec: 21.741; loss_kl: 119.837; beta: 0.000:  12%|█████████████▍                                                                                                | 2936/24000 [05:50<42:28,  8.27it/s][A
iter: 2936; loss: 2.011; loss_rec: 21.741; loss_kl: 119.837; beta: 0.000:  12%|█████████████▍                                                                                                | 2937/24000 [05:50<42:39,  8.23it/s][A
iter: 2937; loss: 3.438; loss_rec: 40.680; loss_kl: 115.085; beta: 0.000:  12%|█████████████▍                                                                                                | 2937/24000 [05:50<42:39,  8.23it/s][A
iter: 2937; loss: 3.438; loss_rec: 40.680; loss_kl: 115.085; beta: 0.000:  12%|█████████████▍                                                                                                | 2938/24000 [05:50<42:10,  8.32it/s][A
iter: 2938; loss: 1.752; loss_rec: 11.655; loss_kl: 102.748; beta: 0.000:  12%|█

iter: 2971; loss: 2.731; loss_rec: 33.888; loss_kl: 120.582; beta: 0.000:  12%|█████████████▌                                                                                                | 2971/24000 [05:54<42:34,  8.23it/s][A
iter: 2971; loss: 2.731; loss_rec: 33.888; loss_kl: 120.582; beta: 0.000:  12%|█████████████▌                                                                                                | 2972/24000 [05:54<42:19,  8.28it/s][A
iter: 2972; loss: 2.142; loss_rec: 23.560; loss_kl: 118.353; beta: 0.000:  12%|█████████████▌                                                                                                | 2972/24000 [05:54<42:19,  8.28it/s][A
iter: 2972; loss: 2.142; loss_rec: 23.560; loss_kl: 118.353; beta: 0.000:  12%|█████████████▋                                                                                                | 2973/24000 [05:54<42:06,  8.32it/s][A
iter: 2973; loss: 3.314; loss_rec: 43.921; loss_kl: 110.395; beta: 0.000:  12%|█

iter: 3006; loss: 3.279; loss_rec: 40.063; loss_kl: 123.132; beta: 0.000:  13%|█████████████▊                                                                                                | 3006/24000 [05:58<39:31,  8.85it/s][A
iter: 3006; loss: 3.279; loss_rec: 40.063; loss_kl: 123.132; beta: 0.000:  13%|█████████████▊                                                                                                | 3007/24000 [05:58<39:23,  8.88it/s][A
iter: 3007; loss: 2.621; loss_rec: 25.463; loss_kl: 122.632; beta: 0.000:  13%|█████████████▊                                                                                                | 3007/24000 [05:58<39:23,  8.88it/s][A
iter: 3007; loss: 2.621; loss_rec: 25.463; loss_kl: 122.632; beta: 0.000:  13%|█████████████▊                                                                                                | 3008/24000 [05:58<39:05,  8.95it/s][A
iter: 3008; loss: 2.278; loss_rec: 21.584; loss_kl: 119.529; beta: 0.000:  13%|█

iter: 3041; loss: 0.965; loss_rec: 6.240; loss_kl: 87.132; beta: 0.000:  13%|██████████████▏                                                                                                 | 3041/24000 [06:02<39:21,  8.87it/s][A
iter: 3041; loss: 0.965; loss_rec: 6.240; loss_kl: 87.132; beta: 0.000:  13%|██████████████▏                                                                                                 | 3042/24000 [06:02<38:50,  8.99it/s][A
iter: 3042; loss: 2.317; loss_rec: 22.220; loss_kl: 96.003; beta: 0.000:  13%|██████████████                                                                                                 | 3042/24000 [06:02<38:50,  8.99it/s][A
iter: 3042; loss: 2.317; loss_rec: 22.220; loss_kl: 96.003; beta: 0.000:  13%|██████████████                                                                                                 | 3043/24000 [06:02<38:48,  9.00it/s][A
iter: 3043; loss: 3.328; loss_rec: 34.971; loss_kl: 107.740; beta: 0.000:  13%|█

iter: 3076; loss: 3.395; loss_rec: 53.739; loss_kl: 108.119; beta: 0.000:  13%|██████████████                                                                                                | 3076/24000 [06:06<40:21,  8.64it/s][A
iter: 3076; loss: 3.395; loss_rec: 53.739; loss_kl: 108.119; beta: 0.000:  13%|██████████████                                                                                                | 3077/24000 [06:06<41:02,  8.50it/s][A
iter: 3077; loss: 3.004; loss_rec: 34.411; loss_kl: 109.019; beta: 0.000:  13%|██████████████                                                                                                | 3077/24000 [06:06<41:02,  8.50it/s][A
iter: 3077; loss: 3.004; loss_rec: 34.411; loss_kl: 109.019; beta: 0.000:  13%|██████████████                                                                                                | 3078/24000 [06:06<41:00,  8.50it/s][A
iter: 3078; loss: 1.159; loss_rec: 6.957; loss_kl: 93.795; beta: 0.000:  13%|███

iter: 3111; loss: 2.445; loss_rec: 21.664; loss_kl: 116.039; beta: 0.000:  13%|██████████████▎                                                                                               | 3111/24000 [06:10<42:13,  8.25it/s][A
iter: 3111; loss: 2.445; loss_rec: 21.664; loss_kl: 116.039; beta: 0.000:  13%|██████████████▎                                                                                               | 3112/24000 [06:10<42:10,  8.26it/s][A
iter: 3112; loss: 2.455; loss_rec: 33.894; loss_kl: 126.207; beta: 0.000:  13%|██████████████▎                                                                                               | 3112/24000 [06:10<42:10,  8.26it/s][A
iter: 3112; loss: 2.455; loss_rec: 33.894; loss_kl: 126.207; beta: 0.000:  13%|██████████████▎                                                                                               | 3113/24000 [06:10<42:19,  8.22it/s][A
iter: 3113; loss: 2.416; loss_rec: 29.330; loss_kl: 122.449; beta: 0.000:  13%|█

iter: 3146; loss: 1.419; loss_rec: 7.096; loss_kl: 103.521; beta: 0.000:  13%|██████████████▌                                                                                                | 3146/24000 [06:14<41:08,  8.45it/s][A
iter: 3146; loss: 1.419; loss_rec: 7.096; loss_kl: 103.521; beta: 0.000:  13%|██████████████▌                                                                                                | 3147/24000 [06:14<40:49,  8.51it/s][A
iter: 3147; loss: 2.493; loss_rec: 29.207; loss_kl: 118.885; beta: 0.000:  13%|██████████████▍                                                                                               | 3147/24000 [06:14<40:49,  8.51it/s][A
iter: 3147; loss: 2.493; loss_rec: 29.207; loss_kl: 118.885; beta: 0.000:  13%|██████████████▍                                                                                               | 3148/24000 [06:14<41:14,  8.43it/s][A
iter: 3148; loss: 3.601; loss_rec: 53.198; loss_kl: 120.021; beta: 0.000:  13%|█

iter: 3181; loss: 2.123; loss_rec: 29.980; loss_kl: 106.803; beta: 0.000:  13%|██████████████▌                                                                                               | 3181/24000 [06:18<41:43,  8.31it/s][A
iter: 3181; loss: 2.123; loss_rec: 29.980; loss_kl: 106.803; beta: 0.000:  13%|██████████████▌                                                                                               | 3182/24000 [06:19<41:53,  8.28it/s][A
iter: 3182; loss: 3.017; loss_rec: 30.523; loss_kl: 123.713; beta: 0.000:  13%|██████████████▌                                                                                               | 3182/24000 [06:19<41:53,  8.28it/s][A
iter: 3182; loss: 3.017; loss_rec: 30.523; loss_kl: 123.713; beta: 0.000:  13%|██████████████▌                                                                                               | 3183/24000 [06:19<42:06,  8.24it/s][A
iter: 3183; loss: 1.218; loss_rec: 7.054; loss_kl: 105.892; beta: 0.000:  13%|██

iter: 3216; loss: 2.824; loss_rec: 39.121; loss_kl: 113.596; beta: 0.000:  13%|██████████████▋                                                                                               | 3216/24000 [06:23<41:06,  8.43it/s][A
iter: 3216; loss: 2.824; loss_rec: 39.121; loss_kl: 113.596; beta: 0.000:  13%|██████████████▋                                                                                               | 3217/24000 [06:23<41:22,  8.37it/s][A
iter: 3217; loss: 2.439; loss_rec: 28.277; loss_kl: 119.880; beta: 0.000:  13%|██████████████▋                                                                                               | 3217/24000 [06:23<41:22,  8.37it/s][A
iter: 3217; loss: 2.439; loss_rec: 28.277; loss_kl: 119.880; beta: 0.000:  13%|██████████████▋                                                                                               | 3218/24000 [06:23<41:23,  8.37it/s][A
iter: 3218; loss: 1.664; loss_rec: 8.322; loss_kl: 84.563; beta: 0.000:  13%|███

iter: 3251; loss: 3.270; loss_rec: 41.559; loss_kl: 124.497; beta: 0.000:  14%|██████████████▉                                                                                               | 3251/24000 [06:27<40:09,  8.61it/s][A
iter: 3251; loss: 3.270; loss_rec: 41.559; loss_kl: 124.497; beta: 0.000:  14%|██████████████▉                                                                                               | 3252/24000 [06:27<40:11,  8.60it/s][A
iter: 3252; loss: 2.178; loss_rec: 15.406; loss_kl: 114.496; beta: 0.000:  14%|██████████████▉                                                                                               | 3252/24000 [06:27<40:11,  8.60it/s][A
iter: 3252; loss: 2.178; loss_rec: 15.406; loss_kl: 114.496; beta: 0.000:  14%|██████████████▉                                                                                               | 3253/24000 [06:27<39:46,  8.69it/s][A
iter: 3253; loss: 2.769; loss_rec: 24.763; loss_kl: 116.108; beta: 0.000:  14%|█

iter: 3286; loss: 2.429; loss_rec: 31.943; loss_kl: 124.503; beta: 0.000:  14%|███████████████                                                                                               | 3286/24000 [06:31<41:02,  8.41it/s][A
iter: 3286; loss: 2.429; loss_rec: 31.943; loss_kl: 124.503; beta: 0.000:  14%|███████████████                                                                                               | 3287/24000 [06:31<41:18,  8.36it/s][A
iter: 3287; loss: 2.945; loss_rec: 40.676; loss_kl: 128.978; beta: 0.000:  14%|███████████████                                                                                               | 3287/24000 [06:31<41:18,  8.36it/s][A
iter: 3287; loss: 2.945; loss_rec: 40.676; loss_kl: 128.978; beta: 0.000:  14%|███████████████                                                                                               | 3288/24000 [06:31<41:42,  8.28it/s][A
iter: 3288; loss: 1.913; loss_rec: 20.634; loss_kl: 121.881; beta: 0.000:  14%|█

iter: 3321; loss: 1.917; loss_rec: 14.533; loss_kl: 121.046; beta: 0.000:  14%|███████████████▏                                                                                              | 3321/24000 [06:35<40:37,  8.49it/s][A
iter: 3321; loss: 1.917; loss_rec: 14.533; loss_kl: 121.046; beta: 0.000:  14%|███████████████▏                                                                                              | 3322/24000 [06:35<41:01,  8.40it/s][A
iter: 3322; loss: 2.250; loss_rec: 20.109; loss_kl: 122.086; beta: 0.000:  14%|███████████████▏                                                                                              | 3322/24000 [06:35<41:01,  8.40it/s][A
iter: 3322; loss: 2.250; loss_rec: 20.109; loss_kl: 122.086; beta: 0.000:  14%|███████████████▏                                                                                              | 3323/24000 [06:35<40:49,  8.44it/s][A
iter: 3323; loss: 2.792; loss_rec: 38.360; loss_kl: 125.708; beta: 0.000:  14%|█

iter: 3356; loss: 1.731; loss_rec: 15.785; loss_kl: 120.541; beta: 0.000:  14%|███████████████▍                                                                                              | 3356/24000 [06:39<41:18,  8.33it/s][A
iter: 3356; loss: 1.731; loss_rec: 15.785; loss_kl: 120.541; beta: 0.000:  14%|███████████████▍                                                                                              | 3357/24000 [06:39<41:38,  8.26it/s][A
iter: 3357; loss: 2.008; loss_rec: 26.700; loss_kl: 116.415; beta: 0.000:  14%|███████████████▍                                                                                              | 3357/24000 [06:39<41:38,  8.26it/s][A
iter: 3357; loss: 2.008; loss_rec: 26.700; loss_kl: 116.415; beta: 0.000:  14%|███████████████▍                                                                                              | 3358/24000 [06:40<41:34,  8.28it/s][A
iter: 3358; loss: 2.550; loss_rec: 33.888; loss_kl: 120.847; beta: 0.000:  14%|█

iter: 3391; loss: 2.587; loss_rec: 31.040; loss_kl: 120.471; beta: 0.000:  14%|███████████████▌                                                                                              | 3391/24000 [06:44<41:25,  8.29it/s][A
iter: 3391; loss: 2.587; loss_rec: 31.040; loss_kl: 120.471; beta: 0.000:  14%|███████████████▌                                                                                              | 3392/24000 [06:44<41:19,  8.31it/s][A
iter: 3392; loss: 2.494; loss_rec: 23.679; loss_kl: 127.438; beta: 0.000:  14%|███████████████▌                                                                                              | 3392/24000 [06:44<41:19,  8.31it/s][A
iter: 3392; loss: 2.494; loss_rec: 23.679; loss_kl: 127.438; beta: 0.000:  14%|███████████████▌                                                                                              | 3393/24000 [06:44<40:56,  8.39it/s][A
iter: 3393; loss: 3.855; loss_rec: 57.284; loss_kl: 118.922; beta: 0.000:  14%|█

iter: 3426; loss: 2.044; loss_rec: 18.058; loss_kl: 121.369; beta: 0.000:  14%|███████████████▋                                                                                              | 3426/24000 [06:48<41:13,  8.32it/s][A
iter: 3426; loss: 2.044; loss_rec: 18.058; loss_kl: 121.369; beta: 0.000:  14%|███████████████▋                                                                                              | 3427/24000 [06:48<41:30,  8.26it/s][A
iter: 3427; loss: 2.825; loss_rec: 25.485; loss_kl: 123.800; beta: 0.000:  14%|███████████████▋                                                                                              | 3427/24000 [06:48<41:30,  8.26it/s][A
iter: 3427; loss: 2.825; loss_rec: 25.485; loss_kl: 123.800; beta: 0.000:  14%|███████████████▋                                                                                              | 3428/24000 [06:48<41:14,  8.31it/s][A
iter: 3428; loss: 1.755; loss_rec: 16.218; loss_kl: 114.732; beta: 0.000:  14%|█

iter: 3461; loss: 3.426; loss_rec: 45.827; loss_kl: 128.925; beta: 0.000:  14%|███████████████▊                                                                                              | 3461/24000 [06:52<41:18,  8.29it/s][A
iter: 3461; loss: 3.426; loss_rec: 45.827; loss_kl: 128.925; beta: 0.000:  14%|███████████████▊                                                                                              | 3462/24000 [06:52<41:25,  8.26it/s][A
iter: 3462; loss: 2.362; loss_rec: 23.618; loss_kl: 124.077; beta: 0.000:  14%|███████████████▊                                                                                              | 3462/24000 [06:52<41:25,  8.26it/s][A
iter: 3462; loss: 2.362; loss_rec: 23.618; loss_kl: 124.077; beta: 0.000:  14%|███████████████▊                                                                                              | 3463/24000 [06:52<41:25,  8.26it/s][A
iter: 3463; loss: 1.515; loss_rec: 8.693; loss_kl: 114.635; beta: 0.000:  14%|██

iter: 3496; loss: 2.583; loss_rec: 30.466; loss_kl: 111.472; beta: 0.000:  15%|████████████████                                                                                              | 3496/24000 [06:56<40:55,  8.35it/s][A
iter: 3496; loss: 2.583; loss_rec: 30.466; loss_kl: 111.472; beta: 0.000:  15%|████████████████                                                                                              | 3497/24000 [06:56<40:59,  8.34it/s][A
iter: 3497; loss: 2.077; loss_rec: 17.410; loss_kl: 108.442; beta: 0.000:  15%|████████████████                                                                                              | 3497/24000 [06:56<40:59,  8.34it/s][A
iter: 3497; loss: 2.077; loss_rec: 17.410; loss_kl: 108.442; beta: 0.000:  15%|████████████████                                                                                              | 3498/24000 [06:56<41:11,  8.29it/s][A
iter: 3498; loss: 2.807; loss_rec: 34.572; loss_kl: 122.961; beta: 0.000:  15%|█

iter: 3531; loss: 2.527; loss_rec: 34.334; loss_kl: 120.618; beta: 0.000:  15%|████████████████▏                                                                                             | 3531/24000 [07:00<41:09,  8.29it/s][A
iter: 3531; loss: 2.527; loss_rec: 34.334; loss_kl: 120.618; beta: 0.000:  15%|████████████████▏                                                                                             | 3532/24000 [07:01<41:12,  8.28it/s][A
iter: 3532; loss: 2.420; loss_rec: 31.247; loss_kl: 119.750; beta: 0.000:  15%|████████████████▏                                                                                             | 3532/24000 [07:01<41:12,  8.28it/s][A
iter: 3532; loss: 2.420; loss_rec: 31.247; loss_kl: 119.750; beta: 0.000:  15%|████████████████▏                                                                                             | 3533/24000 [07:01<41:10,  8.28it/s][A
iter: 3533; loss: 3.046; loss_rec: 32.086; loss_kl: 124.050; beta: 0.000:  15%|█

iter: 3566; loss: 2.273; loss_rec: 30.392; loss_kl: 124.671; beta: 0.000:  15%|████████████████▎                                                                                             | 3566/24000 [07:05<41:05,  8.29it/s][A
iter: 3566; loss: 2.273; loss_rec: 30.392; loss_kl: 124.671; beta: 0.000:  15%|████████████████▎                                                                                             | 3567/24000 [07:05<41:02,  8.30it/s][A
iter: 3567; loss: 1.379; loss_rec: 14.162; loss_kl: 116.803; beta: 0.000:  15%|████████████████▎                                                                                             | 3567/24000 [07:05<41:02,  8.30it/s][A
iter: 3567; loss: 1.379; loss_rec: 14.162; loss_kl: 116.803; beta: 0.000:  15%|████████████████▎                                                                                             | 3568/24000 [07:05<41:18,  8.24it/s][A
iter: 3568; loss: 2.778; loss_rec: 44.502; loss_kl: 109.828; beta: 0.000:  15%|█

iter: 3601; loss: 2.217; loss_rec: 25.911; loss_kl: 123.432; beta: 0.000:  15%|████████████████▌                                                                                             | 3601/24000 [07:09<40:58,  8.30it/s][A
iter: 3601; loss: 2.217; loss_rec: 25.911; loss_kl: 123.432; beta: 0.000:  15%|████████████████▌                                                                                             | 3602/24000 [07:09<41:12,  8.25it/s][A
iter: 3602; loss: 2.871; loss_rec: 36.440; loss_kl: 123.003; beta: 0.000:  15%|████████████████▌                                                                                             | 3602/24000 [07:09<41:12,  8.25it/s][A
iter: 3602; loss: 2.871; loss_rec: 36.440; loss_kl: 123.003; beta: 0.000:  15%|████████████████▌                                                                                             | 3603/24000 [07:09<40:57,  8.30it/s][A
iter: 3603; loss: 3.516; loss_rec: 51.656; loss_kl: 128.636; beta: 0.000:  15%|█

iter: 3636; loss: 1.640; loss_rec: 13.616; loss_kl: 123.816; beta: 0.000:  15%|████████████████▋                                                                                             | 3636/24000 [07:13<40:58,  8.28it/s][A
iter: 3636; loss: 1.640; loss_rec: 13.616; loss_kl: 123.816; beta: 0.000:  15%|████████████████▋                                                                                             | 3637/24000 [07:13<40:56,  8.29it/s][A
iter: 3637; loss: 2.521; loss_rec: 23.306; loss_kl: 114.364; beta: 0.000:  15%|████████████████▋                                                                                             | 3637/24000 [07:13<40:56,  8.29it/s][A
iter: 3637; loss: 2.521; loss_rec: 23.306; loss_kl: 114.364; beta: 0.000:  15%|████████████████▋                                                                                             | 3638/24000 [07:13<40:54,  8.30it/s][A
iter: 3638; loss: 2.694; loss_rec: 28.740; loss_kl: 116.142; beta: 0.000:  15%|█

iter: 3671; loss: 2.241; loss_rec: 26.215; loss_kl: 113.982; beta: 0.000:  15%|████████████████▊                                                                                             | 3671/24000 [07:17<40:52,  8.29it/s][A
iter: 3671; loss: 2.241; loss_rec: 26.215; loss_kl: 113.982; beta: 0.000:  15%|████████████████▊                                                                                             | 3672/24000 [07:17<40:55,  8.28it/s][A
iter: 3672; loss: 1.935; loss_rec: 22.251; loss_kl: 116.452; beta: 0.000:  15%|████████████████▊                                                                                             | 3672/24000 [07:17<40:55,  8.28it/s][A
iter: 3672; loss: 1.935; loss_rec: 22.251; loss_kl: 116.452; beta: 0.000:  15%|████████████████▊                                                                                             | 3673/24000 [07:18<40:52,  8.29it/s][A
iter: 3673; loss: 2.298; loss_rec: 18.647; loss_kl: 125.738; beta: 0.000:  15%|█

iter: 3706; loss: 2.084; loss_rec: 13.837; loss_kl: 110.269; beta: 0.000:  15%|████████████████▉                                                                                             | 3706/24000 [07:22<40:57,  8.26it/s][A
iter: 3706; loss: 2.084; loss_rec: 13.837; loss_kl: 110.269; beta: 0.000:  15%|████████████████▉                                                                                             | 3707/24000 [07:22<40:43,  8.30it/s][A
iter: 3707; loss: 2.183; loss_rec: 26.477; loss_kl: 112.288; beta: 0.000:  15%|████████████████▉                                                                                             | 3707/24000 [07:22<40:43,  8.30it/s][A
iter: 3707; loss: 2.183; loss_rec: 26.477; loss_kl: 112.288; beta: 0.000:  15%|████████████████▉                                                                                             | 3708/24000 [07:22<40:44,  8.30it/s][A
iter: 3708; loss: 1.150; loss_rec: 8.052; loss_kl: 105.374; beta: 0.000:  15%|██

iter: 3741; loss: 2.703; loss_rec: 25.845; loss_kl: 113.495; beta: 0.000:  16%|█████████████████▏                                                                                            | 3741/24000 [07:26<40:41,  8.30it/s][A
iter: 3741; loss: 2.703; loss_rec: 25.845; loss_kl: 113.495; beta: 0.000:  16%|█████████████████▏                                                                                            | 3742/24000 [07:26<40:34,  8.32it/s][A
iter: 3742; loss: 1.981; loss_rec: 15.850; loss_kl: 111.740; beta: 0.000:  16%|█████████████████▏                                                                                            | 3742/24000 [07:26<40:34,  8.32it/s][A
iter: 3742; loss: 1.981; loss_rec: 15.850; loss_kl: 111.740; beta: 0.000:  16%|█████████████████▏                                                                                            | 3743/24000 [07:26<40:35,  8.32it/s][A
iter: 3743; loss: 2.792; loss_rec: 33.596; loss_kl: 113.859; beta: 0.000:  16%|█

iter: 3776; loss: 2.975; loss_rec: 41.621; loss_kl: 109.457; beta: 0.000:  16%|█████████████████▎                                                                                            | 3776/24000 [07:30<40:42,  8.28it/s][A
iter: 3776; loss: 2.975; loss_rec: 41.621; loss_kl: 109.457; beta: 0.000:  16%|█████████████████▎                                                                                            | 3777/24000 [07:30<40:40,  8.29it/s][A
iter: 3777; loss: 2.952; loss_rec: 42.017; loss_kl: 127.245; beta: 0.000:  16%|█████████████████▎                                                                                            | 3777/24000 [07:30<40:40,  8.29it/s][A
iter: 3777; loss: 2.952; loss_rec: 42.017; loss_kl: 127.245; beta: 0.000:  16%|█████████████████▎                                                                                            | 3778/24000 [07:30<42:22,  7.95it/s][A
iter: 3778; loss: 2.477; loss_rec: 25.014; loss_kl: 126.902; beta: 0.000:  16%|█

iter: 3811; loss: 2.724; loss_rec: 39.153; loss_kl: 125.825; beta: 0.000:  16%|█████████████████▍                                                                                            | 3811/24000 [07:34<41:02,  8.20it/s][A
iter: 3811; loss: 2.724; loss_rec: 39.153; loss_kl: 125.825; beta: 0.000:  16%|█████████████████▍                                                                                            | 3812/24000 [07:34<41:04,  8.19it/s][A
iter: 3812; loss: 2.830; loss_rec: 32.756; loss_kl: 133.501; beta: 0.000:  16%|█████████████████▍                                                                                            | 3812/24000 [07:34<41:04,  8.19it/s][A
iter: 3812; loss: 2.830; loss_rec: 32.756; loss_kl: 133.501; beta: 0.000:  16%|█████████████████▍                                                                                            | 3813/24000 [07:34<41:00,  8.20it/s][A
iter: 3813; loss: 2.255; loss_rec: 22.052; loss_kl: 123.646; beta: 0.000:  16%|█

iter: 3846; loss: 2.709; loss_rec: 31.121; loss_kl: 125.029; beta: 0.000:  16%|█████████████████▋                                                                                            | 3846/24000 [07:38<40:44,  8.25it/s][A
iter: 3846; loss: 2.709; loss_rec: 31.121; loss_kl: 125.029; beta: 0.000:  16%|█████████████████▋                                                                                            | 3847/24000 [07:39<40:11,  8.36it/s][A
iter: 3847; loss: 1.491; loss_rec: 8.621; loss_kl: 109.327; beta: 0.000:  16%|█████████████████▊                                                                                             | 3847/24000 [07:39<40:11,  8.36it/s][A
iter: 3847; loss: 1.491; loss_rec: 8.621; loss_kl: 109.327; beta: 0.000:  16%|█████████████████▊                                                                                             | 3848/24000 [07:39<40:19,  8.33it/s][A
iter: 3848; loss: 2.105; loss_rec: 23.159; loss_kl: 126.363; beta: 0.000:  16%|█

iter: 3881; loss: 2.606; loss_rec: 30.451; loss_kl: 121.492; beta: 0.000:  16%|█████████████████▊                                                                                            | 3881/24000 [07:43<39:44,  8.44it/s][A
iter: 3881; loss: 2.606; loss_rec: 30.451; loss_kl: 121.492; beta: 0.000:  16%|█████████████████▊                                                                                            | 3882/24000 [07:43<39:43,  8.44it/s][A
iter: 3882; loss: 3.234; loss_rec: 41.376; loss_kl: 126.646; beta: 0.000:  16%|█████████████████▊                                                                                            | 3882/24000 [07:43<39:43,  8.44it/s][A
iter: 3882; loss: 3.234; loss_rec: 41.376; loss_kl: 126.646; beta: 0.000:  16%|█████████████████▊                                                                                            | 3883/24000 [07:43<39:53,  8.40it/s][A
iter: 3883; loss: 2.741; loss_rec: 35.113; loss_kl: 126.831; beta: 0.000:  16%|█

iter: 3916; loss: 2.750; loss_rec: 36.562; loss_kl: 123.177; beta: 0.000:  16%|█████████████████▉                                                                                            | 3916/24000 [07:47<39:42,  8.43it/s][A
iter: 3916; loss: 2.750; loss_rec: 36.562; loss_kl: 123.177; beta: 0.000:  16%|█████████████████▉                                                                                            | 3917/24000 [07:47<40:07,  8.34it/s][A
iter: 3917; loss: 2.858; loss_rec: 23.712; loss_kl: 126.944; beta: 0.000:  16%|█████████████████▉                                                                                            | 3917/24000 [07:47<40:07,  8.34it/s][A
iter: 3917; loss: 2.858; loss_rec: 23.712; loss_kl: 126.944; beta: 0.000:  16%|█████████████████▉                                                                                            | 3918/24000 [07:47<40:02,  8.36it/s][A
iter: 3918; loss: 3.022; loss_rec: 36.457; loss_kl: 126.099; beta: 0.000:  16%|█

iter: 3951; loss: 0.842; loss_rec: 6.016; loss_kl: 120.039; beta: 0.000:  16%|██████████████████▎                                                                                            | 3951/24000 [07:51<39:58,  8.36it/s][A
iter: 3951; loss: 0.842; loss_rec: 6.016; loss_kl: 120.039; beta: 0.000:  16%|██████████████████▎                                                                                            | 3952/24000 [07:51<40:05,  8.33it/s][A
iter: 3952; loss: 2.313; loss_rec: 36.474; loss_kl: 114.962; beta: 0.000:  16%|██████████████████                                                                                            | 3952/24000 [07:51<40:05,  8.33it/s][A
iter: 3952; loss: 2.313; loss_rec: 36.474; loss_kl: 114.962; beta: 0.000:  16%|██████████████████                                                                                            | 3953/24000 [07:51<40:01,  8.35it/s][A
iter: 3953; loss: 2.093; loss_rec: 24.678; loss_kl: 120.018; beta: 0.000:  16%|█

iter: 3986; loss: 1.259; loss_rec: 8.097; loss_kl: 110.764; beta: 0.000:  17%|██████████████████▍                                                                                            | 3986/24000 [07:55<39:24,  8.46it/s][A
iter: 3986; loss: 1.259; loss_rec: 8.097; loss_kl: 110.764; beta: 0.000:  17%|██████████████████▍                                                                                            | 3987/24000 [07:55<39:46,  8.38it/s][A
iter: 3987; loss: 3.047; loss_rec: 43.426; loss_kl: 127.973; beta: 0.000:  17%|██████████████████▎                                                                                           | 3987/24000 [07:55<39:46,  8.38it/s][A
iter: 3987; loss: 3.047; loss_rec: 43.426; loss_kl: 127.973; beta: 0.000:  17%|██████████████████▎                                                                                           | 3988/24000 [07:55<39:56,  8.35it/s][A
iter: 3988; loss: 2.455; loss_rec: 34.353; loss_kl: 130.016; beta: 0.000:  17%|█

iter: 4021; loss: 1.747; loss_rec: 19.747; loss_kl: 121.074; beta: 0.000:  17%|██████████████████▍                                                                                           | 4021/24000 [07:59<39:06,  8.52it/s][A
iter: 4021; loss: 1.747; loss_rec: 19.747; loss_kl: 121.074; beta: 0.000:  17%|██████████████████▍                                                                                           | 4022/24000 [07:59<38:43,  8.60it/s][A
iter: 4022; loss: 2.844; loss_rec: 26.804; loss_kl: 107.228; beta: 0.000:  17%|██████████████████▍                                                                                           | 4022/24000 [07:59<38:43,  8.60it/s][A
iter: 4022; loss: 2.844; loss_rec: 26.804; loss_kl: 107.228; beta: 0.000:  17%|██████████████████▍                                                                                           | 4023/24000 [07:59<38:53,  8.56it/s][A
iter: 4023; loss: 3.733; loss_rec: 54.518; loss_kl: 124.761; beta: 0.000:  17%|█

iter: 4056; loss: 1.930; loss_rec: 9.648; loss_kl: 114.450; beta: 0.000:  17%|██████████████████▊                                                                                            | 4056/24000 [08:03<40:20,  8.24it/s][A
iter: 4056; loss: 1.930; loss_rec: 9.648; loss_kl: 114.450; beta: 0.000:  17%|██████████████████▊                                                                                            | 4057/24000 [08:03<40:15,  8.26it/s][A
iter: 4057; loss: 1.701; loss_rec: 16.443; loss_kl: 127.635; beta: 0.000:  17%|██████████████████▌                                                                                           | 4057/24000 [08:03<40:15,  8.26it/s][A
iter: 4057; loss: 1.701; loss_rec: 16.443; loss_kl: 127.635; beta: 0.000:  17%|██████████████████▌                                                                                           | 4058/24000 [08:04<40:14,  8.26it/s][A
iter: 4058; loss: 2.573; loss_rec: 36.113; loss_kl: 129.741; beta: 0.000:  17%|█

iter: 4091; loss: 2.368; loss_rec: 28.351; loss_kl: 131.453; beta: 0.000:  17%|██████████████████▊                                                                                           | 4091/24000 [08:08<39:07,  8.48it/s][A
iter: 4091; loss: 2.368; loss_rec: 28.351; loss_kl: 131.453; beta: 0.000:  17%|██████████████████▊                                                                                           | 4092/24000 [08:08<39:26,  8.41it/s][A
iter: 4092; loss: 2.486; loss_rec: 21.174; loss_kl: 120.537; beta: 0.000:  17%|██████████████████▊                                                                                           | 4092/24000 [08:08<39:26,  8.41it/s][A
iter: 4092; loss: 2.486; loss_rec: 21.174; loss_kl: 120.537; beta: 0.000:  17%|██████████████████▊                                                                                           | 4093/24000 [08:08<39:27,  8.41it/s][A
iter: 4093; loss: 2.069; loss_rec: 25.724; loss_kl: 126.398; beta: 0.000:  17%|█

iter: 4126; loss: 2.713; loss_rec: 39.568; loss_kl: 116.524; beta: 0.000:  17%|██████████████████▉                                                                                           | 4126/24000 [08:12<39:38,  8.36it/s][A
iter: 4126; loss: 2.713; loss_rec: 39.568; loss_kl: 116.524; beta: 0.000:  17%|██████████████████▉                                                                                           | 4127/24000 [08:12<39:34,  8.37it/s][A
iter: 4127; loss: 1.659; loss_rec: 11.644; loss_kl: 111.902; beta: 0.000:  17%|██████████████████▉                                                                                           | 4127/24000 [08:12<39:34,  8.37it/s][A
iter: 4127; loss: 1.659; loss_rec: 11.644; loss_kl: 111.902; beta: 0.000:  17%|██████████████████▉                                                                                           | 4128/24000 [08:12<39:20,  8.42it/s][A
iter: 4128; loss: 2.739; loss_rec: 23.558; loss_kl: 120.818; beta: 0.000:  17%|█

iter: 4161; loss: 2.267; loss_rec: 33.171; loss_kl: 117.519; beta: 0.000:  17%|███████████████████                                                                                           | 4161/24000 [08:16<38:58,  8.48it/s][A
iter: 4161; loss: 2.267; loss_rec: 33.171; loss_kl: 117.519; beta: 0.000:  17%|███████████████████                                                                                           | 4162/24000 [08:16<38:53,  8.50it/s][A
iter: 4162; loss: 2.481; loss_rec: 34.740; loss_kl: 123.532; beta: 0.000:  17%|███████████████████                                                                                           | 4162/24000 [08:16<38:53,  8.50it/s][A
iter: 4162; loss: 2.481; loss_rec: 34.740; loss_kl: 123.532; beta: 0.000:  17%|███████████████████                                                                                           | 4163/24000 [08:16<39:09,  8.44it/s][A
iter: 4163; loss: 2.226; loss_rec: 24.484; loss_kl: 133.546; beta: 0.000:  17%|█

iter: 4196; loss: 2.676; loss_rec: 41.518; loss_kl: 122.807; beta: 0.000:  17%|███████████████████▏                                                                                          | 4196/24000 [08:20<39:42,  8.31it/s][A
iter: 4196; loss: 2.676; loss_rec: 41.518; loss_kl: 122.807; beta: 0.000:  17%|███████████████████▏                                                                                          | 4197/24000 [08:20<39:37,  8.33it/s][A
iter: 4197; loss: 2.433; loss_rec: 18.878; loss_kl: 121.266; beta: 0.000:  17%|███████████████████▏                                                                                          | 4197/24000 [08:20<39:37,  8.33it/s][A
iter: 4197; loss: 2.433; loss_rec: 18.878; loss_kl: 121.266; beta: 0.000:  17%|███████████████████▏                                                                                          | 4198/24000 [08:20<39:26,  8.37it/s][A
iter: 4198; loss: 2.668; loss_rec: 39.619; loss_kl: 115.377; beta: 0.000:  17%|█

iter: 4231; loss: 3.020; loss_rec: 47.054; loss_kl: 128.673; beta: 0.000:  18%|███████████████████▍                                                                                          | 4231/24000 [08:24<39:30,  8.34it/s][A
iter: 4231; loss: 3.020; loss_rec: 47.054; loss_kl: 128.673; beta: 0.000:  18%|███████████████████▍                                                                                          | 4232/24000 [08:24<39:27,  8.35it/s][A
iter: 4232; loss: 2.306; loss_rec: 27.676; loss_kl: 129.688; beta: 0.000:  18%|███████████████████▍                                                                                          | 4232/24000 [08:24<39:27,  8.35it/s][A
iter: 4232; loss: 2.306; loss_rec: 27.676; loss_kl: 129.688; beta: 0.000:  18%|███████████████████▍                                                                                          | 4233/24000 [08:24<39:27,  8.35it/s][A
iter: 4233; loss: 2.683; loss_rec: 26.193; loss_kl: 123.249; beta: 0.000:  18%|█

iter: 4266; loss: 3.585; loss_rec: 56.677; loss_kl: 124.214; beta: 0.000:  18%|███████████████████▌                                                                                          | 4266/24000 [08:28<39:59,  8.22it/s][A
iter: 4266; loss: 3.585; loss_rec: 56.677; loss_kl: 124.214; beta: 0.000:  18%|███████████████████▌                                                                                          | 4267/24000 [08:28<39:46,  8.27it/s][A
iter: 4267; loss: 2.998; loss_rec: 36.811; loss_kl: 127.991; beta: 0.000:  18%|███████████████████▌                                                                                          | 4267/24000 [08:28<39:46,  8.27it/s][A
iter: 4267; loss: 2.998; loss_rec: 36.811; loss_kl: 127.991; beta: 0.000:  18%|███████████████████▌                                                                                          | 4268/24000 [08:29<39:26,  8.34it/s][A
iter: 4268; loss: 1.898; loss_rec: 15.181; loss_kl: 115.414; beta: 0.000:  18%|█

iter: 4301; loss: 1.969; loss_rec: 13.376; loss_kl: 121.982; beta: 0.000:  18%|███████████████████▋                                                                                          | 4301/24000 [08:32<38:57,  8.43it/s][A
iter: 4301; loss: 1.969; loss_rec: 13.376; loss_kl: 121.982; beta: 0.000:  18%|███████████████████▋                                                                                          | 4302/24000 [08:33<38:29,  8.53it/s][A
iter: 4302; loss: 1.147; loss_rec: 6.396; loss_kl: 90.067; beta: 0.000:  18%|████████████████████                                                                                            | 4302/24000 [08:33<38:29,  8.53it/s][A
iter: 4302; loss: 1.147; loss_rec: 6.396; loss_kl: 90.067; beta: 0.000:  18%|████████████████████                                                                                            | 4303/24000 [08:33<38:15,  8.58it/s][A
iter: 4303; loss: 1.569; loss_rec: 15.689; loss_kl: 114.630; beta: 0.000:  18%|█

iter: 4336; loss: 2.462; loss_rec: 35.231; loss_kl: 122.011; beta: 0.000:  18%|███████████████████▊                                                                                          | 4336/24000 [08:37<38:17,  8.56it/s][A
iter: 4336; loss: 2.462; loss_rec: 35.231; loss_kl: 122.011; beta: 0.000:  18%|███████████████████▉                                                                                          | 4337/24000 [08:37<38:36,  8.49it/s][A
iter: 4337; loss: 2.198; loss_rec: 22.020; loss_kl: 128.380; beta: 0.000:  18%|███████████████████▉                                                                                          | 4337/24000 [08:37<38:36,  8.49it/s][A
iter: 4337; loss: 2.198; loss_rec: 22.020; loss_kl: 128.380; beta: 0.000:  18%|███████████████████▉                                                                                          | 4338/24000 [08:37<38:23,  8.54it/s][A
iter: 4338; loss: 3.215; loss_rec: 47.060; loss_kl: 117.126; beta: 0.000:  18%|█

iter: 4371; loss: 2.495; loss_rec: 36.902; loss_kl: 124.003; beta: 0.000:  18%|████████████████████                                                                                          | 4371/24000 [08:41<38:38,  8.46it/s][A
iter: 4371; loss: 2.495; loss_rec: 36.902; loss_kl: 124.003; beta: 0.000:  18%|████████████████████                                                                                          | 4372/24000 [08:41<39:03,  8.38it/s][A
iter: 4372; loss: 2.821; loss_rec: 44.541; loss_kl: 124.036; beta: 0.000:  18%|████████████████████                                                                                          | 4372/24000 [08:41<39:03,  8.38it/s][A
iter: 4372; loss: 2.821; loss_rec: 44.541; loss_kl: 124.036; beta: 0.000:  18%|████████████████████                                                                                          | 4373/24000 [08:41<39:38,  8.25it/s][A
iter: 4373; loss: 3.188; loss_rec: 44.458; loss_kl: 127.328; beta: 0.000:  18%|█

iter: 4406; loss: 2.384; loss_rec: 32.841; loss_kl: 124.607; beta: 0.000:  18%|████████████████████▏                                                                                         | 4406/24000 [08:45<39:29,  8.27it/s][A
iter: 4406; loss: 2.384; loss_rec: 32.841; loss_kl: 124.607; beta: 0.000:  18%|████████████████████▏                                                                                         | 4407/24000 [08:45<39:54,  8.18it/s][A
iter: 4407; loss: 1.859; loss_rec: 18.063; loss_kl: 123.522; beta: 0.000:  18%|████████████████████▏                                                                                         | 4407/24000 [08:45<39:54,  8.18it/s][A
iter: 4407; loss: 1.859; loss_rec: 18.063; loss_kl: 123.522; beta: 0.000:  18%|████████████████████▏                                                                                         | 4408/24000 [08:45<39:20,  8.30it/s][A
iter: 4408; loss: 2.450; loss_rec: 26.772; loss_kl: 129.705; beta: 0.000:  18%|█

iter: 4441; loss: 2.529; loss_rec: 31.242; loss_kl: 127.463; beta: 0.000:  19%|████████████████████▎                                                                                         | 4441/24000 [08:49<38:24,  8.49it/s][A
iter: 4441; loss: 2.529; loss_rec: 31.242; loss_kl: 127.463; beta: 0.000:  19%|████████████████████▎                                                                                         | 4442/24000 [08:49<38:55,  8.37it/s][A
iter: 4442; loss: 2.708; loss_rec: 33.096; loss_kl: 129.531; beta: 0.000:  19%|████████████████████▎                                                                                         | 4442/24000 [08:49<38:55,  8.37it/s][A
iter: 4442; loss: 2.708; loss_rec: 33.096; loss_kl: 129.531; beta: 0.000:  19%|████████████████████▎                                                                                         | 4443/24000 [08:49<39:10,  8.32it/s][A
iter: 4443; loss: 2.068; loss_rec: 19.422; loss_kl: 121.569; beta: 0.000:  19%|█

iter: 4476; loss: 3.012; loss_rec: 47.860; loss_kl: 129.688; beta: 0.000:  19%|████████████████████▌                                                                                         | 4476/24000 [08:53<38:53,  8.37it/s][A
iter: 4476; loss: 3.012; loss_rec: 47.860; loss_kl: 129.688; beta: 0.000:  19%|████████████████████▌                                                                                         | 4477/24000 [08:53<39:15,  8.29it/s][A
iter: 4477; loss: 2.499; loss_rec: 30.434; loss_kl: 129.005; beta: 0.000:  19%|████████████████████▌                                                                                         | 4477/24000 [08:53<39:15,  8.29it/s][A
iter: 4477; loss: 2.499; loss_rec: 30.434; loss_kl: 129.005; beta: 0.000:  19%|████████████████████▌                                                                                         | 4478/24000 [08:54<38:50,  8.38it/s][A
iter: 4478; loss: 2.028; loss_rec: 19.078; loss_kl: 122.129; beta: 0.000:  19%|█

iter: 4511; loss: 1.889; loss_rec: 19.442; loss_kl: 127.575; beta: 0.000:  19%|████████████████████▋                                                                                         | 4511/24000 [08:58<38:31,  8.43it/s][A
iter: 4511; loss: 1.889; loss_rec: 19.442; loss_kl: 127.575; beta: 0.000:  19%|████████████████████▋                                                                                         | 4512/24000 [08:58<39:03,  8.31it/s][A
iter: 4512; loss: 2.503; loss_rec: 34.693; loss_kl: 132.243; beta: 0.000:  19%|████████████████████▋                                                                                         | 4512/24000 [08:58<39:03,  8.31it/s][A
iter: 4512; loss: 2.503; loss_rec: 34.693; loss_kl: 132.243; beta: 0.000:  19%|████████████████████▋                                                                                         | 4513/24000 [08:58<39:28,  8.23it/s][A
iter: 4513; loss: 2.153; loss_rec: 25.144; loss_kl: 133.291; beta: 0.000:  19%|█

iter: 4546; loss: 2.157; loss_rec: 23.430; loss_kl: 125.791; beta: 0.000:  19%|████████████████████▊                                                                                         | 4546/24000 [09:02<39:10,  8.28it/s][A
iter: 4546; loss: 2.157; loss_rec: 23.430; loss_kl: 125.791; beta: 0.000:  19%|████████████████████▊                                                                                         | 4547/24000 [09:02<38:40,  8.38it/s][A
iter: 4547; loss: 2.411; loss_rec: 27.359; loss_kl: 123.026; beta: 0.000:  19%|████████████████████▊                                                                                         | 4547/24000 [09:02<38:40,  8.38it/s][A
iter: 4547; loss: 2.411; loss_rec: 27.359; loss_kl: 123.026; beta: 0.000:  19%|████████████████████▊                                                                                         | 4548/24000 [09:02<39:04,  8.30it/s][A
iter: 4548; loss: 0.736; loss_rec: 4.415; loss_kl: 112.678; beta: 0.000:  19%|██

iter: 4581; loss: 2.944; loss_rec: 40.418; loss_kl: 116.773; beta: 0.000:  19%|████████████████████▉                                                                                         | 4581/24000 [09:06<38:45,  8.35it/s][A
iter: 4581; loss: 2.944; loss_rec: 40.418; loss_kl: 116.773; beta: 0.000:  19%|█████████████████████                                                                                         | 4582/24000 [09:06<39:12,  8.25it/s][A
iter: 4582; loss: 1.807; loss_rec: 16.076; loss_kl: 116.542; beta: 0.000:  19%|█████████████████████                                                                                         | 4582/24000 [09:06<39:12,  8.25it/s][A
iter: 4582; loss: 1.807; loss_rec: 16.076; loss_kl: 116.542; beta: 0.000:  19%|█████████████████████                                                                                         | 4583/24000 [09:06<39:06,  8.28it/s][A
iter: 4583; loss: 2.670; loss_rec: 29.336; loss_kl: 121.242; beta: 0.000:  19%|█

iter: 4616; loss: 2.395; loss_rec: 30.124; loss_kl: 129.156; beta: 0.000:  19%|█████████████████████▏                                                                                        | 4616/24000 [09:10<39:11,  8.24it/s][A
iter: 4616; loss: 2.395; loss_rec: 30.124; loss_kl: 129.156; beta: 0.000:  19%|█████████████████████▏                                                                                        | 4617/24000 [09:10<39:16,  8.23it/s][A
iter: 4617; loss: 1.763; loss_rec: 13.720; loss_kl: 115.299; beta: 0.000:  19%|█████████████████████▏                                                                                        | 4617/24000 [09:10<39:16,  8.23it/s][A
iter: 4617; loss: 1.763; loss_rec: 13.720; loss_kl: 115.299; beta: 0.000:  19%|█████████████████████▏                                                                                        | 4618/24000 [09:10<38:32,  8.38it/s][A
iter: 4618; loss: 2.522; loss_rec: 30.041; loss_kl: 119.250; beta: 0.000:  19%|█

iter: 4651; loss: 2.546; loss_rec: 25.911; loss_kl: 124.817; beta: 0.000:  19%|█████████████████████▎                                                                                        | 4651/24000 [09:14<39:22,  8.19it/s][A
iter: 4651; loss: 2.546; loss_rec: 25.911; loss_kl: 124.817; beta: 0.000:  19%|█████████████████████▎                                                                                        | 4652/24000 [09:14<38:37,  8.35it/s][A
iter: 4652; loss: 1.953; loss_rec: 13.592; loss_kl: 128.214; beta: 0.000:  19%|█████████████████████▎                                                                                        | 4652/24000 [09:15<38:37,  8.35it/s][A
iter: 4652; loss: 1.953; loss_rec: 13.592; loss_kl: 128.214; beta: 0.000:  19%|█████████████████████▎                                                                                        | 4653/24000 [09:15<38:19,  8.41it/s][A
iter: 4653; loss: 1.943; loss_rec: 29.351; loss_kl: 123.077; beta: 0.000:  19%|█

iter: 4686; loss: 2.840; loss_rec: 35.625; loss_kl: 134.510; beta: 0.000:  20%|█████████████████████▍                                                                                        | 4686/24000 [09:19<39:08,  8.22it/s][A
iter: 4686; loss: 2.840; loss_rec: 35.625; loss_kl: 134.510; beta: 0.000:  20%|█████████████████████▍                                                                                        | 4687/24000 [09:19<39:08,  8.22it/s][A
iter: 4687; loss: 2.190; loss_rec: 19.622; loss_kl: 121.478; beta: 0.000:  20%|█████████████████████▍                                                                                        | 4687/24000 [09:19<39:08,  8.22it/s][A
iter: 4687; loss: 2.190; loss_rec: 19.622; loss_kl: 121.478; beta: 0.000:  20%|█████████████████████▍                                                                                        | 4688/24000 [09:19<38:43,  8.31it/s][A
iter: 4688; loss: 2.026; loss_rec: 18.062; loss_kl: 115.002; beta: 0.000:  20%|█

iter: 4721; loss: 2.989; loss_rec: 30.578; loss_kl: 133.015; beta: 0.000:  20%|█████████████████████▋                                                                                        | 4721/24000 [09:23<38:18,  8.39it/s][A
iter: 4721; loss: 2.989; loss_rec: 30.578; loss_kl: 133.015; beta: 0.000:  20%|█████████████████████▋                                                                                        | 4722/24000 [09:23<38:20,  8.38it/s][A
iter: 4722; loss: 3.163; loss_rec: 41.925; loss_kl: 125.505; beta: 0.000:  20%|█████████████████████▋                                                                                        | 4722/24000 [09:23<38:20,  8.38it/s][A
iter: 4722; loss: 3.163; loss_rec: 41.925; loss_kl: 125.505; beta: 0.000:  20%|█████████████████████▋                                                                                        | 4723/24000 [09:23<38:30,  8.34it/s][A
iter: 4723; loss: 2.307; loss_rec: 29.304; loss_kl: 116.770; beta: 0.000:  20%|█

iter: 4756; loss: 1.810; loss_rec: 14.798; loss_kl: 127.098; beta: 0.000:  20%|█████████████████████▊                                                                                        | 4756/24000 [09:27<38:50,  8.26it/s][A
iter: 4756; loss: 1.810; loss_rec: 14.798; loss_kl: 127.098; beta: 0.000:  20%|█████████████████████▊                                                                                        | 4757/24000 [09:27<38:58,  8.23it/s][A
iter: 4757; loss: 1.660; loss_rec: 21.581; loss_kl: 121.296; beta: 0.000:  20%|█████████████████████▊                                                                                        | 4757/24000 [09:27<38:58,  8.23it/s][A
iter: 4757; loss: 1.660; loss_rec: 21.581; loss_kl: 121.296; beta: 0.000:  20%|█████████████████████▊                                                                                        | 4758/24000 [09:27<38:59,  8.23it/s][A
iter: 4758; loss: 2.724; loss_rec: 37.609; loss_kl: 136.561; beta: 0.000:  20%|█

iter: 4791; loss: 1.813; loss_rec: 17.636; loss_kl: 123.159; beta: 0.000:  20%|█████████████████████▉                                                                                        | 4791/24000 [09:31<38:39,  8.28it/s][A
iter: 4791; loss: 1.813; loss_rec: 17.636; loss_kl: 123.159; beta: 0.000:  20%|█████████████████████▉                                                                                        | 4792/24000 [09:31<38:19,  8.35it/s][A
iter: 4792; loss: 1.799; loss_rec: 15.506; loss_kl: 131.098; beta: 0.000:  20%|█████████████████████▉                                                                                        | 4792/24000 [09:31<38:19,  8.35it/s][A
iter: 4792; loss: 1.799; loss_rec: 15.506; loss_kl: 131.098; beta: 0.000:  20%|█████████████████████▉                                                                                        | 4793/24000 [09:31<37:58,  8.43it/s][A
iter: 4793; loss: 2.624; loss_rec: 39.207; loss_kl: 134.046; beta: 0.000:  20%|█

iter: 4826; loss: 1.762; loss_rec: 21.416; loss_kl: 117.281; beta: 0.000:  20%|██████████████████████                                                                                        | 4826/24000 [09:35<38:29,  8.30it/s][A
iter: 4826; loss: 1.762; loss_rec: 21.416; loss_kl: 117.281; beta: 0.000:  20%|██████████████████████                                                                                        | 4827/24000 [09:36<38:44,  8.25it/s][A
iter: 4827; loss: 2.201; loss_rec: 21.579; loss_kl: 131.233; beta: 0.000:  20%|██████████████████████                                                                                        | 4827/24000 [09:36<38:44,  8.25it/s][A
iter: 4827; loss: 2.201; loss_rec: 21.579; loss_kl: 131.233; beta: 0.000:  20%|██████████████████████▏                                                                                       | 4828/24000 [09:36<38:31,  8.30it/s][A
iter: 4828; loss: 2.526; loss_rec: 37.925; loss_kl: 119.713; beta: 0.000:  20%|█

iter: 4861; loss: 1.852; loss_rec: 16.668; loss_kl: 130.403; beta: 0.000:  20%|██████████████████████▎                                                                                       | 4861/24000 [09:40<38:29,  8.29it/s][A
iter: 4861; loss: 1.852; loss_rec: 16.668; loss_kl: 130.403; beta: 0.000:  20%|██████████████████████▎                                                                                       | 4862/24000 [09:40<38:13,  8.34it/s][A
iter: 4862; loss: 2.686; loss_rec: 40.518; loss_kl: 131.034; beta: 0.000:  20%|██████████████████████▎                                                                                       | 4862/24000 [09:40<38:13,  8.34it/s][A
iter: 4862; loss: 2.686; loss_rec: 40.518; loss_kl: 131.034; beta: 0.000:  20%|██████████████████████▎                                                                                       | 4863/24000 [09:40<38:11,  8.35it/s][A
iter: 4863; loss: 2.579; loss_rec: 34.736; loss_kl: 136.251; beta: 0.000:  20%|█

iter: 4896; loss: 3.047; loss_rec: 41.687; loss_kl: 132.948; beta: 0.000:  20%|██████████████████████▍                                                                                       | 4896/24000 [09:44<38:13,  8.33it/s][A
iter: 4896; loss: 3.047; loss_rec: 41.687; loss_kl: 132.948; beta: 0.000:  20%|██████████████████████▍                                                                                       | 4897/24000 [09:44<37:55,  8.39it/s][A
iter: 4897; loss: 1.657; loss_rec: 18.225; loss_kl: 119.848; beta: 0.000:  20%|██████████████████████▍                                                                                       | 4897/24000 [09:44<37:55,  8.39it/s][A
iter: 4897; loss: 1.657; loss_rec: 18.225; loss_kl: 119.848; beta: 0.000:  20%|██████████████████████▍                                                                                       | 4898/24000 [09:44<37:22,  8.52it/s][A
iter: 4898; loss: 1.323; loss_rec: 12.942; loss_kl: 123.869; beta: 0.000:  20%|█

iter: 4931; loss: 2.503; loss_rec: 29.444; loss_kl: 119.574; beta: 0.000:  21%|██████████████████████▌                                                                                       | 4931/24000 [09:48<38:10,  8.32it/s][A
iter: 4931; loss: 2.503; loss_rec: 29.444; loss_kl: 119.574; beta: 0.000:  21%|██████████████████████▌                                                                                       | 4932/24000 [09:48<38:26,  8.27it/s][A
iter: 4932; loss: 2.821; loss_rec: 41.106; loss_kl: 127.271; beta: 0.000:  21%|██████████████████████▌                                                                                       | 4932/24000 [09:48<38:26,  8.27it/s][A
iter: 4932; loss: 2.821; loss_rec: 41.106; loss_kl: 127.271; beta: 0.000:  21%|██████████████████████▌                                                                                       | 4933/24000 [09:48<38:21,  8.29it/s][A
iter: 4933; loss: 1.327; loss_rec: 6.633; loss_kl: 108.635; beta: 0.000:  21%|██

iter: 4966; loss: 2.038; loss_rec: 20.385; loss_kl: 128.428; beta: 0.000:  21%|██████████████████████▊                                                                                       | 4966/24000 [09:52<37:42,  8.41it/s][A
iter: 4966; loss: 2.038; loss_rec: 20.385; loss_kl: 128.428; beta: 0.000:  21%|██████████████████████▊                                                                                       | 4967/24000 [09:52<37:11,  8.53it/s][A
iter: 4967; loss: 2.814; loss_rec: 32.841; loss_kl: 132.178; beta: 0.000:  21%|██████████████████████▊                                                                                       | 4967/24000 [09:52<37:11,  8.53it/s][A
iter: 4967; loss: 2.814; loss_rec: 32.841; loss_kl: 132.178; beta: 0.000:  21%|██████████████████████▊                                                                                       | 4968/24000 [09:52<37:05,  8.55it/s][A
iter: 4968; loss: 1.880; loss_rec: 12.053; loss_kl: 121.919; beta: 0.000:  21%|█

iter: 5001; loss: 2.594; loss_rec: 34.417; loss_kl: 125.567; beta: 0.000:  21%|██████████████████████▉                                                                                       | 5001/24000 [09:56<38:34,  8.21it/s][A
iter: 5001; loss: 2.594; loss_rec: 34.417; loss_kl: 125.567; beta: 0.000:  21%|██████████████████████▉                                                                                       | 5002/24000 [09:57<38:41,  8.18it/s][A
iter: 5002; loss: 2.045; loss_rec: 21.033; loss_kl: 113.140; beta: 0.000:  21%|██████████████████████▉                                                                                       | 5002/24000 [09:57<38:41,  8.18it/s][A
iter: 5002; loss: 2.045; loss_rec: 21.033; loss_kl: 113.140; beta: 0.000:  21%|██████████████████████▉                                                                                       | 5003/24000 [09:57<38:44,  8.17it/s][A
iter: 5003; loss: 1.261; loss_rec: 11.124; loss_kl: 126.218; beta: 0.000:  21%|█

iter: 5036; loss: 2.295; loss_rec: 29.296; loss_kl: 125.127; beta: 0.000:  21%|███████████████████████                                                                                       | 5036/24000 [10:01<37:38,  8.40it/s][A
iter: 5036; loss: 2.295; loss_rec: 29.296; loss_kl: 125.127; beta: 0.000:  21%|███████████████████████                                                                                       | 5037/24000 [10:01<37:50,  8.35it/s][A
iter: 5037; loss: 1.521; loss_rec: 9.550; loss_kl: 122.407; beta: 0.000:  21%|███████████████████████▎                                                                                       | 5037/24000 [10:01<37:50,  8.35it/s][A
iter: 5037; loss: 1.521; loss_rec: 9.550; loss_kl: 122.407; beta: 0.000:  21%|███████████████████████▎                                                                                       | 5038/24000 [10:01<38:03,  8.30it/s][A
iter: 5038; loss: 2.871; loss_rec: 38.607; loss_kl: 125.306; beta: 0.000:  21%|█

iter: 5071; loss: 2.223; loss_rec: 23.162; loss_kl: 128.673; beta: 0.000:  21%|███████████████████████▏                                                                                      | 5071/24000 [10:05<37:02,  8.52it/s][A
iter: 5071; loss: 2.223; loss_rec: 23.162; loss_kl: 128.673; beta: 0.000:  21%|███████████████████████▏                                                                                      | 5072/24000 [10:05<36:49,  8.57it/s][A
iter: 5072; loss: 1.930; loss_rec: 18.040; loss_kl: 130.963; beta: 0.000:  21%|███████████████████████▏                                                                                      | 5072/24000 [10:05<36:49,  8.57it/s][A
iter: 5072; loss: 1.930; loss_rec: 18.040; loss_kl: 130.963; beta: 0.000:  21%|███████████████████████▎                                                                                      | 5073/24000 [10:05<36:56,  8.54it/s][A
iter: 5073; loss: 1.441; loss_rec: 11.531; loss_kl: 119.148; beta: 0.000:  21%|█

iter: 5106; loss: 1.705; loss_rec: 22.482; loss_kl: 127.473; beta: 0.000:  21%|███████████████████████▍                                                                                      | 5106/24000 [10:09<36:59,  8.51it/s][A
iter: 5106; loss: 1.705; loss_rec: 22.482; loss_kl: 127.473; beta: 0.000:  21%|███████████████████████▍                                                                                      | 5107/24000 [10:09<37:00,  8.51it/s][A
iter: 5107; loss: 2.146; loss_rec: 23.821; loss_kl: 136.876; beta: 0.000:  21%|███████████████████████▍                                                                                      | 5107/24000 [10:09<37:00,  8.51it/s][A
iter: 5107; loss: 2.146; loss_rec: 23.821; loss_kl: 136.876; beta: 0.000:  21%|███████████████████████▍                                                                                      | 5108/24000 [10:09<36:53,  8.54it/s][A
iter: 5108; loss: 2.229; loss_rec: 33.978; loss_kl: 129.972; beta: 0.000:  21%|█

iter: 5141; loss: 1.858; loss_rec: 11.155; loss_kl: 134.707; beta: 0.000:  21%|███████████████████████▌                                                                                      | 5141/24000 [10:13<37:49,  8.31it/s][A
iter: 5141; loss: 1.858; loss_rec: 11.155; loss_kl: 134.707; beta: 0.000:  21%|███████████████████████▌                                                                                      | 5142/24000 [10:13<38:15,  8.22it/s][A
iter: 5142; loss: 1.537; loss_rec: 9.327; loss_kl: 123.459; beta: 0.000:  21%|███████████████████████▊                                                                                       | 5142/24000 [10:13<38:15,  8.22it/s][A
iter: 5142; loss: 1.537; loss_rec: 9.327; loss_kl: 123.459; beta: 0.000:  21%|███████████████████████▊                                                                                       | 5143/24000 [10:13<38:05,  8.25it/s][A
iter: 5143; loss: 2.215; loss_rec: 21.133; loss_kl: 137.188; beta: 0.000:  21%|█

iter: 5176; loss: 2.539; loss_rec: 32.103; loss_kl: 132.664; beta: 0.000:  22%|███████████████████████▋                                                                                      | 5176/24000 [10:17<38:28,  8.16it/s][A
iter: 5176; loss: 2.539; loss_rec: 32.103; loss_kl: 132.664; beta: 0.000:  22%|███████████████████████▋                                                                                      | 5177/24000 [10:18<38:14,  8.20it/s][A
iter: 5177; loss: 2.527; loss_rec: 35.225; loss_kl: 131.995; beta: 0.000:  22%|███████████████████████▋                                                                                      | 5177/24000 [10:18<38:14,  8.20it/s][A
iter: 5177; loss: 2.527; loss_rec: 35.225; loss_kl: 131.995; beta: 0.000:  22%|███████████████████████▋                                                                                      | 5178/24000 [10:18<38:04,  8.24it/s][A
iter: 5178; loss: 2.430; loss_rec: 35.757; loss_kl: 131.908; beta: 0.000:  22%|█

iter: 5211; loss: 2.103; loss_rec: 22.635; loss_kl: 132.521; beta: 0.000:  22%|███████████████████████▉                                                                                      | 5211/24000 [10:22<37:54,  8.26it/s][A
iter: 5211; loss: 2.103; loss_rec: 22.635; loss_kl: 132.521; beta: 0.000:  22%|███████████████████████▉                                                                                      | 5212/24000 [10:22<37:38,  8.32it/s][A
iter: 5212; loss: 1.919; loss_rec: 20.526; loss_kl: 130.787; beta: 0.000:  22%|███████████████████████▉                                                                                      | 5212/24000 [10:22<37:38,  8.32it/s][A
iter: 5212; loss: 1.919; loss_rec: 20.526; loss_kl: 130.787; beta: 0.000:  22%|███████████████████████▉                                                                                      | 5213/24000 [10:22<37:53,  8.27it/s][A
iter: 5213; loss: 1.988; loss_rec: 20.486; loss_kl: 123.279; beta: 0.000:  22%|█

iter: 5246; loss: 2.152; loss_rec: 24.807; loss_kl: 124.502; beta: 0.000:  22%|████████████████████████                                                                                      | 5246/24000 [10:26<37:38,  8.30it/s][A
iter: 5246; loss: 2.152; loss_rec: 24.807; loss_kl: 124.502; beta: 0.000:  22%|████████████████████████                                                                                      | 5247/24000 [10:26<37:40,  8.30it/s][A
iter: 5247; loss: 2.654; loss_rec: 41.399; loss_kl: 125.049; beta: 0.000:  22%|████████████████████████                                                                                      | 5247/24000 [10:26<37:40,  8.30it/s][A
iter: 5247; loss: 2.654; loss_rec: 41.399; loss_kl: 125.049; beta: 0.000:  22%|████████████████████████                                                                                      | 5248/24000 [10:26<37:39,  8.30it/s][A
iter: 5248; loss: 2.343; loss_rec: 26.374; loss_kl: 125.113; beta: 0.000:  22%|█

iter: 5281; loss: 1.537; loss_rec: 11.630; loss_kl: 126.106; beta: 0.000:  22%|████████████████████████▏                                                                                     | 5281/24000 [10:30<37:37,  8.29it/s][A
iter: 5281; loss: 1.537; loss_rec: 11.630; loss_kl: 126.106; beta: 0.000:  22%|████████████████████████▏                                                                                     | 5282/24000 [10:30<38:03,  8.20it/s][A
iter: 5282; loss: 3.197; loss_rec: 42.145; loss_kl: 125.248; beta: 0.000:  22%|████████████████████████▏                                                                                     | 5282/24000 [10:30<38:03,  8.20it/s][A
iter: 5282; loss: 3.197; loss_rec: 42.145; loss_kl: 125.248; beta: 0.000:  22%|████████████████████████▏                                                                                     | 5283/24000 [10:30<38:20,  8.14it/s][A
iter: 5283; loss: 2.152; loss_rec: 17.220; loss_kl: 121.752; beta: 0.000:  22%|█

iter: 5316; loss: 2.465; loss_rec: 23.967; loss_kl: 122.430; beta: 0.000:  22%|████████████████████████▎                                                                                     | 5316/24000 [10:34<37:47,  8.24it/s][A
iter: 5316; loss: 2.465; loss_rec: 23.967; loss_kl: 122.430; beta: 0.000:  22%|████████████████████████▎                                                                                     | 5317/24000 [10:35<37:19,  8.34it/s][A
iter: 5317; loss: 2.454; loss_rec: 29.216; loss_kl: 123.028; beta: 0.000:  22%|████████████████████████▎                                                                                     | 5317/24000 [10:35<37:19,  8.34it/s][A
iter: 5317; loss: 2.454; loss_rec: 29.216; loss_kl: 123.028; beta: 0.000:  22%|████████████████████████▎                                                                                     | 5318/24000 [10:35<37:53,  8.22it/s][A
iter: 5318; loss: 2.309; loss_rec: 33.781; loss_kl: 128.515; beta: 0.000:  22%|█

iter: 5351; loss: 2.166; loss_rec: 28.882; loss_kl: 130.066; beta: 0.000:  22%|████████████████████████▌                                                                                     | 5351/24000 [10:39<37:48,  8.22it/s][A
iter: 5351; loss: 2.166; loss_rec: 28.882; loss_kl: 130.066; beta: 0.000:  22%|████████████████████████▌                                                                                     | 5352/24000 [10:39<38:12,  8.14it/s][A
iter: 5352; loss: 3.238; loss_rec: 28.948; loss_kl: 135.145; beta: 0.000:  22%|████████████████████████▌                                                                                     | 5352/24000 [10:39<38:12,  8.14it/s][A
iter: 5352; loss: 3.238; loss_rec: 28.948; loss_kl: 135.145; beta: 0.000:  22%|████████████████████████▌                                                                                     | 5353/24000 [10:39<38:22,  8.10it/s][A
iter: 5353; loss: 2.659; loss_rec: 30.958; loss_kl: 140.582; beta: 0.000:  22%|█

iter: 5386; loss: 2.610; loss_rec: 33.262; loss_kl: 125.563; beta: 0.000:  22%|████████████████████████▋                                                                                     | 5386/24000 [10:43<37:32,  8.26it/s][A
iter: 5386; loss: 2.610; loss_rec: 33.262; loss_kl: 125.563; beta: 0.000:  22%|████████████████████████▋                                                                                     | 5387/24000 [10:43<37:43,  8.22it/s][A
iter: 5387; loss: 3.543; loss_rec: 51.454; loss_kl: 125.141; beta: 0.000:  22%|████████████████████████▋                                                                                     | 5387/24000 [10:43<37:43,  8.22it/s][A
iter: 5387; loss: 3.543; loss_rec: 51.454; loss_kl: 125.141; beta: 0.000:  22%|████████████████████████▋                                                                                     | 5388/24000 [10:43<37:44,  8.22it/s][A
iter: 5388; loss: 1.188; loss_rec: 13.806; loss_kl: 123.280; beta: 0.000:  22%|█

iter: 5421; loss: 1.543; loss_rec: 10.295; loss_kl: 125.449; beta: 0.000:  23%|████████████████████████▊                                                                                     | 5421/24000 [10:47<37:41,  8.22it/s][A
iter: 5421; loss: 1.543; loss_rec: 10.295; loss_kl: 125.449; beta: 0.000:  23%|████████████████████████▊                                                                                     | 5422/24000 [10:47<37:37,  8.23it/s][A
iter: 5422; loss: 2.633; loss_rec: 34.753; loss_kl: 130.430; beta: 0.000:  23%|████████████████████████▊                                                                                     | 5422/24000 [10:47<37:37,  8.23it/s][A
iter: 5422; loss: 2.633; loss_rec: 34.753; loss_kl: 130.430; beta: 0.000:  23%|████████████████████████▊                                                                                     | 5423/24000 [10:47<37:32,  8.25it/s][A
iter: 5423; loss: 1.501; loss_rec: 15.010; loss_kl: 128.501; beta: 0.000:  23%|█

iter: 5456; loss: 2.179; loss_rec: 24.539; loss_kl: 124.619; beta: 0.000:  23%|█████████████████████████                                                                                     | 5456/24000 [10:51<38:12,  8.09it/s][A
iter: 5456; loss: 2.179; loss_rec: 24.539; loss_kl: 124.619; beta: 0.000:  23%|█████████████████████████                                                                                     | 5457/24000 [10:51<37:45,  8.19it/s][A
iter: 5457; loss: 2.163; loss_rec: 13.232; loss_kl: 122.584; beta: 0.000:  23%|█████████████████████████                                                                                     | 5457/24000 [10:52<37:45,  8.19it/s][A
iter: 5457; loss: 2.163; loss_rec: 13.232; loss_kl: 122.584; beta: 0.000:  23%|█████████████████████████                                                                                     | 5458/24000 [10:52<37:48,  8.17it/s][A
iter: 5458; loss: 3.052; loss_rec: 49.196; loss_kl: 131.717; beta: 0.000:  23%|█

iter: 5491; loss: 2.579; loss_rec: 33.604; loss_kl: 130.097; beta: 0.000:  23%|█████████████████████████▏                                                                                    | 5491/24000 [10:56<37:24,  8.25it/s][A
iter: 5491; loss: 2.579; loss_rec: 33.604; loss_kl: 130.097; beta: 0.000:  23%|█████████████████████████▏                                                                                    | 5492/24000 [10:56<37:32,  8.21it/s][A
iter: 5492; loss: 2.160; loss_rec: 27.303; loss_kl: 125.956; beta: 0.000:  23%|█████████████████████████▏                                                                                    | 5492/24000 [10:56<37:32,  8.21it/s][A
iter: 5492; loss: 2.160; loss_rec: 27.303; loss_kl: 125.956; beta: 0.000:  23%|█████████████████████████▏                                                                                    | 5493/24000 [10:56<37:34,  8.21it/s][A
iter: 5493; loss: 2.213; loss_rec: 24.478; loss_kl: 126.552; beta: 0.000:  23%|█

iter: 5526; loss: 2.832; loss_rec: 40.631; loss_kl: 137.481; beta: 0.000:  23%|█████████████████████████▎                                                                                    | 5526/24000 [11:00<37:09,  8.29it/s][A
iter: 5526; loss: 2.832; loss_rec: 40.631; loss_kl: 137.481; beta: 0.000:  23%|█████████████████████████▎                                                                                    | 5527/24000 [11:00<37:11,  8.28it/s][A
iter: 5527; loss: 1.977; loss_rec: 23.228; loss_kl: 131.572; beta: 0.000:  23%|█████████████████████████▎                                                                                    | 5527/24000 [11:00<37:11,  8.28it/s][A
iter: 5527; loss: 1.977; loss_rec: 23.228; loss_kl: 131.572; beta: 0.000:  23%|█████████████████████████▎                                                                                    | 5528/24000 [11:00<37:05,  8.30it/s][A
iter: 5528; loss: 2.339; loss_rec: 21.940; loss_kl: 132.135; beta: 0.000:  23%|█

iter: 5561; loss: 1.995; loss_rec: 14.387; loss_kl: 128.260; beta: 0.000:  23%|█████████████████████████▍                                                                                    | 5561/24000 [11:04<36:56,  8.32it/s][A
iter: 5561; loss: 1.995; loss_rec: 14.387; loss_kl: 128.260; beta: 0.000:  23%|█████████████████████████▍                                                                                    | 5562/24000 [11:04<36:51,  8.34it/s][A
iter: 5562; loss: 3.071; loss_rec: 39.786; loss_kl: 132.113; beta: 0.000:  23%|█████████████████████████▍                                                                                    | 5562/24000 [11:04<36:51,  8.34it/s][A
iter: 5562; loss: 3.071; loss_rec: 39.786; loss_kl: 132.113; beta: 0.000:  23%|█████████████████████████▍                                                                                    | 5563/24000 [11:04<36:27,  8.43it/s][A
iter: 5563; loss: 2.660; loss_rec: 26.403; loss_kl: 132.303; beta: 0.000:  23%|█

iter: 5596; loss: 2.058; loss_rec: 26.757; loss_kl: 136.703; beta: 0.000:  23%|█████████████████████████▋                                                                                    | 5596/24000 [11:08<36:20,  8.44it/s][A
iter: 5596; loss: 2.058; loss_rec: 26.757; loss_kl: 136.703; beta: 0.000:  23%|█████████████████████████▋                                                                                    | 5597/24000 [11:08<36:24,  8.42it/s][A
iter: 5597; loss: 1.721; loss_rec: 13.771; loss_kl: 141.142; beta: 0.000:  23%|█████████████████████████▋                                                                                    | 5597/24000 [11:08<36:24,  8.42it/s][A
iter: 5597; loss: 1.721; loss_rec: 13.771; loss_kl: 141.142; beta: 0.000:  23%|█████████████████████████▋                                                                                    | 5598/24000 [11:08<36:55,  8.30it/s][A
iter: 5598; loss: 1.424; loss_rec: 8.541; loss_kl: 130.788; beta: 0.000:  23%|██

iter: 5631; loss: 1.898; loss_rec: 22.373; loss_kl: 128.811; beta: 0.000:  23%|█████████████████████████▊                                                                                    | 5631/24000 [11:12<36:44,  8.33it/s][A
iter: 5631; loss: 1.898; loss_rec: 22.373; loss_kl: 128.811; beta: 0.000:  23%|█████████████████████████▊                                                                                    | 5632/24000 [11:12<36:18,  8.43it/s][A
iter: 5632; loss: 1.810; loss_rec: 16.022; loss_kl: 124.676; beta: 0.000:  23%|█████████████████████████▊                                                                                    | 5632/24000 [11:12<36:18,  8.43it/s][A
iter: 5632; loss: 1.810; loss_rec: 16.022; loss_kl: 124.676; beta: 0.000:  23%|█████████████████████████▊                                                                                    | 5633/24000 [11:13<36:25,  8.40it/s][A
iter: 5633; loss: 2.586; loss_rec: 24.783; loss_kl: 132.357; beta: 0.000:  23%|█

iter: 5666; loss: 2.171; loss_rec: 23.069; loss_kl: 135.295; beta: 0.000:  24%|█████████████████████████▉                                                                                    | 5666/24000 [11:17<36:20,  8.41it/s][A
iter: 5666; loss: 2.171; loss_rec: 23.069; loss_kl: 135.295; beta: 0.000:  24%|█████████████████████████▉                                                                                    | 5667/24000 [11:17<36:21,  8.40it/s][A
iter: 5667; loss: 2.704; loss_rec: 25.749; loss_kl: 137.052; beta: 0.000:  24%|█████████████████████████▉                                                                                    | 5667/24000 [11:17<36:21,  8.40it/s][A
iter: 5667; loss: 2.704; loss_rec: 25.749; loss_kl: 137.052; beta: 0.000:  24%|█████████████████████████▉                                                                                    | 5668/24000 [11:17<36:44,  8.32it/s][A
iter: 5668; loss: 1.824; loss_rec: 19.189; loss_kl: 127.362; beta: 0.000:  24%|█

iter: 5701; loss: 2.238; loss_rec: 28.057; loss_kl: 127.238; beta: 0.000:  24%|██████████████████████████▏                                                                                   | 5701/24000 [11:21<33:09,  9.20it/s][A
iter: 5701; loss: 2.238; loss_rec: 28.057; loss_kl: 127.238; beta: 0.000:  24%|██████████████████████████▏                                                                                   | 5702/24000 [11:21<33:20,  9.15it/s][A
iter: 5702; loss: 2.260; loss_rec: 27.185; loss_kl: 130.262; beta: 0.000:  24%|██████████████████████████▏                                                                                   | 5702/24000 [11:21<33:20,  9.15it/s][A
iter: 5702; loss: 2.260; loss_rec: 27.185; loss_kl: 130.262; beta: 0.000:  24%|██████████████████████████▏                                                                                   | 5703/24000 [11:21<33:22,  9.14it/s][A
iter: 5703; loss: 1.284; loss_rec: 8.991; loss_kl: 131.509; beta: 0.000:  24%|██

iter: 5736; loss: 2.567; loss_rec: 37.481; loss_kl: 136.375; beta: 0.000:  24%|██████████████████████████▎                                                                                   | 5736/24000 [11:24<32:47,  9.28it/s][A
iter: 5736; loss: 2.567; loss_rec: 37.481; loss_kl: 136.375; beta: 0.000:  24%|██████████████████████████▎                                                                                   | 5737/24000 [11:25<33:05,  9.20it/s][A
iter: 5737; loss: 2.811; loss_rec: 39.195; loss_kl: 137.717; beta: 0.000:  24%|██████████████████████████▎                                                                                   | 5737/24000 [11:25<33:05,  9.20it/s][A
iter: 5737; loss: 2.811; loss_rec: 39.195; loss_kl: 137.717; beta: 0.000:  24%|██████████████████████████▎                                                                                   | 5738/24000 [11:25<33:19,  9.13it/s][A
iter: 5738; loss: 2.583; loss_rec: 34.449; loss_kl: 138.913; beta: 0.000:  24%|█

iter: 5771; loss: 2.928; loss_rec: 36.808; loss_kl: 129.629; beta: 0.000:  24%|██████████████████████████▍                                                                                   | 5771/24000 [11:28<32:47,  9.26it/s][A
iter: 5771; loss: 2.928; loss_rec: 36.808; loss_kl: 129.629; beta: 0.000:  24%|██████████████████████████▍                                                                                   | 5772/24000 [11:28<32:59,  9.21it/s][A
iter: 5772; loss: 2.322; loss_rec: 16.720; loss_kl: 134.431; beta: 0.000:  24%|██████████████████████████▍                                                                                   | 5772/24000 [11:28<32:59,  9.21it/s][A
iter: 5772; loss: 2.322; loss_rec: 16.720; loss_kl: 134.431; beta: 0.000:  24%|██████████████████████████▍                                                                                   | 5773/24000 [11:28<32:42,  9.29it/s][A
iter: 5773; loss: 2.051; loss_rec: 18.810; loss_kl: 132.162; beta: 0.000:  24%|█

iter: 5806; loss: 2.071; loss_rec: 18.876; loss_kl: 130.036; beta: 0.000:  24%|██████████████████████████▌                                                                                   | 5806/24000 [11:32<32:38,  9.29it/s][A
iter: 5806; loss: 2.071; loss_rec: 18.876; loss_kl: 130.036; beta: 0.000:  24%|██████████████████████████▌                                                                                   | 5807/24000 [11:32<32:28,  9.34it/s][A
iter: 5807; loss: 2.252; loss_rec: 30.524; loss_kl: 122.568; beta: 0.000:  24%|██████████████████████████▌                                                                                   | 5807/24000 [11:32<32:28,  9.34it/s][A
iter: 5807; loss: 2.252; loss_rec: 30.524; loss_kl: 122.568; beta: 0.000:  24%|██████████████████████████▌                                                                                   | 5808/24000 [11:32<32:46,  9.25it/s][A
iter: 5808; loss: 1.771; loss_rec: 17.688; loss_kl: 122.667; beta: 0.000:  24%|█

iter: 5841; loss: 2.232; loss_rec: 18.525; loss_kl: 120.542; beta: 0.000:  24%|██████████████████████████▊                                                                                   | 5841/24000 [11:36<33:09,  9.13it/s][A
iter: 5841; loss: 2.232; loss_rec: 18.525; loss_kl: 120.542; beta: 0.000:  24%|██████████████████████████▊                                                                                   | 5842/24000 [11:36<32:46,  9.23it/s][A
iter: 5842; loss: 3.794; loss_rec: 51.683; loss_kl: 129.941; beta: 0.000:  24%|██████████████████████████▊                                                                                   | 5842/24000 [11:36<32:46,  9.23it/s][A
iter: 5842; loss: 3.794; loss_rec: 51.683; loss_kl: 129.941; beta: 0.000:  24%|██████████████████████████▊                                                                                   | 5843/24000 [11:36<33:00,  9.17it/s][A
iter: 5843; loss: 1.461; loss_rec: 12.013; loss_kl: 125.420; beta: 0.000:  24%|█

iter: 5876; loss: 2.265; loss_rec: 18.885; loss_kl: 123.952; beta: 0.000:  24%|██████████████████████████▉                                                                                   | 5876/24000 [11:40<33:00,  9.15it/s][A
iter: 5876; loss: 2.265; loss_rec: 18.885; loss_kl: 123.952; beta: 0.000:  24%|██████████████████████████▉                                                                                   | 5877/24000 [11:40<32:41,  9.24it/s][A
iter: 5877; loss: 2.916; loss_rec: 32.329; loss_kl: 130.958; beta: 0.000:  24%|██████████████████████████▉                                                                                   | 5877/24000 [11:40<32:41,  9.24it/s][A
iter: 5877; loss: 2.916; loss_rec: 32.329; loss_kl: 130.958; beta: 0.000:  24%|██████████████████████████▉                                                                                   | 5878/24000 [11:40<32:33,  9.28it/s][A
iter: 5878; loss: 2.797; loss_rec: 39.670; loss_kl: 126.907; beta: 0.000:  24%|█

iter: 5911; loss: 2.987; loss_rec: 45.242; loss_kl: 119.675; beta: 0.000:  25%|███████████████████████████                                                                                   | 5911/24000 [11:43<32:53,  9.17it/s][A
iter: 5911; loss: 2.987; loss_rec: 45.242; loss_kl: 119.675; beta: 0.000:  25%|███████████████████████████                                                                                   | 5912/24000 [11:43<33:04,  9.11it/s][A
iter: 5912; loss: 1.959; loss_rec: 19.955; loss_kl: 123.155; beta: 0.000:  25%|███████████████████████████                                                                                   | 5912/24000 [11:43<33:04,  9.11it/s][A
iter: 5912; loss: 1.959; loss_rec: 19.955; loss_kl: 123.155; beta: 0.000:  25%|███████████████████████████                                                                                   | 5913/24000 [11:44<32:44,  9.21it/s][A
iter: 5913; loss: 2.571; loss_rec: 39.467; loss_kl: 125.630; beta: 0.000:  25%|█

iter: 5946; loss: 1.918; loss_rec: 17.886; loss_kl: 108.054; beta: 0.000:  25%|███████████████████████████▎                                                                                  | 5946/24000 [11:47<32:34,  9.24it/s][A
iter: 5946; loss: 1.918; loss_rec: 17.886; loss_kl: 108.054; beta: 0.000:  25%|███████████████████████████▎                                                                                  | 5947/24000 [11:47<32:22,  9.29it/s][A
iter: 5947; loss: 1.628; loss_rec: 20.135; loss_kl: 115.226; beta: 0.000:  25%|███████████████████████████▎                                                                                  | 5947/24000 [11:47<32:22,  9.29it/s][A
iter: 5947; loss: 1.628; loss_rec: 20.135; loss_kl: 115.226; beta: 0.000:  25%|███████████████████████████▎                                                                                  | 5948/24000 [11:47<33:37,  8.95it/s][A
iter: 5948; loss: 2.197; loss_rec: 25.114; loss_kl: 122.346; beta: 0.000:  25%|█

iter: 5981; loss: 2.554; loss_rec: 34.562; loss_kl: 136.368; beta: 0.000:  25%|███████████████████████████▍                                                                                  | 5981/24000 [11:51<36:03,  8.33it/s][A
iter: 5981; loss: 2.554; loss_rec: 34.562; loss_kl: 136.368; beta: 0.000:  25%|███████████████████████████▍                                                                                  | 5982/24000 [11:51<36:06,  8.32it/s][A
iter: 5982; loss: 1.732; loss_rec: 19.126; loss_kl: 138.143; beta: 0.000:  25%|███████████████████████████▍                                                                                  | 5982/24000 [11:51<36:06,  8.32it/s][A
iter: 5982; loss: 1.732; loss_rec: 19.126; loss_kl: 138.143; beta: 0.000:  25%|███████████████████████████▍                                                                                  | 5983/24000 [11:52<36:07,  8.31it/s][A
iter: 5983; loss: 2.105; loss_rec: 22.379; loss_kl: 125.309; beta: 0.000:  25%|█

iter: 6016; loss: 2.637; loss_rec: 33.204; loss_kl: 136.020; beta: 0.000:  25%|███████████████████████████▌                                                                                  | 6016/24000 [11:56<36:20,  8.25it/s][A
iter: 6016; loss: 2.637; loss_rec: 33.204; loss_kl: 136.020; beta: 0.000:  25%|███████████████████████████▌                                                                                  | 6017/24000 [11:56<36:19,  8.25it/s][A
iter: 6017; loss: 1.696; loss_rec: 17.178; loss_kl: 137.236; beta: 0.000:  25%|███████████████████████████▌                                                                                  | 6017/24000 [11:56<36:19,  8.25it/s][A
iter: 6017; loss: 1.696; loss_rec: 17.178; loss_kl: 137.236; beta: 0.000:  25%|███████████████████████████▌                                                                                  | 6018/24000 [11:56<36:14,  8.27it/s][A
iter: 6018; loss: 2.722; loss_rec: 37.988; loss_kl: 137.433; beta: 0.000:  25%|█

iter: 6051; loss: 1.156; loss_rec: 6.901; loss_kl: 125.857; beta: 0.000:  25%|███████████████████████████▉                                                                                   | 6051/24000 [12:00<36:22,  8.23it/s][A
iter: 6051; loss: 1.156; loss_rec: 6.901; loss_kl: 125.857; beta: 0.000:  25%|███████████████████████████▉                                                                                   | 6052/24000 [12:00<36:15,  8.25it/s][A
iter: 6052; loss: 1.441; loss_rec: 14.097; loss_kl: 122.555; beta: 0.000:  25%|███████████████████████████▋                                                                                  | 6052/24000 [12:00<36:15,  8.25it/s][A
iter: 6052; loss: 1.441; loss_rec: 14.097; loss_kl: 122.555; beta: 0.000:  25%|███████████████████████████▋                                                                                  | 6053/24000 [12:00<36:13,  8.26it/s][A
iter: 6053; loss: 4.048; loss_rec: 60.235; loss_kl: 136.508; beta: 0.000:  25%|█

iter: 6086; loss: 2.295; loss_rec: 15.893; loss_kl: 127.306; beta: 0.000:  25%|███████████████████████████▉                                                                                  | 6086/24000 [12:04<35:52,  8.32it/s][A
iter: 6086; loss: 2.295; loss_rec: 15.893; loss_kl: 127.306; beta: 0.000:  25%|███████████████████████████▉                                                                                  | 6087/24000 [12:04<35:48,  8.34it/s][A
iter: 6087; loss: 1.532; loss_rec: 11.392; loss_kl: 136.520; beta: 0.000:  25%|███████████████████████████▉                                                                                  | 6087/24000 [12:04<35:48,  8.34it/s][A
iter: 6087; loss: 1.532; loss_rec: 11.392; loss_kl: 136.520; beta: 0.000:  25%|███████████████████████████▉                                                                                  | 6088/24000 [12:04<35:51,  8.33it/s][A
iter: 6088; loss: 2.350; loss_rec: 37.599; loss_kl: 135.660; beta: 0.000:  25%|█

iter: 6121; loss: 2.329; loss_rec: 22.747; loss_kl: 134.815; beta: 0.000:  26%|████████████████████████████                                                                                  | 6121/24000 [12:08<36:02,  8.27it/s][A
iter: 6121; loss: 2.329; loss_rec: 22.747; loss_kl: 134.815; beta: 0.000:  26%|████████████████████████████                                                                                  | 6122/24000 [12:08<35:43,  8.34it/s][A
iter: 6122; loss: 1.347; loss_rec: 14.383; loss_kl: 134.707; beta: 0.000:  26%|████████████████████████████                                                                                  | 6122/24000 [12:08<35:43,  8.34it/s][A
iter: 6122; loss: 1.347; loss_rec: 14.383; loss_kl: 134.707; beta: 0.000:  26%|████████████████████████████                                                                                  | 6123/24000 [12:08<35:38,  8.36it/s][A
iter: 6123; loss: 2.511; loss_rec: 36.352; loss_kl: 141.054; beta: 0.000:  26%|█

iter: 6156; loss: 2.363; loss_rec: 24.435; loss_kl: 142.987; beta: 0.000:  26%|████████████████████████████▏                                                                                 | 6156/24000 [12:12<35:49,  8.30it/s][A
iter: 6156; loss: 2.363; loss_rec: 24.435; loss_kl: 142.987; beta: 0.000:  26%|████████████████████████████▏                                                                                 | 6157/24000 [12:13<36:08,  8.23it/s][A
iter: 6157; loss: 1.923; loss_rec: 27.396; loss_kl: 138.581; beta: 0.000:  26%|████████████████████████████▏                                                                                 | 6157/24000 [12:13<36:08,  8.23it/s][A
iter: 6157; loss: 1.923; loss_rec: 27.396; loss_kl: 138.581; beta: 0.000:  26%|████████████████████████████▏                                                                                 | 6158/24000 [12:13<36:14,  8.21it/s][A
iter: 6158; loss: 3.239; loss_rec: 42.112; loss_kl: 136.864; beta: 0.000:  26%|█

iter: 6191; loss: 2.589; loss_rec: 40.682; loss_kl: 126.698; beta: 0.000:  26%|████████████████████████████▍                                                                                 | 6191/24000 [12:17<35:45,  8.30it/s][A
iter: 6191; loss: 2.589; loss_rec: 40.682; loss_kl: 126.698; beta: 0.000:  26%|████████████████████████████▍                                                                                 | 6192/24000 [12:17<35:58,  8.25it/s][A
iter: 6192; loss: 2.585; loss_rec: 27.143; loss_kl: 130.094; beta: 0.000:  26%|████████████████████████████▍                                                                                 | 6192/24000 [12:17<35:58,  8.25it/s][A
iter: 6192; loss: 2.585; loss_rec: 27.143; loss_kl: 130.094; beta: 0.000:  26%|████████████████████████████▍                                                                                 | 6193/24000 [12:17<35:47,  8.29it/s][A
iter: 6193; loss: 1.516; loss_rec: 7.465; loss_kl: 127.011; beta: 0.000:  26%|██

iter: 6226; loss: 2.003; loss_rec: 16.913; loss_kl: 130.102; beta: 0.000:  26%|████████████████████████████▌                                                                                 | 6226/24000 [12:21<35:29,  8.35it/s][A
iter: 6226; loss: 2.003; loss_rec: 16.913; loss_kl: 130.102; beta: 0.000:  26%|████████████████████████████▌                                                                                 | 6227/24000 [12:21<35:31,  8.34it/s][A
iter: 6227; loss: 1.109; loss_rec: 9.691; loss_kl: 124.457; beta: 0.000:  26%|████████████████████████████▊                                                                                  | 6227/24000 [12:21<35:31,  8.34it/s][A
iter: 6227; loss: 1.109; loss_rec: 9.691; loss_kl: 124.457; beta: 0.000:  26%|████████████████████████████▊                                                                                  | 6228/24000 [12:21<35:47,  8.27it/s][A
iter: 6228; loss: 3.039; loss_rec: 32.386; loss_kl: 124.594; beta: 0.000:  26%|█

iter: 6261; loss: 2.568; loss_rec: 30.995; loss_kl: 138.076; beta: 0.000:  26%|████████████████████████████▋                                                                                 | 6261/24000 [12:25<35:17,  8.38it/s][A
iter: 6261; loss: 2.568; loss_rec: 30.995; loss_kl: 138.076; beta: 0.000:  26%|████████████████████████████▋                                                                                 | 6262/24000 [12:25<35:30,  8.32it/s][A
iter: 6262; loss: 1.883; loss_rec: 15.116; loss_kl: 133.258; beta: 0.000:  26%|████████████████████████████▋                                                                                 | 6262/24000 [12:25<35:30,  8.32it/s][A
iter: 6262; loss: 1.883; loss_rec: 15.116; loss_kl: 133.258; beta: 0.000:  26%|████████████████████████████▋                                                                                 | 6263/24000 [12:25<35:36,  8.30it/s][A
iter: 6263; loss: 1.226; loss_rec: 6.766; loss_kl: 125.083; beta: 0.000:  26%|██

iter: 6296; loss: 2.718; loss_rec: 39.535; loss_kl: 131.389; beta: 0.000:  26%|████████████████████████████▊                                                                                 | 6296/24000 [12:29<36:43,  8.03it/s][A
iter: 6296; loss: 2.718; loss_rec: 39.535; loss_kl: 131.389; beta: 0.000:  26%|████████████████████████████▊                                                                                 | 6297/24000 [12:29<36:40,  8.04it/s][A
iter: 6297; loss: 2.413; loss_rec: 17.172; loss_kl: 129.808; beta: 0.000:  26%|████████████████████████████▊                                                                                 | 6297/24000 [12:29<36:40,  8.04it/s][A
iter: 6297; loss: 2.413; loss_rec: 17.172; loss_kl: 129.808; beta: 0.000:  26%|████████████████████████████▊                                                                                 | 6298/24000 [12:29<36:57,  7.98it/s][A
iter: 6298; loss: 1.198; loss_rec: 5.992; loss_kl: 114.633; beta: 0.000:  26%|██

iter: 6331; loss: 1.730; loss_rec: 20.465; loss_kl: 129.344; beta: 0.000:  26%|█████████████████████████████                                                                                 | 6331/24000 [12:33<35:34,  8.28it/s][A
iter: 6331; loss: 1.730; loss_rec: 20.465; loss_kl: 129.344; beta: 0.000:  26%|█████████████████████████████                                                                                 | 6332/24000 [12:33<35:26,  8.31it/s][A
iter: 6332; loss: 1.827; loss_rec: 18.265; loss_kl: 134.511; beta: 0.000:  26%|█████████████████████████████                                                                                 | 6332/24000 [12:33<35:26,  8.31it/s][A
iter: 6332; loss: 1.827; loss_rec: 18.265; loss_kl: 134.511; beta: 0.000:  26%|█████████████████████████████                                                                                 | 6333/24000 [12:34<35:08,  8.38it/s][A
iter: 6333; loss: 1.673; loss_rec: 15.308; loss_kl: 130.548; beta: 0.000:  26%|█

iter: 6366; loss: 2.143; loss_rec: 29.987; loss_kl: 119.443; beta: 0.000:  27%|█████████████████████████████▏                                                                                | 6366/24000 [12:37<34:51,  8.43it/s][A
iter: 6366; loss: 2.143; loss_rec: 29.987; loss_kl: 119.443; beta: 0.000:  27%|█████████████████████████████▏                                                                                | 6367/24000 [12:38<34:47,  8.45it/s][A
iter: 6367; loss: 1.563; loss_rec: 19.278; loss_kl: 136.567; beta: 0.000:  27%|█████████████████████████████▏                                                                                | 6367/24000 [12:38<34:47,  8.45it/s][A
iter: 6367; loss: 1.563; loss_rec: 19.278; loss_kl: 136.567; beta: 0.000:  27%|█████████████████████████████▏                                                                                | 6368/24000 [12:38<35:15,  8.33it/s][A
iter: 6368; loss: 1.101; loss_rec: 7.706; loss_kl: 118.988; beta: 0.000:  27%|██

iter: 6401; loss: 1.548; loss_rec: 15.766; loss_kl: 127.263; beta: 0.000:  27%|█████████████████████████████▎                                                                                | 6401/24000 [12:42<35:25,  8.28it/s][A
iter: 6401; loss: 1.548; loss_rec: 15.766; loss_kl: 127.263; beta: 0.000:  27%|█████████████████████████████▎                                                                                | 6402/24000 [12:42<35:13,  8.32it/s][A
iter: 6402; loss: 2.148; loss_rec: 19.328; loss_kl: 119.594; beta: 0.000:  27%|█████████████████████████████▎                                                                                | 6402/24000 [12:42<35:13,  8.32it/s][A
iter: 6402; loss: 2.148; loss_rec: 19.328; loss_kl: 119.594; beta: 0.000:  27%|█████████████████████████████▎                                                                                | 6403/24000 [12:42<35:14,  8.32it/s][A
iter: 6403; loss: 2.480; loss_rec: 28.948; loss_kl: 125.001; beta: 0.000:  27%|█

iter: 6436; loss: 1.460; loss_rec: 10.667; loss_kl: 134.508; beta: 0.000:  27%|█████████████████████████████▍                                                                                | 6436/24000 [12:46<34:07,  8.58it/s][A
iter: 6436; loss: 1.460; loss_rec: 10.667; loss_kl: 134.508; beta: 0.000:  27%|█████████████████████████████▌                                                                                | 6437/24000 [12:46<35:02,  8.35it/s][A
iter: 6437; loss: 1.835; loss_rec: 17.115; loss_kl: 143.562; beta: 0.000:  27%|█████████████████████████████▌                                                                                | 6437/24000 [12:46<35:02,  8.35it/s][A
iter: 6437; loss: 1.835; loss_rec: 17.115; loss_kl: 143.562; beta: 0.000:  27%|█████████████████████████████▌                                                                                | 6438/24000 [12:46<34:54,  8.39it/s][A
iter: 6438; loss: 2.959; loss_rec: 44.867; loss_kl: 134.828; beta: 0.000:  27%|█

iter: 6471; loss: 2.252; loss_rec: 16.333; loss_kl: 133.291; beta: 0.000:  27%|█████████████████████████████▋                                                                                | 6471/24000 [12:50<34:47,  8.40it/s][A
iter: 6471; loss: 2.252; loss_rec: 16.333; loss_kl: 133.291; beta: 0.000:  27%|█████████████████████████████▋                                                                                | 6472/24000 [12:50<34:53,  8.37it/s][A
iter: 6472; loss: 2.344; loss_rec: 23.626; loss_kl: 138.017; beta: 0.000:  27%|█████████████████████████████▋                                                                                | 6472/24000 [12:50<34:53,  8.37it/s][A
iter: 6472; loss: 2.344; loss_rec: 23.626; loss_kl: 138.017; beta: 0.000:  27%|█████████████████████████████▋                                                                                | 6473/24000 [12:50<35:01,  8.34it/s][A
iter: 6473; loss: 1.875; loss_rec: 15.852; loss_kl: 129.726; beta: 0.000:  27%|█

iter: 6506; loss: 2.560; loss_rec: 32.643; loss_kl: 133.139; beta: 0.000:  27%|█████████████████████████████▊                                                                                | 6506/24000 [12:54<35:07,  8.30it/s][A
iter: 6506; loss: 2.560; loss_rec: 32.643; loss_kl: 133.139; beta: 0.000:  27%|█████████████████████████████▊                                                                                | 6507/24000 [12:54<35:20,  8.25it/s][A
iter: 6507; loss: 2.984; loss_rec: 33.804; loss_kl: 136.969; beta: 0.000:  27%|█████████████████████████████▊                                                                                | 6507/24000 [12:54<35:20,  8.25it/s][A
iter: 6507; loss: 2.984; loss_rec: 33.804; loss_kl: 136.969; beta: 0.000:  27%|█████████████████████████████▊                                                                                | 6508/24000 [12:55<35:35,  8.19it/s][A
iter: 6508; loss: 2.014; loss_rec: 24.408; loss_kl: 131.822; beta: 0.000:  27%|█

iter: 6541; loss: 1.368; loss_rec: 15.471; loss_kl: 127.214; beta: 0.000:  27%|█████████████████████████████▉                                                                                | 6541/24000 [12:59<35:06,  8.29it/s][A
iter: 6541; loss: 1.368; loss_rec: 15.471; loss_kl: 127.214; beta: 0.000:  27%|█████████████████████████████▉                                                                                | 6542/24000 [12:59<35:13,  8.26it/s][A
iter: 6542; loss: 2.558; loss_rec: 37.947; loss_kl: 134.937; beta: 0.000:  27%|█████████████████████████████▉                                                                                | 6542/24000 [12:59<35:13,  8.26it/s][A
iter: 6542; loss: 2.558; loss_rec: 37.947; loss_kl: 134.937; beta: 0.000:  27%|█████████████████████████████▉                                                                                | 6543/24000 [12:59<35:28,  8.20it/s][A
iter: 6543; loss: 1.762; loss_rec: 19.781; loss_kl: 132.366; beta: 0.000:  27%|█

iter: 6576; loss: 1.072; loss_rec: 6.884; loss_kl: 129.338; beta: 0.000:  27%|██████████████████████████████▍                                                                                | 6576/24000 [13:03<35:13,  8.25it/s][A
iter: 6576; loss: 1.072; loss_rec: 6.884; loss_kl: 129.338; beta: 0.000:  27%|██████████████████████████████▍                                                                                | 6577/24000 [13:03<34:56,  8.31it/s][A
iter: 6577; loss: 2.269; loss_rec: 34.363; loss_kl: 123.234; beta: 0.000:  27%|██████████████████████████████▏                                                                               | 6577/24000 [13:03<34:56,  8.31it/s][A
iter: 6577; loss: 2.269; loss_rec: 34.363; loss_kl: 123.234; beta: 0.000:  27%|██████████████████████████████▏                                                                               | 6578/24000 [13:03<34:57,  8.31it/s][A
iter: 6578; loss: 2.495; loss_rec: 29.105; loss_kl: 135.842; beta: 0.000:  27%|█

iter: 6611; loss: 2.499; loss_rec: 31.280; loss_kl: 122.946; beta: 0.000:  28%|██████████████████████████████▎                                                                               | 6611/24000 [13:07<35:02,  8.27it/s][A
iter: 6611; loss: 2.499; loss_rec: 31.280; loss_kl: 122.946; beta: 0.000:  28%|██████████████████████████████▎                                                                               | 6612/24000 [13:07<34:59,  8.28it/s][A
iter: 6612; loss: 2.085; loss_rec: 25.018; loss_kl: 134.360; beta: 0.000:  28%|██████████████████████████████▎                                                                               | 6612/24000 [13:07<34:59,  8.28it/s][A
iter: 6612; loss: 2.085; loss_rec: 25.018; loss_kl: 134.360; beta: 0.000:  28%|██████████████████████████████▎                                                                               | 6613/24000 [13:07<34:54,  8.30it/s][A
iter: 6613; loss: 1.454; loss_rec: 16.914; loss_kl: 127.405; beta: 0.000:  28%|█

iter: 6646; loss: 2.768; loss_rec: 31.683; loss_kl: 135.997; beta: 0.000:  28%|██████████████████████████████▍                                                                               | 6646/24000 [13:11<35:07,  8.23it/s][A
iter: 6646; loss: 2.768; loss_rec: 31.683; loss_kl: 135.997; beta: 0.000:  28%|██████████████████████████████▍                                                                               | 6647/24000 [13:11<34:50,  8.30it/s][A
iter: 6647; loss: 1.196; loss_rec: 14.571; loss_kl: 133.853; beta: 0.000:  28%|██████████████████████████████▍                                                                               | 6647/24000 [13:11<34:50,  8.30it/s][A
iter: 6647; loss: 1.196; loss_rec: 14.571; loss_kl: 133.853; beta: 0.000:  28%|██████████████████████████████▍                                                                               | 6648/24000 [13:12<34:39,  8.34it/s][A
iter: 6648; loss: 2.375; loss_rec: 36.323; loss_kl: 127.177; beta: 0.000:  28%|█

iter: 6681; loss: 1.938; loss_rec: 29.073; loss_kl: 137.865; beta: 0.000:  28%|██████████████████████████████▌                                                                               | 6681/24000 [13:16<34:47,  8.29it/s][A
iter: 6681; loss: 1.938; loss_rec: 29.073; loss_kl: 137.865; beta: 0.000:  28%|██████████████████████████████▋                                                                               | 6682/24000 [13:16<34:51,  8.28it/s][A
iter: 6682; loss: 2.201; loss_rec: 18.295; loss_kl: 137.166; beta: 0.000:  28%|██████████████████████████████▋                                                                               | 6682/24000 [13:16<34:51,  8.28it/s][A
iter: 6682; loss: 2.201; loss_rec: 18.295; loss_kl: 137.166; beta: 0.000:  28%|██████████████████████████████▋                                                                               | 6683/24000 [13:16<34:47,  8.29it/s][A
iter: 6683; loss: 2.348; loss_rec: 18.301; loss_kl: 130.362; beta: 0.000:  28%|█

iter: 6716; loss: 3.087; loss_rec: 38.082; loss_kl: 129.706; beta: 0.000:  28%|██████████████████████████████▊                                                                               | 6716/24000 [13:20<35:07,  8.20it/s][A
iter: 6716; loss: 3.087; loss_rec: 38.082; loss_kl: 129.706; beta: 0.000:  28%|██████████████████████████████▊                                                                               | 6717/24000 [13:20<34:57,  8.24it/s][A
iter: 6717; loss: 2.314; loss_rec: 25.020; loss_kl: 133.890; beta: 0.000:  28%|██████████████████████████████▊                                                                               | 6717/24000 [13:20<34:57,  8.24it/s][A
iter: 6717; loss: 2.314; loss_rec: 25.020; loss_kl: 133.890; beta: 0.000:  28%|██████████████████████████████▊                                                                               | 6718/24000 [13:20<34:47,  8.28it/s][A
iter: 6718; loss: 1.750; loss_rec: 13.682; loss_kl: 124.836; beta: 0.000:  28%|█

iter: 6751; loss: 1.673; loss_rec: 16.636; loss_kl: 122.105; beta: 0.000:  28%|██████████████████████████████▉                                                                               | 6751/24000 [13:24<34:40,  8.29it/s][A
iter: 6751; loss: 1.673; loss_rec: 16.636; loss_kl: 122.105; beta: 0.000:  28%|██████████████████████████████▉                                                                               | 6752/24000 [13:24<34:35,  8.31it/s][A
iter: 6752; loss: 3.139; loss_rec: 38.851; loss_kl: 129.110; beta: 0.000:  28%|██████████████████████████████▉                                                                               | 6752/24000 [13:24<34:35,  8.31it/s][A
iter: 6752; loss: 3.139; loss_rec: 38.851; loss_kl: 129.110; beta: 0.000:  28%|██████████████████████████████▉                                                                               | 6753/24000 [13:24<34:37,  8.30it/s][A
iter: 6753; loss: 1.463; loss_rec: 10.961; loss_kl: 119.241; beta: 0.000:  28%|█

iter: 6786; loss: 1.984; loss_rec: 21.602; loss_kl: 130.338; beta: 0.000:  28%|███████████████████████████████                                                                               | 6786/24000 [13:28<34:17,  8.37it/s][A
iter: 6786; loss: 1.984; loss_rec: 21.602; loss_kl: 130.338; beta: 0.000:  28%|███████████████████████████████                                                                               | 6787/24000 [13:28<33:59,  8.44it/s][A
iter: 6787; loss: 1.955; loss_rec: 17.580; loss_kl: 136.650; beta: 0.000:  28%|███████████████████████████████                                                                               | 6787/24000 [13:28<33:59,  8.44it/s][A
iter: 6787; loss: 1.955; loss_rec: 17.580; loss_kl: 136.650; beta: 0.000:  28%|███████████████████████████████                                                                               | 6788/24000 [13:28<33:57,  8.45it/s][A
iter: 6788; loss: 3.047; loss_rec: 46.565; loss_kl: 136.627; beta: 0.000:  28%|█

iter: 6821; loss: 1.802; loss_rec: 16.140; loss_kl: 127.213; beta: 0.000:  28%|███████████████████████████████▎                                                                              | 6821/24000 [13:32<34:51,  8.21it/s][A
iter: 6821; loss: 1.802; loss_rec: 16.140; loss_kl: 127.213; beta: 0.000:  28%|███████████████████████████████▎                                                                              | 6822/24000 [13:33<34:56,  8.19it/s][A
iter: 6822; loss: 1.239; loss_rec: 10.669; loss_kl: 125.005; beta: 0.000:  28%|███████████████████████████████▎                                                                              | 6822/24000 [13:33<34:56,  8.19it/s][A
iter: 6822; loss: 1.239; loss_rec: 10.669; loss_kl: 125.005; beta: 0.000:  28%|███████████████████████████████▎                                                                              | 6823/24000 [13:33<34:38,  8.26it/s][A
iter: 6823; loss: 2.427; loss_rec: 33.384; loss_kl: 135.446; beta: 0.000:  28%|█

iter: 6856; loss: 2.285; loss_rec: 26.185; loss_kl: 129.105; beta: 0.000:  29%|███████████████████████████████▍                                                                              | 6856/24000 [13:37<34:31,  8.27it/s][A
iter: 6856; loss: 2.285; loss_rec: 26.185; loss_kl: 129.105; beta: 0.000:  29%|███████████████████████████████▍                                                                              | 6857/24000 [13:37<34:22,  8.31it/s][A
iter: 6857; loss: 1.660; loss_rec: 19.828; loss_kl: 130.795; beta: 0.000:  29%|███████████████████████████████▍                                                                              | 6857/24000 [13:37<34:22,  8.31it/s][A
iter: 6857; loss: 1.660; loss_rec: 19.828; loss_kl: 130.795; beta: 0.000:  29%|███████████████████████████████▍                                                                              | 6858/24000 [13:37<34:33,  8.27it/s][A
iter: 6858; loss: 2.304; loss_rec: 19.593; loss_kl: 136.267; beta: 0.000:  29%|█

iter: 6891; loss: 1.752; loss_rec: 16.642; loss_kl: 131.327; beta: 0.000:  29%|███████████████████████████████▌                                                                              | 6891/24000 [13:41<35:43,  7.98it/s][A
iter: 6891; loss: 1.752; loss_rec: 16.642; loss_kl: 131.327; beta: 0.000:  29%|███████████████████████████████▌                                                                              | 6892/24000 [13:41<35:21,  8.06it/s][A
iter: 6892; loss: 1.538; loss_rec: 12.003; loss_kl: 136.673; beta: 0.000:  29%|███████████████████████████████▌                                                                              | 6892/24000 [13:41<35:21,  8.06it/s][A
iter: 6892; loss: 1.538; loss_rec: 12.003; loss_kl: 136.673; beta: 0.000:  29%|███████████████████████████████▌                                                                              | 6893/24000 [13:41<34:56,  8.16it/s][A
iter: 6893; loss: 1.470; loss_rec: 10.414; loss_kl: 130.331; beta: 0.000:  29%|█

iter: 6926; loss: 1.524; loss_rec: 14.777; loss_kl: 136.572; beta: 0.000:  29%|███████████████████████████████▋                                                                              | 6926/24000 [13:45<34:29,  8.25it/s][A
iter: 6926; loss: 1.524; loss_rec: 14.777; loss_kl: 136.572; beta: 0.000:  29%|███████████████████████████████▋                                                                              | 6927/24000 [13:45<34:24,  8.27it/s][A
iter: 6927; loss: 2.445; loss_rec: 25.938; loss_kl: 125.195; beta: 0.000:  29%|███████████████████████████████▋                                                                              | 6927/24000 [13:45<34:24,  8.27it/s][A
iter: 6927; loss: 2.445; loss_rec: 25.938; loss_kl: 125.195; beta: 0.000:  29%|███████████████████████████████▊                                                                              | 6928/24000 [13:45<34:11,  8.32it/s][A
iter: 6928; loss: 1.811; loss_rec: 23.083; loss_kl: 121.571; beta: 0.000:  29%|█

iter: 6961; loss: 1.867; loss_rec: 21.486; loss_kl: 130.987; beta: 0.000:  29%|███████████████████████████████▉                                                                              | 6961/24000 [13:49<34:51,  8.15it/s][A
iter: 6961; loss: 1.867; loss_rec: 21.486; loss_kl: 130.987; beta: 0.000:  29%|███████████████████████████████▉                                                                              | 6962/24000 [13:50<34:38,  8.20it/s][A
iter: 6962; loss: 1.772; loss_rec: 19.285; loss_kl: 125.591; beta: 0.000:  29%|███████████████████████████████▉                                                                              | 6962/24000 [13:50<34:38,  8.20it/s][A
iter: 6962; loss: 1.772; loss_rec: 19.285; loss_kl: 125.591; beta: 0.000:  29%|███████████████████████████████▉                                                                              | 6963/24000 [13:50<34:41,  8.18it/s][A
iter: 6963; loss: 2.845; loss_rec: 44.056; loss_kl: 134.579; beta: 0.000:  29%|█

iter: 6996; loss: 1.674; loss_rec: 14.200; loss_kl: 129.465; beta: 0.000:  29%|████████████████████████████████                                                                              | 6996/24000 [13:54<34:30,  8.21it/s][A
iter: 6996; loss: 1.674; loss_rec: 14.200; loss_kl: 129.465; beta: 0.000:  29%|████████████████████████████████                                                                              | 6997/24000 [13:54<34:53,  8.12it/s][A
iter: 6997; loss: 2.456; loss_rec: 20.945; loss_kl: 124.097; beta: 0.000:  29%|████████████████████████████████                                                                              | 6997/24000 [13:54<34:53,  8.12it/s][A
iter: 6997; loss: 2.456; loss_rec: 20.945; loss_kl: 124.097; beta: 0.000:  29%|████████████████████████████████                                                                              | 6998/24000 [13:54<34:23,  8.24it/s][A
iter: 6998; loss: 2.238; loss_rec: 30.064; loss_kl: 133.495; beta: 0.000:  29%|█

iter: 7031; loss: 2.821; loss_rec: 44.116; loss_kl: 129.562; beta: 0.000:  29%|████████████████████████████████▏                                                                             | 7031/24000 [13:58<34:08,  8.28it/s][A
iter: 7031; loss: 2.821; loss_rec: 44.116; loss_kl: 129.562; beta: 0.000:  29%|████████████████████████████████▏                                                                             | 7032/24000 [13:58<34:14,  8.26it/s][A
iter: 7032; loss: 1.266; loss_rec: 7.177; loss_kl: 122.811; beta: 0.000:  29%|████████████████████████████████▌                                                                              | 7032/24000 [13:58<34:14,  8.26it/s][A
iter: 7032; loss: 1.266; loss_rec: 7.177; loss_kl: 122.811; beta: 0.000:  29%|████████████████████████████████▌                                                                              | 7033/24000 [13:58<34:17,  8.25it/s][A
iter: 7033; loss: 1.759; loss_rec: 15.886; loss_kl: 128.429; beta: 0.000:  29%|█

iter: 7066; loss: 2.238; loss_rec: 28.853; loss_kl: 128.153; beta: 0.000:  29%|████████████████████████████████▍                                                                             | 7066/24000 [14:02<33:28,  8.43it/s][A
iter: 7066; loss: 2.238; loss_rec: 28.853; loss_kl: 128.153; beta: 0.000:  29%|████████████████████████████████▍                                                                             | 7067/24000 [14:02<33:29,  8.43it/s][A
iter: 7067; loss: 1.757; loss_rec: 14.039; loss_kl: 138.748; beta: 0.000:  29%|████████████████████████████████▍                                                                             | 7067/24000 [14:02<33:29,  8.43it/s][A
iter: 7067; loss: 1.757; loss_rec: 14.039; loss_kl: 138.748; beta: 0.000:  29%|████████████████████████████████▍                                                                             | 7068/24000 [14:02<33:50,  8.34it/s][A
iter: 7068; loss: 2.717; loss_rec: 32.628; loss_kl: 129.193; beta: 0.000:  29%|█

iter: 7101; loss: 2.498; loss_rec: 25.080; loss_kl: 130.430; beta: 0.000:  30%|████████████████████████████████▌                                                                             | 7101/24000 [14:06<33:39,  8.37it/s][A
iter: 7101; loss: 2.498; loss_rec: 25.080; loss_kl: 130.430; beta: 0.000:  30%|████████████████████████████████▌                                                                             | 7102/24000 [14:06<33:40,  8.36it/s][A
iter: 7102; loss: 2.178; loss_rec: 19.790; loss_kl: 121.068; beta: 0.000:  30%|████████████████████████████████▌                                                                             | 7102/24000 [14:07<33:40,  8.36it/s][A
iter: 7102; loss: 2.178; loss_rec: 19.790; loss_kl: 121.068; beta: 0.000:  30%|████████████████████████████████▌                                                                             | 7103/24000 [14:07<33:15,  8.47it/s][A
iter: 7103; loss: 1.053; loss_rec: 6.318; loss_kl: 121.737; beta: 0.000:  30%|██

iter: 7136; loss: 1.756; loss_rec: 13.476; loss_kl: 125.655; beta: 0.000:  30%|████████████████████████████████▋                                                                             | 7136/24000 [14:11<33:42,  8.34it/s][A
iter: 7136; loss: 1.756; loss_rec: 13.476; loss_kl: 125.655; beta: 0.000:  30%|████████████████████████████████▋                                                                             | 7137/24000 [14:11<33:43,  8.33it/s][A
iter: 7137; loss: 1.504; loss_rec: 13.797; loss_kl: 112.999; beta: 0.000:  30%|████████████████████████████████▋                                                                             | 7137/24000 [14:11<33:43,  8.33it/s][A
iter: 7137; loss: 1.504; loss_rec: 13.797; loss_kl: 112.999; beta: 0.000:  30%|████████████████████████████████▋                                                                             | 7138/24000 [14:11<33:58,  8.27it/s][A
iter: 7138; loss: 1.797; loss_rec: 19.772; loss_kl: 126.683; beta: 0.000:  30%|█

iter: 7171; loss: 1.728; loss_rec: 18.270; loss_kl: 127.642; beta: 0.000:  30%|████████████████████████████████▊                                                                             | 7171/24000 [14:15<33:33,  8.36it/s][A
iter: 7171; loss: 1.728; loss_rec: 18.270; loss_kl: 127.642; beta: 0.000:  30%|████████████████████████████████▊                                                                             | 7172/24000 [14:15<33:36,  8.35it/s][A
iter: 7172; loss: 2.451; loss_rec: 30.238; loss_kl: 132.387; beta: 0.000:  30%|████████████████████████████████▊                                                                             | 7172/24000 [14:15<33:36,  8.35it/s][A
iter: 7172; loss: 2.451; loss_rec: 30.238; loss_kl: 132.387; beta: 0.000:  30%|████████████████████████████████▉                                                                             | 7173/24000 [14:15<34:03,  8.23it/s][A
iter: 7173; loss: 1.576; loss_rec: 12.612; loss_kl: 126.617; beta: 0.000:  30%|█

iter: 7206; loss: 0.972; loss_rec: 5.753; loss_kl: 115.710; beta: 0.000:  30%|█████████████████████████████████▎                                                                             | 7206/24000 [14:19<34:04,  8.21it/s][A
iter: 7206; loss: 0.972; loss_rec: 5.753; loss_kl: 115.710; beta: 0.000:  30%|█████████████████████████████████▎                                                                             | 7207/24000 [14:19<33:49,  8.27it/s][A
iter: 7207; loss: 1.194; loss_rec: 12.414; loss_kl: 137.929; beta: 0.000:  30%|█████████████████████████████████                                                                             | 7207/24000 [14:19<33:49,  8.27it/s][A
iter: 7207; loss: 1.194; loss_rec: 12.414; loss_kl: 137.929; beta: 0.000:  30%|█████████████████████████████████                                                                             | 7208/24000 [14:19<34:08,  8.20it/s][A
iter: 7208; loss: 1.710; loss_rec: 14.593; loss_kl: 132.211; beta: 0.000:  30%|█

iter: 7241; loss: 2.665; loss_rec: 22.668; loss_kl: 137.974; beta: 0.000:  30%|█████████████████████████████████▏                                                                            | 7241/24000 [14:23<33:44,  8.28it/s][A
iter: 7241; loss: 2.665; loss_rec: 22.668; loss_kl: 137.974; beta: 0.000:  30%|█████████████████████████████████▏                                                                            | 7242/24000 [14:23<33:44,  8.28it/s][A
iter: 7242; loss: 1.369; loss_rec: 11.428; loss_kl: 134.238; beta: 0.000:  30%|█████████████████████████████████▏                                                                            | 7242/24000 [14:23<33:44,  8.28it/s][A
iter: 7242; loss: 1.369; loss_rec: 11.428; loss_kl: 134.238; beta: 0.000:  30%|█████████████████████████████████▏                                                                            | 7243/24000 [14:23<33:42,  8.29it/s][A
iter: 7243; loss: 2.755; loss_rec: 35.223; loss_kl: 135.286; beta: 0.000:  30%|█

iter: 7276; loss: 2.296; loss_rec: 34.672; loss_kl: 134.612; beta: 0.000:  30%|█████████████████████████████████▎                                                                            | 7276/24000 [14:27<33:39,  8.28it/s][A
iter: 7276; loss: 2.296; loss_rec: 34.672; loss_kl: 134.612; beta: 0.000:  30%|█████████████████████████████████▎                                                                            | 7277/24000 [14:28<33:33,  8.31it/s][A
iter: 7277; loss: 1.156; loss_rec: 8.091; loss_kl: 127.825; beta: 0.000:  30%|█████████████████████████████████▋                                                                             | 7277/24000 [14:28<33:33,  8.31it/s][A
iter: 7277; loss: 1.156; loss_rec: 8.091; loss_kl: 127.825; beta: 0.000:  30%|█████████████████████████████████▋                                                                             | 7278/24000 [14:28<33:26,  8.33it/s][A
iter: 7278; loss: 2.435; loss_rec: 25.592; loss_kl: 132.587; beta: 0.000:  30%|█

iter: 7311; loss: 1.962; loss_rec: 16.989; loss_kl: 134.379; beta: 0.000:  30%|█████████████████████████████████▌                                                                            | 7311/24000 [14:32<33:17,  8.36it/s][A
iter: 7311; loss: 1.962; loss_rec: 16.989; loss_kl: 134.379; beta: 0.000:  30%|█████████████████████████████████▌                                                                            | 7312/24000 [14:32<32:44,  8.50it/s][A
iter: 7312; loss: 3.086; loss_rec: 45.155; loss_kl: 128.517; beta: 0.000:  30%|█████████████████████████████████▌                                                                            | 7312/24000 [14:32<32:44,  8.50it/s][A
iter: 7312; loss: 3.086; loss_rec: 45.155; loss_kl: 128.517; beta: 0.000:  30%|█████████████████████████████████▌                                                                            | 7313/24000 [14:32<33:00,  8.43it/s][A
iter: 7313; loss: 2.224; loss_rec: 25.770; loss_kl: 137.409; beta: 0.000:  30%|█

iter: 7346; loss: 2.558; loss_rec: 33.258; loss_kl: 123.194; beta: 0.000:  31%|█████████████████████████████████▋                                                                            | 7346/24000 [14:36<32:59,  8.42it/s][A
iter: 7346; loss: 2.558; loss_rec: 33.258; loss_kl: 123.194; beta: 0.000:  31%|█████████████████████████████████▋                                                                            | 7347/24000 [14:36<33:12,  8.36it/s][A
iter: 7347; loss: 1.933; loss_rec: 20.219; loss_kl: 121.589; beta: 0.000:  31%|█████████████████████████████████▋                                                                            | 7347/24000 [14:36<33:12,  8.36it/s][A
iter: 7347; loss: 1.933; loss_rec: 20.219; loss_kl: 121.589; beta: 0.000:  31%|█████████████████████████████████▋                                                                            | 7348/24000 [14:36<33:27,  8.30it/s][A
iter: 7348; loss: 2.410; loss_rec: 37.710; loss_kl: 119.408; beta: 0.000:  31%|█

iter: 7381; loss: 1.243; loss_rec: 12.260; loss_kl: 131.789; beta: 0.000:  31%|█████████████████████████████████▊                                                                            | 7381/24000 [14:40<33:59,  8.15it/s][A
iter: 7381; loss: 1.243; loss_rec: 12.260; loss_kl: 131.789; beta: 0.000:  31%|█████████████████████████████████▊                                                                            | 7382/24000 [14:40<33:16,  8.32it/s][A
iter: 7382; loss: 1.696; loss_rec: 16.526; loss_kl: 127.396; beta: 0.000:  31%|█████████████████████████████████▊                                                                            | 7382/24000 [14:40<33:16,  8.32it/s][A
iter: 7382; loss: 1.696; loss_rec: 16.526; loss_kl: 127.396; beta: 0.000:  31%|█████████████████████████████████▊                                                                            | 7383/24000 [14:40<33:30,  8.27it/s][A
iter: 7383; loss: 2.833; loss_rec: 33.392; loss_kl: 115.115; beta: 0.000:  31%|█

iter: 7416; loss: 2.474; loss_rec: 35.676; loss_kl: 135.031; beta: 0.000:  31%|█████████████████████████████████▉                                                                            | 7416/24000 [14:44<33:34,  8.23it/s][A
iter: 7416; loss: 2.474; loss_rec: 35.676; loss_kl: 135.031; beta: 0.000:  31%|█████████████████████████████████▉                                                                            | 7417/24000 [14:44<33:59,  8.13it/s][A
iter: 7417; loss: 2.028; loss_rec: 23.411; loss_kl: 130.526; beta: 0.000:  31%|█████████████████████████████████▉                                                                            | 7417/24000 [14:44<33:59,  8.13it/s][A
iter: 7417; loss: 2.028; loss_rec: 23.411; loss_kl: 130.526; beta: 0.000:  31%|█████████████████████████████████▉                                                                            | 7418/24000 [14:45<33:58,  8.14it/s][A
iter: 7418; loss: 1.586; loss_rec: 20.340; loss_kl: 121.843; beta: 0.000:  31%|█

iter: 7451; loss: 1.269; loss_rec: 9.288; loss_kl: 127.558; beta: 0.000:  31%|██████████████████████████████████▍                                                                            | 7451/24000 [14:49<34:03,  8.10it/s][A
iter: 7451; loss: 1.269; loss_rec: 9.288; loss_kl: 127.558; beta: 0.000:  31%|██████████████████████████████████▍                                                                            | 7452/24000 [14:49<33:40,  8.19it/s][A
iter: 7452; loss: 2.028; loss_rec: 26.143; loss_kl: 138.114; beta: 0.000:  31%|██████████████████████████████████▏                                                                           | 7452/24000 [14:49<33:40,  8.19it/s][A
iter: 7452; loss: 2.028; loss_rec: 26.143; loss_kl: 138.114; beta: 0.000:  31%|██████████████████████████████████▏                                                                           | 7453/24000 [14:49<33:55,  8.13it/s][A
iter: 7453; loss: 1.532; loss_rec: 16.612; loss_kl: 143.263; beta: 0.000:  31%|█

iter: 7486; loss: 2.271; loss_rec: 20.653; loss_kl: 136.620; beta: 0.000:  31%|██████████████████████████████████▎                                                                           | 7486/24000 [14:53<33:25,  8.23it/s][A
iter: 7486; loss: 2.271; loss_rec: 20.653; loss_kl: 136.620; beta: 0.000:  31%|██████████████████████████████████▎                                                                           | 7487/24000 [14:53<33:07,  8.31it/s][A
iter: 7487; loss: 2.611; loss_rec: 41.384; loss_kl: 129.050; beta: 0.000:  31%|██████████████████████████████████▎                                                                           | 7487/24000 [14:53<33:07,  8.31it/s][A
iter: 7487; loss: 2.611; loss_rec: 41.384; loss_kl: 129.050; beta: 0.000:  31%|██████████████████████████████████▎                                                                           | 7488/24000 [14:53<32:57,  8.35it/s][A
iter: 7488; loss: 2.590; loss_rec: 36.679; loss_kl: 129.710; beta: 0.000:  31%|█

iter: 7521; loss: 1.996; loss_rec: 25.310; loss_kl: 139.519; beta: 0.000:  31%|██████████████████████████████████▍                                                                           | 7521/24000 [14:57<33:17,  8.25it/s][A
iter: 7521; loss: 1.996; loss_rec: 25.310; loss_kl: 139.519; beta: 0.000:  31%|██████████████████████████████████▍                                                                           | 7522/24000 [14:57<33:55,  8.10it/s][A
iter: 7522; loss: 1.512; loss_rec: 16.043; loss_kl: 129.868; beta: 0.000:  31%|██████████████████████████████████▍                                                                           | 7522/24000 [14:57<33:55,  8.10it/s][A
iter: 7522; loss: 1.512; loss_rec: 16.043; loss_kl: 129.868; beta: 0.000:  31%|██████████████████████████████████▍                                                                           | 7523/24000 [14:57<33:53,  8.10it/s][A
iter: 7523; loss: 2.401; loss_rec: 24.654; loss_kl: 136.912; beta: 0.000:  31%|█

iter: 7556; loss: 2.761; loss_rec: 33.290; loss_kl: 133.659; beta: 0.000:  31%|██████████████████████████████████▋                                                                           | 7556/24000 [15:01<33:40,  8.14it/s][A
iter: 7556; loss: 2.761; loss_rec: 33.290; loss_kl: 133.659; beta: 0.000:  31%|██████████████████████████████████▋                                                                           | 7557/24000 [15:01<33:51,  8.09it/s][A
iter: 7557; loss: 1.392; loss_rec: 8.355; loss_kl: 123.307; beta: 0.000:  31%|██████████████████████████████████▉                                                                            | 7557/24000 [15:02<33:51,  8.09it/s][A
iter: 7557; loss: 1.392; loss_rec: 8.355; loss_kl: 123.307; beta: 0.000:  31%|██████████████████████████████████▉                                                                            | 7558/24000 [15:02<33:11,  8.26it/s][A
iter: 7558; loss: 1.233; loss_rec: 8.270; loss_kl: 127.816; beta: 0.000:  31%|██

iter: 7591; loss: 2.612; loss_rec: 28.321; loss_kl: 133.212; beta: 0.000:  32%|██████████████████████████████████▊                                                                           | 7591/24000 [15:06<33:09,  8.25it/s][A
iter: 7591; loss: 2.612; loss_rec: 28.321; loss_kl: 133.212; beta: 0.000:  32%|██████████████████████████████████▊                                                                           | 7592/24000 [15:06<33:11,  8.24it/s][A
iter: 7592; loss: 2.361; loss_rec: 31.153; loss_kl: 134.583; beta: 0.000:  32%|██████████████████████████████████▊                                                                           | 7592/24000 [15:06<33:11,  8.24it/s][A
iter: 7592; loss: 2.361; loss_rec: 31.153; loss_kl: 134.583; beta: 0.000:  32%|██████████████████████████████████▊                                                                           | 7593/24000 [15:06<33:13,  8.23it/s][A
iter: 7593; loss: 2.256; loss_rec: 31.603; loss_kl: 127.559; beta: 0.000:  32%|█

iter: 7626; loss: 3.230; loss_rec: 43.143; loss_kl: 137.568; beta: 0.000:  32%|██████████████████████████████████▉                                                                           | 7626/24000 [15:10<32:17,  8.45it/s][A
iter: 7626; loss: 3.230; loss_rec: 43.143; loss_kl: 137.568; beta: 0.000:  32%|██████████████████████████████████▉                                                                           | 7627/24000 [15:10<32:20,  8.44it/s][A
iter: 7627; loss: 3.243; loss_rec: 45.127; loss_kl: 127.716; beta: 0.000:  32%|██████████████████████████████████▉                                                                           | 7627/24000 [15:10<32:20,  8.44it/s][A
iter: 7627; loss: 3.243; loss_rec: 45.127; loss_kl: 127.716; beta: 0.000:  32%|██████████████████████████████████▉                                                                           | 7628/24000 [15:10<32:55,  8.29it/s][A
iter: 7628; loss: 2.229; loss_rec: 26.380; loss_kl: 132.639; beta: 0.000:  32%|█

iter: 7661; loss: 2.279; loss_rec: 26.829; loss_kl: 119.224; beta: 0.000:  32%|███████████████████████████████████                                                                           | 7661/24000 [15:14<33:45,  8.07it/s][A
iter: 7661; loss: 2.279; loss_rec: 26.829; loss_kl: 119.224; beta: 0.000:  32%|███████████████████████████████████                                                                           | 7662/24000 [15:14<33:59,  8.01it/s][A
iter: 7662; loss: 0.856; loss_rec: 5.714; loss_kl: 105.631; beta: 0.000:  32%|███████████████████████████████████▍                                                                           | 7662/24000 [15:14<33:59,  8.01it/s][A
iter: 7662; loss: 0.856; loss_rec: 5.714; loss_kl: 105.631; beta: 0.000:  32%|███████████████████████████████████▍                                                                           | 7663/24000 [15:14<33:36,  8.10it/s][A
iter: 7663; loss: 3.033; loss_rec: 43.869; loss_kl: 133.022; beta: 0.000:  32%|█

iter: 7696; loss: 1.026; loss_rec: 8.505; loss_kl: 132.393; beta: 0.000:  32%|███████████████████████████████████▌                                                                           | 7696/24000 [15:18<32:32,  8.35it/s][A
iter: 7696; loss: 1.026; loss_rec: 8.505; loss_kl: 132.393; beta: 0.000:  32%|███████████████████████████████████▌                                                                           | 7697/24000 [15:18<32:06,  8.46it/s][A
iter: 7697; loss: 2.522; loss_rec: 32.065; loss_kl: 134.603; beta: 0.000:  32%|███████████████████████████████████▎                                                                          | 7697/24000 [15:18<32:06,  8.46it/s][A
iter: 7697; loss: 2.522; loss_rec: 32.065; loss_kl: 134.603; beta: 0.000:  32%|███████████████████████████████████▎                                                                          | 7698/24000 [15:19<32:42,  8.31it/s][A
iter: 7698; loss: 1.336; loss_rec: 15.097; loss_kl: 129.555; beta: 0.000:  32%|█

iter: 7731; loss: 1.654; loss_rec: 12.318; loss_kl: 128.920; beta: 0.000:  32%|███████████████████████████████████▍                                                                          | 7731/24000 [15:23<32:30,  8.34it/s][A
iter: 7731; loss: 1.654; loss_rec: 12.318; loss_kl: 128.920; beta: 0.000:  32%|███████████████████████████████████▍                                                                          | 7732/24000 [15:23<32:48,  8.26it/s][A
iter: 7732; loss: 1.730; loss_rec: 11.075; loss_kl: 128.451; beta: 0.000:  32%|███████████████████████████████████▍                                                                          | 7732/24000 [15:23<32:48,  8.26it/s][A
iter: 7732; loss: 1.730; loss_rec: 11.075; loss_kl: 128.451; beta: 0.000:  32%|███████████████████████████████████▍                                                                          | 7733/24000 [15:23<32:39,  8.30it/s][A
iter: 7733; loss: 2.819; loss_rec: 43.544; loss_kl: 141.535; beta: 0.000:  32%|█

iter: 7766; loss: 2.271; loss_rec: 27.149; loss_kl: 138.290; beta: 0.000:  32%|███████████████████████████████████▌                                                                          | 7766/24000 [15:27<32:00,  8.45it/s][A
iter: 7766; loss: 2.271; loss_rec: 27.149; loss_kl: 138.290; beta: 0.000:  32%|███████████████████████████████████▌                                                                          | 7767/24000 [15:27<32:27,  8.34it/s][A
iter: 7767; loss: 2.450; loss_rec: 28.364; loss_kl: 140.175; beta: 0.000:  32%|███████████████████████████████████▌                                                                          | 7767/24000 [15:27<32:27,  8.34it/s][A
iter: 7767; loss: 2.450; loss_rec: 28.364; loss_kl: 140.175; beta: 0.000:  32%|███████████████████████████████████▌                                                                          | 7768/24000 [15:27<33:00,  8.20it/s][A
iter: 7768; loss: 1.370; loss_rec: 9.090; loss_kl: 129.885; beta: 0.000:  32%|██

iter: 7801; loss: 2.195; loss_rec: 27.379; loss_kl: 111.193; beta: 0.000:  33%|███████████████████████████████████▊                                                                          | 7801/24000 [15:31<32:35,  8.28it/s][A
iter: 7801; loss: 2.195; loss_rec: 27.379; loss_kl: 111.193; beta: 0.000:  33%|███████████████████████████████████▊                                                                          | 7802/24000 [15:31<32:30,  8.30it/s][A
iter: 7802; loss: 2.649; loss_rec: 30.231; loss_kl: 133.013; beta: 0.000:  33%|███████████████████████████████████▊                                                                          | 7802/24000 [15:31<32:30,  8.30it/s][A
iter: 7802; loss: 2.649; loss_rec: 30.231; loss_kl: 133.013; beta: 0.000:  33%|███████████████████████████████████▊                                                                          | 7803/24000 [15:31<32:34,  8.29it/s][A
iter: 7803; loss: 2.574; loss_rec: 35.465; loss_kl: 139.685; beta: 0.000:  33%|█

iter: 7836; loss: 2.044; loss_rec: 18.042; loss_kl: 136.249; beta: 0.000:  33%|███████████████████████████████████▉                                                                          | 7836/24000 [15:35<33:08,  8.13it/s][A
iter: 7836; loss: 2.044; loss_rec: 18.042; loss_kl: 136.249; beta: 0.000:  33%|███████████████████████████████████▉                                                                          | 7837/24000 [15:35<33:21,  8.07it/s][A
iter: 7837; loss: 2.686; loss_rec: 41.760; loss_kl: 116.797; beta: 0.000:  33%|███████████████████████████████████▉                                                                          | 7837/24000 [15:35<33:21,  8.07it/s][A
iter: 7837; loss: 2.686; loss_rec: 41.760; loss_kl: 116.797; beta: 0.000:  33%|███████████████████████████████████▉                                                                          | 7838/24000 [15:36<32:52,  8.19it/s][A
iter: 7838; loss: 0.903; loss_rec: 7.878; loss_kl: 120.669; beta: 0.000:  33%|██

iter: 7871; loss: 2.136; loss_rec: 24.977; loss_kl: 142.910; beta: 0.000:  33%|████████████████████████████████████                                                                          | 7871/24000 [15:40<33:01,  8.14it/s][A
iter: 7871; loss: 2.136; loss_rec: 24.977; loss_kl: 142.910; beta: 0.000:  33%|████████████████████████████████████                                                                          | 7872/24000 [15:40<33:10,  8.10it/s][A
iter: 7872; loss: 2.615; loss_rec: 43.226; loss_kl: 128.708; beta: 0.000:  33%|████████████████████████████████████                                                                          | 7872/24000 [15:40<33:10,  8.10it/s][A
iter: 7872; loss: 2.615; loss_rec: 43.226; loss_kl: 128.708; beta: 0.000:  33%|████████████████████████████████████                                                                          | 7873/24000 [15:40<33:14,  8.08it/s][A
iter: 7873; loss: 1.467; loss_rec: 13.207; loss_kl: 130.828; beta: 0.000:  33%|█

iter: 7906; loss: 2.490; loss_rec: 37.110; loss_kl: 146.330; beta: 0.000:  33%|████████████████████████████████████▏                                                                         | 7906/24000 [15:44<33:20,  8.04it/s][A
iter: 7906; loss: 2.490; loss_rec: 37.110; loss_kl: 146.330; beta: 0.000:  33%|████████████████████████████████████▏                                                                         | 7907/24000 [15:44<33:48,  7.94it/s][A
iter: 7907; loss: 1.888; loss_rec: 23.583; loss_kl: 144.184; beta: 0.000:  33%|████████████████████████████████████▏                                                                         | 7907/24000 [15:44<33:48,  7.94it/s][A
iter: 7907; loss: 1.888; loss_rec: 23.583; loss_kl: 144.184; beta: 0.000:  33%|████████████████████████████████████▏                                                                         | 7908/24000 [15:44<34:06,  7.86it/s][A
iter: 7908; loss: 2.860; loss_rec: 36.423; loss_kl: 139.543; beta: 0.000:  33%|█

iter: 7941; loss: 1.526; loss_rec: 13.414; loss_kl: 128.039; beta: 0.000:  33%|████████████████████████████████████▍                                                                         | 7941/24000 [15:48<31:52,  8.40it/s][A
iter: 7941; loss: 1.526; loss_rec: 13.414; loss_kl: 128.039; beta: 0.000:  33%|████████████████████████████████████▍                                                                         | 7942/24000 [15:48<31:47,  8.42it/s][A
iter: 7942; loss: 1.806; loss_rec: 18.057; loss_kl: 121.092; beta: 0.000:  33%|████████████████████████████████████▍                                                                         | 7942/24000 [15:48<31:47,  8.42it/s][A
iter: 7942; loss: 1.806; loss_rec: 18.057; loss_kl: 121.092; beta: 0.000:  33%|████████████████████████████████████▍                                                                         | 7943/24000 [15:48<31:57,  8.37it/s][A
iter: 7943; loss: 2.298; loss_rec: 18.868; loss_kl: 123.562; beta: 0.000:  33%|█

iter: 7976; loss: 2.421; loss_rec: 18.337; loss_kl: 138.184; beta: 0.000:  33%|████████████████████████████████████▌                                                                         | 7976/24000 [15:52<32:21,  8.25it/s][A
iter: 7976; loss: 2.421; loss_rec: 18.337; loss_kl: 138.184; beta: 0.000:  33%|████████████████████████████████████▌                                                                         | 7977/24000 [15:52<32:22,  8.25it/s][A
iter: 7977; loss: 2.954; loss_rec: 32.439; loss_kl: 121.651; beta: 0.000:  33%|████████████████████████████████████▌                                                                         | 7977/24000 [15:52<32:22,  8.25it/s][A
iter: 7977; loss: 2.954; loss_rec: 32.439; loss_kl: 121.651; beta: 0.000:  33%|████████████████████████████████████▌                                                                         | 7978/24000 [15:53<32:03,  8.33it/s][A
iter: 7978; loss: 1.021; loss_rec: 5.103; loss_kl: 130.189; beta: 0.000:  33%|██

iter: 8011; loss: 2.024; loss_rec: 28.372; loss_kl: 130.628; beta: 0.000:  33%|████████████████████████████████████▋                                                                         | 8011/24000 [15:56<29:44,  8.96it/s][A
iter: 8011; loss: 2.024; loss_rec: 28.372; loss_kl: 130.628; beta: 0.000:  33%|████████████████████████████████████▋                                                                         | 8012/24000 [15:57<29:50,  8.93it/s][A
iter: 8012; loss: 1.048; loss_rec: 7.620; loss_kl: 137.345; beta: 0.000:  33%|█████████████████████████████████████                                                                          | 8012/24000 [15:57<29:50,  8.93it/s][A
iter: 8012; loss: 1.048; loss_rec: 7.620; loss_kl: 137.345; beta: 0.000:  33%|█████████████████████████████████████                                                                          | 8013/24000 [15:57<30:10,  8.83it/s][A
iter: 8013; loss: 1.434; loss_rec: 11.476; loss_kl: 123.262; beta: 0.000:  33%|█

iter: 8046; loss: 1.753; loss_rec: 21.128; loss_kl: 137.477; beta: 0.000:  34%|████████████████████████████████████▉                                                                         | 8046/24000 [16:01<31:00,  8.58it/s][A
iter: 8046; loss: 1.753; loss_rec: 21.128; loss_kl: 137.477; beta: 0.000:  34%|████████████████████████████████████▉                                                                         | 8047/24000 [16:01<31:39,  8.40it/s][A
iter: 8047; loss: 1.914; loss_rec: 21.242; loss_kl: 119.546; beta: 0.000:  34%|████████████████████████████████████▉                                                                         | 8047/24000 [16:01<31:39,  8.40it/s][A
iter: 8047; loss: 1.914; loss_rec: 21.242; loss_kl: 119.546; beta: 0.000:  34%|████████████████████████████████████▉                                                                         | 8048/24000 [16:01<31:24,  8.46it/s][A
iter: 8048; loss: 1.186; loss_rec: 11.862; loss_kl: 135.950; beta: 0.000:  34%|█

iter: 8081; loss: 0.988; loss_rec: 6.149; loss_kl: 134.615; beta: 0.000:  34%|█████████████████████████████████████▎                                                                         | 8081/24000 [16:05<30:05,  8.82it/s][A
iter: 8081; loss: 0.988; loss_rec: 6.149; loss_kl: 134.615; beta: 0.000:  34%|█████████████████████████████████████▍                                                                         | 8082/24000 [16:05<29:44,  8.92it/s][A
iter: 8082; loss: 1.654; loss_rec: 20.848; loss_kl: 151.041; beta: 0.000:  34%|█████████████████████████████████████                                                                         | 8082/24000 [16:05<29:44,  8.92it/s][A
iter: 8082; loss: 1.654; loss_rec: 20.848; loss_kl: 151.041; beta: 0.000:  34%|█████████████████████████████████████                                                                         | 8083/24000 [16:05<29:46,  8.91it/s][A
iter: 8083; loss: 2.320; loss_rec: 28.704; loss_kl: 144.255; beta: 0.000:  34%|█

iter: 8116; loss: 3.650; loss_rec: 39.358; loss_kl: 130.720; beta: 0.000:  34%|█████████████████████████████████████▏                                                                        | 8116/24000 [16:09<30:35,  8.65it/s][A
iter: 8116; loss: 3.650; loss_rec: 39.358; loss_kl: 130.720; beta: 0.000:  34%|█████████████████████████████████████▏                                                                        | 8117/24000 [16:09<30:32,  8.67it/s][A
iter: 8117; loss: 2.977; loss_rec: 36.317; loss_kl: 129.621; beta: 0.000:  34%|█████████████████████████████████████▏                                                                        | 8117/24000 [16:09<30:32,  8.67it/s][A
iter: 8117; loss: 2.977; loss_rec: 36.317; loss_kl: 129.621; beta: 0.000:  34%|█████████████████████████████████████▏                                                                        | 8118/24000 [16:09<30:34,  8.66it/s][A
iter: 8118; loss: 2.525; loss_rec: 38.494; loss_kl: 130.514; beta: 0.000:  34%|█

iter: 8151; loss: 0.791; loss_rec: 4.708; loss_kl: 126.743; beta: 0.000:  34%|█████████████████████████████████████▋                                                                         | 8151/24000 [16:13<29:53,  8.84it/s][A
iter: 8151; loss: 0.791; loss_rec: 4.708; loss_kl: 126.743; beta: 0.000:  34%|█████████████████████████████████████▋                                                                         | 8152/24000 [16:13<29:36,  8.92it/s][A
iter: 8152; loss: 2.357; loss_rec: 33.441; loss_kl: 134.012; beta: 0.000:  34%|█████████████████████████████████████▎                                                                        | 8152/24000 [16:13<29:36,  8.92it/s][A
iter: 8152; loss: 2.357; loss_rec: 33.441; loss_kl: 134.012; beta: 0.000:  34%|█████████████████████████████████████▎                                                                        | 8153/24000 [16:13<30:08,  8.76it/s][A
iter: 8153; loss: 2.380; loss_rec: 25.801; loss_kl: 133.951; beta: 0.000:  34%|█

iter: 8186; loss: 2.046; loss_rec: 22.537; loss_kl: 122.628; beta: 0.000:  34%|█████████████████████████████████████▌                                                                        | 8186/24000 [16:17<29:39,  8.89it/s][A
iter: 8186; loss: 2.046; loss_rec: 22.537; loss_kl: 122.628; beta: 0.000:  34%|█████████████████████████████████████▌                                                                        | 8187/24000 [16:17<29:28,  8.94it/s][A
iter: 8187; loss: 1.491; loss_rec: 12.126; loss_kl: 133.885; beta: 0.000:  34%|█████████████████████████████████████▌                                                                        | 8187/24000 [16:17<29:28,  8.94it/s][A
iter: 8187; loss: 1.491; loss_rec: 12.126; loss_kl: 133.885; beta: 0.000:  34%|█████████████████████████████████████▌                                                                        | 8188/24000 [16:17<29:52,  8.82it/s][A
iter: 8188; loss: 2.761; loss_rec: 25.566; loss_kl: 137.123; beta: 0.000:  34%|█

iter: 8221; loss: 1.270; loss_rec: 10.694; loss_kl: 117.056; beta: 0.000:  34%|█████████████████████████████████████▋                                                                        | 8221/24000 [16:21<30:28,  8.63it/s][A
iter: 8221; loss: 1.270; loss_rec: 10.694; loss_kl: 117.056; beta: 0.000:  34%|█████████████████████████████████████▋                                                                        | 8222/24000 [16:21<30:13,  8.70it/s][A
iter: 8222; loss: 1.934; loss_rec: 25.281; loss_kl: 139.916; beta: 0.000:  34%|█████████████████████████████████████▋                                                                        | 8222/24000 [16:21<30:13,  8.70it/s][A
iter: 8222; loss: 1.934; loss_rec: 25.281; loss_kl: 139.916; beta: 0.000:  34%|█████████████████████████████████████▋                                                                        | 8223/24000 [16:21<30:15,  8.69it/s][A
iter: 8223; loss: 2.234; loss_rec: 30.524; loss_kl: 129.109; beta: 0.000:  34%|█

iter: 8256; loss: 1.176; loss_rec: 9.213; loss_kl: 137.253; beta: 0.000:  34%|██████████████████████████████████████▏                                                                        | 8256/24000 [16:25<31:21,  8.37it/s][A
iter: 8256; loss: 1.176; loss_rec: 9.213; loss_kl: 137.253; beta: 0.000:  34%|██████████████████████████████████████▏                                                                        | 8257/24000 [16:25<31:26,  8.34it/s][A
iter: 8257; loss: 1.608; loss_rec: 16.979; loss_kl: 128.792; beta: 0.000:  34%|█████████████████████████████████████▊                                                                        | 8257/24000 [16:25<31:26,  8.34it/s][A
iter: 8257; loss: 1.608; loss_rec: 16.979; loss_kl: 128.792; beta: 0.000:  34%|█████████████████████████████████████▊                                                                        | 8258/24000 [16:25<31:32,  8.32it/s][A
iter: 8258; loss: 2.108; loss_rec: 24.254; loss_kl: 134.926; beta: 0.000:  34%|█

iter: 8291; loss: 0.827; loss_rec: 6.090; loss_kl: 130.570; beta: 0.000:  35%|██████████████████████████████████████▎                                                                        | 8291/24000 [16:29<32:12,  8.13it/s][A
iter: 8291; loss: 0.827; loss_rec: 6.090; loss_kl: 130.570; beta: 0.000:  35%|██████████████████████████████████████▎                                                                        | 8292/24000 [16:29<31:53,  8.21it/s][A
iter: 8292; loss: 2.792; loss_rec: 40.651; loss_kl: 127.259; beta: 0.000:  35%|██████████████████████████████████████                                                                        | 8292/24000 [16:29<31:53,  8.21it/s][A
iter: 8292; loss: 2.792; loss_rec: 40.651; loss_kl: 127.259; beta: 0.000:  35%|██████████████████████████████████████                                                                        | 8293/24000 [16:29<32:01,  8.18it/s][A
iter: 8293; loss: 1.853; loss_rec: 17.791; loss_kl: 134.440; beta: 0.000:  35%|█

iter: 8326; loss: 1.456; loss_rec: 14.473; loss_kl: 146.343; beta: 0.000:  35%|██████████████████████████████████████▏                                                                       | 8326/24000 [16:33<32:16,  8.09it/s][A
iter: 8326; loss: 1.456; loss_rec: 14.473; loss_kl: 146.343; beta: 0.000:  35%|██████████████████████████████████████▏                                                                       | 8327/24000 [16:34<31:52,  8.19it/s][A
iter: 8327; loss: 2.285; loss_rec: 31.595; loss_kl: 128.571; beta: 0.000:  35%|██████████████████████████████████████▏                                                                       | 8327/24000 [16:34<31:52,  8.19it/s][A
iter: 8327; loss: 2.285; loss_rec: 31.595; loss_kl: 128.571; beta: 0.000:  35%|██████████████████████████████████████▏                                                                       | 8328/24000 [16:34<31:49,  8.21it/s][A
iter: 8328; loss: 2.801; loss_rec: 36.665; loss_kl: 144.794; beta: 0.000:  35%|█

iter: 8361; loss: 1.606; loss_rec: 19.969; loss_kl: 126.718; beta: 0.000:  35%|██████████████████████████████████████▎                                                                       | 8361/24000 [16:38<32:06,  8.12it/s][A
iter: 8361; loss: 1.606; loss_rec: 19.969; loss_kl: 126.718; beta: 0.000:  35%|██████████████████████████████████████▎                                                                       | 8362/24000 [16:38<31:55,  8.16it/s][A
iter: 8362; loss: 3.034; loss_rec: 43.716; loss_kl: 125.460; beta: 0.000:  35%|██████████████████████████████████████▎                                                                       | 8362/24000 [16:38<31:55,  8.16it/s][A
iter: 8362; loss: 3.034; loss_rec: 43.716; loss_kl: 125.460; beta: 0.000:  35%|██████████████████████████████████████▎                                                                       | 8363/24000 [16:38<31:45,  8.21it/s][A
iter: 8363; loss: 1.610; loss_rec: 16.102; loss_kl: 141.139; beta: 0.000:  35%|█

iter: 8396; loss: 1.720; loss_rec: 17.006; loss_kl: 134.255; beta: 0.000:  35%|██████████████████████████████████████▍                                                                       | 8396/24000 [16:42<31:46,  8.19it/s][A
iter: 8396; loss: 1.720; loss_rec: 17.006; loss_kl: 134.255; beta: 0.000:  35%|██████████████████████████████████████▍                                                                       | 8397/24000 [16:42<31:53,  8.15it/s][A
iter: 8397; loss: 2.563; loss_rec: 34.815; loss_kl: 127.478; beta: 0.000:  35%|██████████████████████████████████████▍                                                                       | 8397/24000 [16:42<31:53,  8.15it/s][A
iter: 8397; loss: 2.563; loss_rec: 34.815; loss_kl: 127.478; beta: 0.000:  35%|██████████████████████████████████████▍                                                                       | 8398/24000 [16:42<31:41,  8.21it/s][A
iter: 8398; loss: 2.472; loss_rec: 21.389; loss_kl: 134.993; beta: 0.000:  35%|█

iter: 8431; loss: 1.909; loss_rec: 20.256; loss_kl: 119.483; beta: 0.000:  35%|██████████████████████████████████████▋                                                                       | 8431/24000 [16:46<32:02,  8.10it/s][A
iter: 8431; loss: 1.909; loss_rec: 20.256; loss_kl: 119.483; beta: 0.000:  35%|██████████████████████████████████████▋                                                                       | 8432/24000 [16:46<31:51,  8.15it/s][A
iter: 8432; loss: 1.505; loss_rec: 15.050; loss_kl: 126.586; beta: 0.000:  35%|██████████████████████████████████████▋                                                                       | 8432/24000 [16:46<31:51,  8.15it/s][A
iter: 8432; loss: 1.505; loss_rec: 15.050; loss_kl: 126.586; beta: 0.000:  35%|██████████████████████████████████████▋                                                                       | 8433/24000 [16:46<31:47,  8.16it/s][A
iter: 8433; loss: 1.934; loss_rec: 22.880; loss_kl: 123.993; beta: 0.000:  35%|█

iter: 8466; loss: 2.536; loss_rec: 23.885; loss_kl: 140.582; beta: 0.000:  35%|██████████████████████████████████████▊                                                                       | 8466/24000 [16:50<31:36,  8.19it/s][A
iter: 8466; loss: 2.536; loss_rec: 23.885; loss_kl: 140.582; beta: 0.000:  35%|██████████████████████████████████████▊                                                                       | 8467/24000 [16:51<30:52,  8.39it/s][A
iter: 8467; loss: 2.239; loss_rec: 24.020; loss_kl: 123.629; beta: 0.000:  35%|██████████████████████████████████████▊                                                                       | 8467/24000 [16:51<30:52,  8.39it/s][A
iter: 8467; loss: 2.239; loss_rec: 24.020; loss_kl: 123.629; beta: 0.000:  35%|██████████████████████████████████████▊                                                                       | 8468/24000 [16:51<31:10,  8.30it/s][A
iter: 8468; loss: 2.341; loss_rec: 31.875; loss_kl: 112.353; beta: 0.000:  35%|█

iter: 8501; loss: 2.069; loss_rec: 23.075; loss_kl: 142.017; beta: 0.000:  35%|██████████████████████████████████████▉                                                                       | 8501/24000 [16:55<28:32,  9.05it/s][A
iter: 8501; loss: 2.069; loss_rec: 23.075; loss_kl: 142.017; beta: 0.000:  35%|██████████████████████████████████████▉                                                                       | 8502/24000 [16:55<28:14,  9.15it/s][A
iter: 8502; loss: 3.137; loss_rec: 46.529; loss_kl: 138.328; beta: 0.000:  35%|██████████████████████████████████████▉                                                                       | 8502/24000 [16:55<28:14,  9.15it/s][A
iter: 8502; loss: 3.137; loss_rec: 46.529; loss_kl: 138.328; beta: 0.000:  35%|██████████████████████████████████████▉                                                                       | 8503/24000 [16:55<28:22,  9.10it/s][A
iter: 8503; loss: 1.549; loss_rec: 13.020; loss_kl: 141.256; beta: 0.000:  35%|█

iter: 8536; loss: 2.002; loss_rec: 21.748; loss_kl: 140.999; beta: 0.000:  36%|███████████████████████████████████████                                                                       | 8536/24000 [16:58<28:02,  9.19it/s][A
iter: 8536; loss: 2.002; loss_rec: 21.748; loss_kl: 140.999; beta: 0.000:  36%|███████████████████████████████████████▏                                                                      | 8537/24000 [16:58<27:52,  9.25it/s][A
iter: 8537; loss: 1.958; loss_rec: 19.727; loss_kl: 142.315; beta: 0.000:  36%|███████████████████████████████████████▏                                                                      | 8537/24000 [16:59<27:52,  9.25it/s][A
iter: 8537; loss: 1.958; loss_rec: 19.727; loss_kl: 142.315; beta: 0.000:  36%|███████████████████████████████████████▏                                                                      | 8538/24000 [16:59<27:45,  9.29it/s][A
iter: 8538; loss: 1.921; loss_rec: 16.961; loss_kl: 145.441; beta: 0.000:  36%|█

iter: 8571; loss: 2.537; loss_rec: 40.062; loss_kl: 146.217; beta: 0.000:  36%|███████████████████████████████████████▎                                                                      | 8571/24000 [17:02<27:44,  9.27it/s][A
iter: 8571; loss: 2.537; loss_rec: 40.062; loss_kl: 146.217; beta: 0.000:  36%|███████████████████████████████████████▎                                                                      | 8572/24000 [17:02<28:01,  9.17it/s][A
iter: 8572; loss: 2.564; loss_rec: 25.076; loss_kl: 142.868; beta: 0.000:  36%|███████████████████████████████████████▎                                                                      | 8572/24000 [17:02<28:01,  9.17it/s][A
iter: 8572; loss: 2.564; loss_rec: 25.076; loss_kl: 142.868; beta: 0.000:  36%|███████████████████████████████████████▎                                                                      | 8573/24000 [17:02<27:45,  9.26it/s][A
iter: 8573; loss: 2.768; loss_rec: 42.127; loss_kl: 141.972; beta: 0.000:  36%|█

iter: 9604; loss: 1.679; loss_rec: 16.612; loss_kl: 132.965; beta: 0.000:  40%|████████████████████████████████████████████                                                                  | 9605/24000 [19:03<27:38,  8.68it/s][A
iter: 9605; loss: 2.479; loss_rec: 38.843; loss_kl: 139.171; beta: 0.000:  40%|████████████████████████████████████████████                                                                  | 9605/24000 [19:03<27:38,  8.68it/s][A
iter: 9605; loss: 2.479; loss_rec: 38.843; loss_kl: 139.171; beta: 0.000:  40%|████████████████████████████████████████████                                                                  | 9606/24000 [19:03<27:39,  8.67it/s][A
iter: 9606; loss: 1.606; loss_rec: 18.260; loss_kl: 144.381; beta: 0.000:  40%|████████████████████████████████████████████                                                                  | 9606/24000 [19:03<27:39,  8.67it/s][A
iter: 9606; loss: 1.606; loss_rec: 18.260; loss_kl: 144.381; beta: 0.000:  40%|█

iter: 9639; loss: 2.048; loss_rec: 22.620; loss_kl: 146.031; beta: 0.000:  40%|████████████████████████████████████████████▏                                                                 | 9640/24000 [19:07<27:16,  8.78it/s][A
iter: 9640; loss: 2.645; loss_rec: 35.962; loss_kl: 143.903; beta: 0.000:  40%|████████████████████████████████████████████▏                                                                 | 9640/24000 [19:07<27:16,  8.78it/s][A
iter: 9640; loss: 2.645; loss_rec: 35.962; loss_kl: 143.903; beta: 0.000:  40%|████████████████████████████████████████████▏                                                                 | 9641/24000 [19:07<27:12,  8.79it/s][A
iter: 9641; loss: 2.148; loss_rec: 29.313; loss_kl: 137.729; beta: 0.000:  40%|████████████████████████████████████████████▏                                                                 | 9641/24000 [19:07<27:12,  8.79it/s][A
iter: 9641; loss: 2.148; loss_rec: 29.313; loss_kl: 137.729; beta: 0.000:  40%|█

iter: 9674; loss: 1.342; loss_rec: 8.050; loss_kl: 141.960; beta: 0.000:  40%|████████████████████████████████████████████▋                                                                  | 9675/24000 [19:11<26:36,  8.98it/s][A
iter: 9675; loss: 1.655; loss_rec: 18.835; loss_kl: 130.583; beta: 0.000:  40%|████████████████████████████████████████████▎                                                                 | 9675/24000 [19:11<26:36,  8.98it/s][A
iter: 9675; loss: 1.655; loss_rec: 18.835; loss_kl: 130.583; beta: 0.000:  40%|████████████████████████████████████████████▎                                                                 | 9676/24000 [19:11<26:39,  8.95it/s][A
iter: 9676; loss: 3.234; loss_rec: 44.915; loss_kl: 135.861; beta: 0.000:  40%|████████████████████████████████████████████▎                                                                 | 9676/24000 [19:11<26:39,  8.95it/s][A
iter: 9676; loss: 3.234; loss_rec: 44.915; loss_kl: 135.861; beta: 0.000:  40%|█

iter: 9709; loss: 1.852; loss_rec: 16.047; loss_kl: 146.004; beta: 0.000:  40%|████████████████████████████████████████████▌                                                                 | 9710/24000 [19:15<27:47,  8.57it/s][A
iter: 9710; loss: 1.019; loss_rec: 9.751; loss_kl: 135.826; beta: 0.000:  40%|████████████████████████████████████████████▉                                                                  | 9710/24000 [19:15<27:47,  8.57it/s][A
iter: 9710; loss: 1.019; loss_rec: 9.751; loss_kl: 135.826; beta: 0.000:  40%|████████████████████████████████████████████▉                                                                  | 9711/24000 [19:15<27:55,  8.53it/s][A
iter: 9711; loss: 1.111; loss_rec: 9.270; loss_kl: 128.345; beta: 0.000:  40%|████████████████████████████████████████████▉                                                                  | 9711/24000 [19:15<27:55,  8.53it/s][A
iter: 9711; loss: 1.111; loss_rec: 9.270; loss_kl: 128.345; beta: 0.000:  40%|██

iter: 9744; loss: 2.030; loss_rec: 20.580; loss_kl: 133.135; beta: 0.000:  41%|████████████████████████████████████████████▋                                                                 | 9745/24000 [19:19<27:50,  8.53it/s][A
iter: 9745; loss: 1.458; loss_rec: 11.093; loss_kl: 142.285; beta: 0.000:  41%|████████████████████████████████████████████▋                                                                 | 9745/24000 [19:19<27:50,  8.53it/s][A
iter: 9745; loss: 1.458; loss_rec: 11.093; loss_kl: 142.285; beta: 0.000:  41%|████████████████████████████████████████████▋                                                                 | 9746/24000 [19:19<27:32,  8.62it/s][A
iter: 9746; loss: 1.804; loss_rec: 16.774; loss_kl: 134.942; beta: 0.000:  41%|████████████████████████████████████████████▋                                                                 | 9746/24000 [19:19<27:32,  8.62it/s][A
iter: 9746; loss: 1.804; loss_rec: 16.774; loss_kl: 134.942; beta: 0.000:  41%|█

iter: 9779; loss: 2.076; loss_rec: 22.876; loss_kl: 149.361; beta: 0.000:  41%|████████████████████████████████████████████▊                                                                 | 9780/24000 [19:23<28:01,  8.46it/s][A
iter: 9780; loss: 2.648; loss_rec: 39.686; loss_kl: 146.240; beta: 0.000:  41%|████████████████████████████████████████████▊                                                                 | 9780/24000 [19:23<28:01,  8.46it/s][A
iter: 9780; loss: 2.648; loss_rec: 39.686; loss_kl: 146.240; beta: 0.000:  41%|████████████████████████████████████████████▊                                                                 | 9781/24000 [19:23<28:13,  8.40it/s][A
iter: 9781; loss: 2.146; loss_rec: 21.811; loss_kl: 152.560; beta: 0.000:  41%|████████████████████████████████████████████▊                                                                 | 9781/24000 [19:23<28:13,  8.40it/s][A
iter: 9781; loss: 2.146; loss_rec: 21.811; loss_kl: 152.560; beta: 0.000:  41%|█

iter: 9814; loss: 2.045; loss_rec: 26.806; loss_kl: 145.762; beta: 0.000:  41%|████████████████████████████████████████████▉                                                                 | 9815/24000 [19:27<28:09,  8.39it/s][A
iter: 9815; loss: 2.116; loss_rec: 26.689; loss_kl: 148.307; beta: 0.000:  41%|████████████████████████████████████████████▉                                                                 | 9815/24000 [19:27<28:09,  8.39it/s][A
iter: 9815; loss: 2.116; loss_rec: 26.689; loss_kl: 148.307; beta: 0.000:  41%|████████████████████████████████████████████▉                                                                 | 9816/24000 [19:27<28:14,  8.37it/s][A
iter: 9816; loss: 1.952; loss_rec: 22.661; loss_kl: 149.722; beta: 0.000:  41%|████████████████████████████████████████████▉                                                                 | 9816/24000 [19:27<28:14,  8.37it/s][A
iter: 9816; loss: 1.952; loss_rec: 22.661; loss_kl: 149.722; beta: 0.000:  41%|█

iter: 9849; loss: 1.399; loss_rec: 14.657; loss_kl: 141.023; beta: 0.000:  41%|█████████████████████████████████████████████▏                                                                | 9850/24000 [19:31<27:45,  8.50it/s][A
iter: 9850; loss: 2.111; loss_rec: 27.006; loss_kl: 138.998; beta: 0.000:  41%|█████████████████████████████████████████████▏                                                                | 9850/24000 [19:31<27:45,  8.50it/s][A
iter: 9850; loss: 2.111; loss_rec: 27.006; loss_kl: 138.998; beta: 0.000:  41%|█████████████████████████████████████████████▏                                                                | 9851/24000 [19:31<27:53,  8.45it/s][A
iter: 9851; loss: 2.300; loss_rec: 30.008; loss_kl: 141.611; beta: 0.000:  41%|█████████████████████████████████████████████▏                                                                | 9851/24000 [19:31<27:53,  8.45it/s][A
iter: 9851; loss: 2.300; loss_rec: 30.008; loss_kl: 141.611; beta: 0.000:  41%|█

iter: 9884; loss: 2.989; loss_rec: 32.220; loss_kl: 133.088; beta: 0.000:  41%|█████████████████████████████████████████████▎                                                                | 9885/24000 [19:35<27:34,  8.53it/s][A
iter: 9885; loss: 1.831; loss_rec: 21.971; loss_kl: 125.500; beta: 0.000:  41%|█████████████████████████████████████████████▎                                                                | 9885/24000 [19:35<27:34,  8.53it/s][A
iter: 9885; loss: 1.831; loss_rec: 21.971; loss_kl: 125.500; beta: 0.000:  41%|█████████████████████████████████████████████▎                                                                | 9886/24000 [19:36<27:40,  8.50it/s][A
iter: 9886; loss: 2.268; loss_rec: 32.853; loss_kl: 137.979; beta: 0.000:  41%|█████████████████████████████████████████████▎                                                                | 9886/24000 [19:36<27:40,  8.50it/s][A
iter: 9886; loss: 2.268; loss_rec: 32.853; loss_kl: 137.979; beta: 0.000:  41%|█

iter: 9919; loss: 1.875; loss_rec: 23.670; loss_kl: 146.794; beta: 0.000:  41%|█████████████████████████████████████████████▍                                                                | 9920/24000 [19:40<27:23,  8.57it/s][A
iter: 9920; loss: 3.087; loss_rec: 41.021; loss_kl: 140.950; beta: 0.000:  41%|█████████████████████████████████████████████▍                                                                | 9920/24000 [19:40<27:23,  8.57it/s][A
iter: 9920; loss: 3.087; loss_rec: 41.021; loss_kl: 140.950; beta: 0.000:  41%|█████████████████████████████████████████████▍                                                                | 9921/24000 [19:40<27:42,  8.47it/s][A
iter: 9921; loss: 1.682; loss_rec: 23.318; loss_kl: 150.930; beta: 0.000:  41%|█████████████████████████████████████████████▍                                                                | 9921/24000 [19:40<27:42,  8.47it/s][A
iter: 9921; loss: 1.682; loss_rec: 23.318; loss_kl: 150.930; beta: 0.000:  41%|█

iter: 9954; loss: 2.657; loss_rec: 35.828; loss_kl: 146.657; beta: 0.000:  41%|█████████████████████████████████████████████▋                                                                | 9955/24000 [19:44<27:58,  8.37it/s][A
iter: 9955; loss: 2.481; loss_rec: 29.930; loss_kl: 141.500; beta: 0.000:  41%|█████████████████████████████████████████████▋                                                                | 9955/24000 [19:44<27:58,  8.37it/s][A
iter: 9955; loss: 2.481; loss_rec: 29.930; loss_kl: 141.500; beta: 0.000:  41%|█████████████████████████████████████████████▋                                                                | 9956/24000 [19:44<27:37,  8.47it/s][A
iter: 9956; loss: 2.392; loss_rec: 32.827; loss_kl: 146.950; beta: 0.000:  41%|█████████████████████████████████████████████▋                                                                | 9956/24000 [19:44<27:37,  8.47it/s][A
iter: 9956; loss: 2.392; loss_rec: 32.827; loss_kl: 146.950; beta: 0.000:  41%|█

iter: 9989; loss: 2.390; loss_rec: 29.688; loss_kl: 136.790; beta: 0.000:  42%|█████████████████████████████████████████████▊                                                                | 9990/24000 [19:48<27:23,  8.53it/s][A
iter: 9990; loss: 1.824; loss_rec: 18.770; loss_kl: 136.509; beta: 0.000:  42%|█████████████████████████████████████████████▊                                                                | 9990/24000 [19:48<27:23,  8.53it/s][A
iter: 9990; loss: 1.824; loss_rec: 18.770; loss_kl: 136.509; beta: 0.000:  42%|█████████████████████████████████████████████▊                                                                | 9991/24000 [19:48<27:30,  8.49it/s][A
iter: 9991; loss: 1.730; loss_rec: 20.579; loss_kl: 140.209; beta: 0.000:  42%|█████████████████████████████████████████████▊                                                                | 9991/24000 [19:48<27:30,  8.49it/s][A
iter: 9991; loss: 1.730; loss_rec: 20.579; loss_kl: 140.209; beta: 0.000:  42%|█

iter: 10024; loss: 2.888; loss_rec: 43.827; loss_kl: 150.775; beta: 0.000:  42%|█████████████████████████████████████████████                                                               | 10024/24000 [19:53<27:29,  8.47it/s][A
iter: 10024; loss: 2.888; loss_rec: 43.827; loss_kl: 150.775; beta: 0.000:  42%|█████████████████████████████████████████████                                                               | 10025/24000 [19:53<27:15,  8.54it/s][A
iter: 10025; loss: 2.306; loss_rec: 27.680; loss_kl: 142.221; beta: 0.000:  42%|█████████████████████████████████████████████                                                               | 10025/24000 [19:53<27:15,  8.54it/s][A
iter: 10025; loss: 2.306; loss_rec: 27.680; loss_kl: 142.221; beta: 0.000:  42%|█████████████████████████████████████████████                                                               | 10026/24000 [19:53<27:43,  8.40it/s][A
iter: 10026; loss: 3.020; loss_rec: 45.813; loss_kl: 141.701; beta: 0.000:  42%|

iter: 10059; loss: 2.062; loss_rec: 18.552; loss_kl: 139.511; beta: 0.000:  42%|█████████████████████████████████████████████▎                                                              | 10059/24000 [19:57<27:25,  8.47it/s][A
iter: 10059; loss: 2.062; loss_rec: 18.552; loss_kl: 139.511; beta: 0.000:  42%|█████████████████████████████████████████████▎                                                              | 10060/24000 [19:57<27:29,  8.45it/s][A
iter: 10060; loss: 1.816; loss_rec: 21.657; loss_kl: 154.055; beta: 0.000:  42%|█████████████████████████████████████████████▎                                                              | 10060/24000 [19:57<27:29,  8.45it/s][A
iter: 10060; loss: 1.816; loss_rec: 21.657; loss_kl: 154.055; beta: 0.000:  42%|█████████████████████████████████████████████▎                                                              | 10061/24000 [19:57<27:32,  8.44it/s][A
iter: 10061; loss: 2.476; loss_rec: 35.268; loss_kl: 143.309; beta: 0.000:  42%|

iter: 12661; loss: 2.899; loss_rec: 41.720; loss_kl: 157.336; beta: 0.000:  53%|████████████████████████████████████████████████████████▉                                                   | 12661/24000 [25:09<22:15,  8.49it/s][A
iter: 12661; loss: 2.899; loss_rec: 41.720; loss_kl: 157.336; beta: 0.000:  53%|████████████████████████████████████████████████████████▉                                                   | 12662/24000 [25:10<22:28,  8.41it/s][A
iter: 12662; loss: 2.437; loss_rec: 25.657; loss_kl: 158.469; beta: 0.000:  53%|████████████████████████████████████████████████████████▉                                                   | 12662/24000 [25:10<22:28,  8.41it/s][A
iter: 12662; loss: 2.437; loss_rec: 25.657; loss_kl: 158.469; beta: 0.000:  53%|████████████████████████████████████████████████████████▉                                                   | 12663/24000 [25:10<22:53,  8.26it/s][A
iter: 12663; loss: 2.267; loss_rec: 33.360; loss_kl: 164.228; beta: 0.000:  53%|

iter: 12696; loss: 3.092; loss_rec: 39.358; loss_kl: 171.315; beta: 0.000:  53%|█████████████████████████████████████████████████████████▏                                                  | 12696/24000 [25:14<22:30,  8.37it/s][A
iter: 12696; loss: 3.092; loss_rec: 39.358; loss_kl: 171.315; beta: 0.000:  53%|█████████████████████████████████████████████████████████▏                                                  | 12697/24000 [25:14<22:24,  8.41it/s][A
iter: 12697; loss: 2.555; loss_rec: 39.570; loss_kl: 167.548; beta: 0.000:  53%|█████████████████████████████████████████████████████████▏                                                  | 12697/24000 [25:14<22:24,  8.41it/s][A
iter: 12697; loss: 2.555; loss_rec: 39.570; loss_kl: 167.548; beta: 0.000:  53%|█████████████████████████████████████████████████████████▏                                                  | 12698/24000 [25:14<22:35,  8.34it/s][A
iter: 12698; loss: 2.233; loss_rec: 33.883; loss_kl: 164.918; beta: 0.000:  53%|

iter: 12731; loss: 2.851; loss_rec: 36.791; loss_kl: 168.283; beta: 0.000:  53%|█████████████████████████████████████████████████████████▎                                                  | 12731/24000 [25:18<22:41,  8.27it/s][A
iter: 12731; loss: 2.851; loss_rec: 36.791; loss_kl: 168.283; beta: 0.000:  53%|█████████████████████████████████████████████████████████▎                                                  | 12732/24000 [25:18<22:41,  8.27it/s][A
iter: 12732; loss: 1.363; loss_rec: 15.413; loss_kl: 171.803; beta: 0.000:  53%|█████████████████████████████████████████████████████████▎                                                  | 12732/24000 [25:18<22:41,  8.27it/s][A
iter: 12732; loss: 1.363; loss_rec: 15.413; loss_kl: 171.803; beta: 0.000:  53%|█████████████████████████████████████████████████████████▎                                                  | 12733/24000 [25:18<22:49,  8.23it/s][A
iter: 12733; loss: 1.505; loss_rec: 15.561; loss_kl: 155.988; beta: 0.000:  53%|

iter: 12766; loss: 2.317; loss_rec: 22.707; loss_kl: 165.597; beta: 0.000:  53%|█████████████████████████████████████████████████████████▍                                                  | 12766/24000 [25:22<22:51,  8.19it/s][A
iter: 12766; loss: 2.317; loss_rec: 22.707; loss_kl: 165.597; beta: 0.000:  53%|█████████████████████████████████████████████████████████▍                                                  | 12767/24000 [25:22<23:11,  8.07it/s][A
iter: 12767; loss: 1.592; loss_rec: 16.892; loss_kl: 164.660; beta: 0.000:  53%|█████████████████████████████████████████████████████████▍                                                  | 12767/24000 [25:22<23:11,  8.07it/s][A
iter: 12767; loss: 1.592; loss_rec: 16.892; loss_kl: 164.660; beta: 0.000:  53%|█████████████████████████████████████████████████████████▍                                                  | 12768/24000 [25:22<23:00,  8.14it/s][A
iter: 12768; loss: 1.328; loss_rec: 13.557; loss_kl: 159.529; beta: 0.000:  53%|

iter: 12801; loss: 2.040; loss_rec: 24.095; loss_kl: 151.942; beta: 0.000:  53%|█████████████████████████████████████████████████████████▌                                                  | 12801/24000 [25:26<22:31,  8.29it/s][A
iter: 12801; loss: 2.040; loss_rec: 24.095; loss_kl: 151.942; beta: 0.000:  53%|█████████████████████████████████████████████████████████▌                                                  | 12802/24000 [25:26<22:29,  8.30it/s][A
iter: 12802; loss: 2.410; loss_rec: 29.706; loss_kl: 168.146; beta: 0.000:  53%|█████████████████████████████████████████████████████████▌                                                  | 12802/24000 [25:26<22:29,  8.30it/s][A
iter: 12802; loss: 2.410; loss_rec: 29.706; loss_kl: 168.146; beta: 0.000:  53%|█████████████████████████████████████████████████████████▌                                                  | 12803/24000 [25:27<22:25,  8.32it/s][A
iter: 12803; loss: 1.142; loss_rec: 7.867; loss_kl: 128.026; beta: 0.000:  53%|█

iter: 12836; loss: 2.260; loss_rec: 35.510; loss_kl: 163.211; beta: 0.000:  53%|█████████████████████████████████████████████████████████▊                                                  | 12836/24000 [25:30<22:04,  8.43it/s][A
iter: 12836; loss: 2.260; loss_rec: 35.510; loss_kl: 163.211; beta: 0.000:  53%|█████████████████████████████████████████████████████████▊                                                  | 12837/24000 [25:31<22:17,  8.35it/s][A
iter: 12837; loss: 1.365; loss_rec: 16.129; loss_kl: 161.709; beta: 0.000:  53%|█████████████████████████████████████████████████████████▊                                                  | 12837/24000 [25:31<22:17,  8.35it/s][A
iter: 12837; loss: 1.365; loss_rec: 16.129; loss_kl: 161.709; beta: 0.000:  53%|█████████████████████████████████████████████████████████▊                                                  | 12838/24000 [25:31<22:30,  8.26it/s][A
iter: 12838; loss: 2.114; loss_rec: 22.903; loss_kl: 165.889; beta: 0.000:  53%|

iter: 12871; loss: 1.987; loss_rec: 18.600; loss_kl: 165.617; beta: 0.000:  54%|█████████████████████████████████████████████████████████▉                                                  | 12871/24000 [25:35<22:36,  8.20it/s][A
iter: 12871; loss: 1.987; loss_rec: 18.600; loss_kl: 165.617; beta: 0.000:  54%|█████████████████████████████████████████████████████████▉                                                  | 12872/24000 [25:35<22:27,  8.26it/s][A
iter: 12872; loss: 2.838; loss_rec: 36.546; loss_kl: 155.201; beta: 0.000:  54%|█████████████████████████████████████████████████████████▉                                                  | 12872/24000 [25:35<22:27,  8.26it/s][A
iter: 12872; loss: 2.838; loss_rec: 36.546; loss_kl: 155.201; beta: 0.000:  54%|█████████████████████████████████████████████████████████▉                                                  | 12873/24000 [25:35<22:28,  8.25it/s][A
iter: 12873; loss: 2.111; loss_rec: 22.435; loss_kl: 159.626; beta: 0.000:  54%|

iter: 12906; loss: 1.636; loss_rec: 16.193; loss_kl: 149.473; beta: 0.000:  54%|██████████████████████████████████████████████████████████                                                  | 12906/24000 [25:39<22:18,  8.29it/s][A
iter: 12906; loss: 1.636; loss_rec: 16.193; loss_kl: 149.473; beta: 0.000:  54%|██████████████████████████████████████████████████████████                                                  | 12907/24000 [25:39<22:12,  8.33it/s][A
iter: 12907; loss: 1.647; loss_rec: 13.442; loss_kl: 145.750; beta: 0.000:  54%|██████████████████████████████████████████████████████████                                                  | 12907/24000 [25:39<22:12,  8.33it/s][A
iter: 12907; loss: 1.647; loss_rec: 13.442; loss_kl: 145.750; beta: 0.000:  54%|██████████████████████████████████████████████████████████                                                  | 12908/24000 [25:39<22:11,  8.33it/s][A
iter: 12908; loss: 2.193; loss_rec: 32.988; loss_kl: 147.175; beta: 0.000:  54%|

iter: 12941; loss: 2.604; loss_rec: 34.296; loss_kl: 143.510; beta: 0.000:  54%|██████████████████████████████████████████████████████████▏                                                 | 12941/24000 [25:43<22:17,  8.27it/s][A
iter: 12941; loss: 2.604; loss_rec: 34.296; loss_kl: 143.510; beta: 0.000:  54%|██████████████████████████████████████████████████████████▏                                                 | 12942/24000 [25:43<22:14,  8.29it/s][A
iter: 12942; loss: 1.722; loss_rec: 20.087; loss_kl: 140.056; beta: 0.000:  54%|██████████████████████████████████████████████████████████▏                                                 | 12942/24000 [25:43<22:14,  8.29it/s][A
iter: 12942; loss: 1.722; loss_rec: 20.087; loss_kl: 140.056; beta: 0.000:  54%|██████████████████████████████████████████████████████████▏                                                 | 12943/24000 [25:43<22:09,  8.32it/s][A
iter: 12943; loss: 1.051; loss_rec: 5.255; loss_kl: 137.262; beta: 0.000:  54%|█

iter: 12976; loss: 1.999; loss_rec: 27.156; loss_kl: 145.282; beta: 0.000:  54%|██████████████████████████████████████████████████████████▍                                                 | 12976/24000 [25:47<22:09,  8.29it/s][A
iter: 12976; loss: 1.999; loss_rec: 27.156; loss_kl: 145.282; beta: 0.000:  54%|██████████████████████████████████████████████████████████▍                                                 | 12977/24000 [25:47<22:11,  8.28it/s][A
iter: 12977; loss: 2.747; loss_rec: 39.218; loss_kl: 148.401; beta: 0.000:  54%|██████████████████████████████████████████████████████████▍                                                 | 12977/24000 [25:47<22:11,  8.28it/s][A
iter: 12977; loss: 2.747; loss_rec: 39.218; loss_kl: 148.401; beta: 0.000:  54%|██████████████████████████████████████████████████████████▍                                                 | 12978/24000 [25:48<22:21,  8.22it/s][A
iter: 12978; loss: 1.679; loss_rec: 14.667; loss_kl: 145.593; beta: 0.000:  54%|

iter: 13011; loss: 2.097; loss_rec: 27.961; loss_kl: 140.583; beta: 0.000:  54%|██████████████████████████████████████████████████████████▌                                                 | 13011/24000 [25:52<22:17,  8.22it/s][A
iter: 13011; loss: 2.097; loss_rec: 27.961; loss_kl: 140.583; beta: 0.000:  54%|██████████████████████████████████████████████████████████▌                                                 | 13012/24000 [25:52<22:12,  8.25it/s][A
iter: 13012; loss: 2.172; loss_rec: 24.874; loss_kl: 158.463; beta: 0.000:  54%|██████████████████████████████████████████████████████████▌                                                 | 13012/24000 [25:52<22:12,  8.25it/s][A
iter: 13012; loss: 2.172; loss_rec: 24.874; loss_kl: 158.463; beta: 0.000:  54%|██████████████████████████████████████████████████████████▌                                                 | 13013/24000 [25:52<22:06,  8.29it/s][A
iter: 13013; loss: 2.204; loss_rec: 32.393; loss_kl: 167.826; beta: 0.000:  54%|

iter: 13046; loss: 2.221; loss_rec: 22.421; loss_kl: 152.982; beta: 0.000:  54%|██████████████████████████████████████████████████████████▋                                                 | 13046/24000 [25:56<22:14,  8.21it/s][A
iter: 13046; loss: 2.221; loss_rec: 22.421; loss_kl: 152.982; beta: 0.000:  54%|██████████████████████████████████████████████████████████▋                                                 | 13047/24000 [25:56<22:20,  8.17it/s][A
iter: 13047; loss: 2.698; loss_rec: 41.267; loss_kl: 142.766; beta: 0.000:  54%|██████████████████████████████████████████████████████████▋                                                 | 13047/24000 [25:56<22:20,  8.17it/s][A
iter: 13047; loss: 2.698; loss_rec: 41.267; loss_kl: 142.766; beta: 0.000:  54%|██████████████████████████████████████████████████████████▋                                                 | 13048/24000 [25:56<22:44,  8.02it/s][A
iter: 13048; loss: 1.072; loss_rec: 9.266; loss_kl: 151.342; beta: 0.000:  54%|█

iter: 13081; loss: 2.214; loss_rec: 19.363; loss_kl: 162.145; beta: 0.000:  55%|██████████████████████████████████████████████████████████▊                                                 | 13081/24000 [26:00<21:51,  8.32it/s][A
iter: 13081; loss: 2.214; loss_rec: 19.363; loss_kl: 162.145; beta: 0.000:  55%|██████████████████████████████████████████████████████████▊                                                 | 13082/24000 [26:00<22:04,  8.25it/s][A
iter: 13082; loss: 2.264; loss_rec: 31.858; loss_kl: 151.739; beta: 0.000:  55%|██████████████████████████████████████████████████████████▊                                                 | 13082/24000 [26:00<22:04,  8.25it/s][A
iter: 13082; loss: 2.264; loss_rec: 31.858; loss_kl: 151.739; beta: 0.000:  55%|██████████████████████████████████████████████████████████▊                                                 | 13083/24000 [26:00<22:01,  8.26it/s][A
iter: 13083; loss: 2.274; loss_rec: 32.235; loss_kl: 142.965; beta: 0.000:  55%|

iter: 13116; loss: 2.318; loss_rec: 31.747; loss_kl: 167.522; beta: 0.000:  55%|███████████████████████████████████████████████████████████                                                 | 13116/24000 [26:04<22:01,  8.24it/s][A
iter: 13116; loss: 2.318; loss_rec: 31.747; loss_kl: 167.522; beta: 0.000:  55%|███████████████████████████████████████████████████████████                                                 | 13117/24000 [26:04<21:58,  8.25it/s][A
iter: 13117; loss: 2.139; loss_rec: 21.131; loss_kl: 175.759; beta: 0.000:  55%|███████████████████████████████████████████████████████████                                                 | 13117/24000 [26:04<21:58,  8.25it/s][A
iter: 13117; loss: 2.139; loss_rec: 21.131; loss_kl: 175.759; beta: 0.000:  55%|███████████████████████████████████████████████████████████                                                 | 13118/24000 [26:04<21:50,  8.31it/s][A
iter: 13118; loss: 2.149; loss_rec: 18.626; loss_kl: 160.020; beta: 0.000:  55%|

iter: 13151; loss: 1.391; loss_rec: 10.353; loss_kl: 154.534; beta: 0.000:  55%|███████████████████████████████████████████████████████████▏                                                | 13151/24000 [26:08<21:45,  8.31it/s][A
iter: 13151; loss: 1.391; loss_rec: 10.353; loss_kl: 154.534; beta: 0.000:  55%|███████████████████████████████████████████████████████████▏                                                | 13152/24000 [26:09<21:41,  8.33it/s][A
iter: 13152; loss: 1.474; loss_rec: 19.162; loss_kl: 170.447; beta: 0.000:  55%|███████████████████████████████████████████████████████████▏                                                | 13152/24000 [26:09<21:41,  8.33it/s][A
iter: 13152; loss: 1.474; loss_rec: 19.162; loss_kl: 170.447; beta: 0.000:  55%|███████████████████████████████████████████████████████████▏                                                | 13153/24000 [26:09<21:44,  8.32it/s][A
iter: 13153; loss: 1.469; loss_rec: 10.783; loss_kl: 157.041; beta: 0.000:  55%|

iter: 15623; loss: 1.742; loss_rec: 20.024; loss_kl: 169.607; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 15624/24000 [31:02<16:44,  8.34it/s][A
iter: 15624; loss: 2.866; loss_rec: 40.849; loss_kl: 166.679; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 15624/24000 [31:02<16:44,  8.34it/s][A
iter: 15624; loss: 2.866; loss_rec: 40.849; loss_kl: 166.679; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 15625/24000 [31:02<16:47,  8.32it/s][A
iter: 15625; loss: 2.511; loss_rec: 36.015; loss_kl: 159.539; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▎                                     | 15625/24000 [31:02<16:47,  8.32it/s][A
iter: 15625; loss: 2.511; loss_rec: 36.015; loss_kl: 159.539; beta: 0.000:  65%|

iter: 15658; loss: 1.020; loss_rec: 10.399; loss_kl: 167.253; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▍                                     | 15659/24000 [31:06<16:46,  8.28it/s][A
iter: 15659; loss: 0.665; loss_rec: 4.313; loss_kl: 139.384; beta: 0.000:  65%|███████████████████████████████████████████████████████████████████████                                      | 15659/24000 [31:06<16:46,  8.28it/s][A
iter: 15659; loss: 0.665; loss_rec: 4.313; loss_kl: 139.384; beta: 0.000:  65%|███████████████████████████████████████████████████████████████████████                                      | 15660/24000 [31:06<16:43,  8.31it/s][A
iter: 15660; loss: 1.097; loss_rec: 10.851; loss_kl: 159.566; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▍                                     | 15660/24000 [31:06<16:43,  8.31it/s][A
iter: 15660; loss: 1.097; loss_rec: 10.851; loss_kl: 159.566; beta: 0.000:  65%|

iter: 15693; loss: 1.458; loss_rec: 11.666; loss_kl: 167.584; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▌                                     | 15694/24000 [31:10<16:16,  8.51it/s][A
iter: 15694; loss: 2.299; loss_rec: 34.729; loss_kl: 166.430; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▌                                     | 15694/24000 [31:10<16:16,  8.51it/s][A
iter: 15694; loss: 2.299; loss_rec: 34.729; loss_kl: 166.430; beta: 0.000:  65%|██████████████████████████████████████████████████████████████████████▋                                     | 15695/24000 [31:11<16:24,  8.43it/s][A
iter: 15695; loss: 0.813; loss_rec: 6.126; loss_kl: 169.386; beta: 0.000:  65%|███████████████████████████████████████████████████████████████████████▎                                     | 15695/24000 [31:11<16:24,  8.43it/s][A
iter: 15695; loss: 0.813; loss_rec: 6.126; loss_kl: 169.386; beta: 0.000:  65%|█

iter: 15728; loss: 1.305; loss_rec: 9.297; loss_kl: 143.985; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▍                                     | 15729/24000 [31:15<15:58,  8.63it/s][A
iter: 15729; loss: 1.878; loss_rec: 21.178; loss_kl: 164.376; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▊                                     | 15729/24000 [31:15<15:58,  8.63it/s][A
iter: 15729; loss: 1.878; loss_rec: 21.178; loss_kl: 164.376; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▊                                     | 15730/24000 [31:15<16:05,  8.56it/s][A
iter: 15730; loss: 2.814; loss_rec: 34.770; loss_kl: 164.152; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▊                                     | 15730/24000 [31:15<16:05,  8.56it/s][A
iter: 15730; loss: 2.814; loss_rec: 34.770; loss_kl: 164.152; beta: 0.000:  66%|

iter: 15763; loss: 2.123; loss_rec: 28.335; loss_kl: 180.553; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▉                                     | 15764/24000 [31:19<16:21,  8.39it/s][A
iter: 15764; loss: 2.193; loss_rec: 33.189; loss_kl: 163.004; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▉                                     | 15764/24000 [31:19<16:21,  8.39it/s][A
iter: 15764; loss: 2.193; loss_rec: 33.189; loss_kl: 163.004; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▉                                     | 15765/24000 [31:19<16:26,  8.35it/s][A
iter: 15765; loss: 1.729; loss_rec: 15.022; loss_kl: 156.500; beta: 0.000:  66%|██████████████████████████████████████████████████████████████████████▉                                     | 15765/24000 [31:19<16:26,  8.35it/s][A
iter: 15765; loss: 1.729; loss_rec: 15.022; loss_kl: 156.500; beta: 0.000:  66%|

iter: 15798; loss: 2.281; loss_rec: 20.158; loss_kl: 167.766; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████                                     | 15799/24000 [31:23<16:31,  8.27it/s][A
iter: 15799; loss: 2.456; loss_rec: 32.346; loss_kl: 160.082; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████                                     | 15799/24000 [31:23<16:31,  8.27it/s][A
iter: 15799; loss: 2.456; loss_rec: 32.346; loss_kl: 160.082; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████                                     | 15800/24000 [31:23<16:21,  8.36it/s][A
iter: 15800; loss: 2.247; loss_rec: 29.210; loss_kl: 173.355; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████                                     | 15800/24000 [31:23<16:21,  8.36it/s][A
iter: 15800; loss: 2.247; loss_rec: 29.210; loss_kl: 173.355; beta: 0.000:  66%|

iter: 15833; loss: 0.947; loss_rec: 7.299; loss_kl: 160.694; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▉                                     | 15834/24000 [31:27<15:56,  8.53it/s][A
iter: 15834; loss: 1.755; loss_rec: 22.812; loss_kl: 169.978; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▎                                    | 15834/24000 [31:27<15:56,  8.53it/s][A
iter: 15834; loss: 1.755; loss_rec: 22.812; loss_kl: 169.978; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▎                                    | 15835/24000 [31:27<16:06,  8.45it/s][A
iter: 15835; loss: 1.197; loss_rec: 10.390; loss_kl: 156.561; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▎                                    | 15835/24000 [31:27<16:06,  8.45it/s][A
iter: 15835; loss: 1.197; loss_rec: 10.390; loss_kl: 156.561; beta: 0.000:  66%|

iter: 15868; loss: 2.282; loss_rec: 29.571; loss_kl: 176.343; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▍                                    | 15869/24000 [31:31<16:18,  8.31it/s][A
iter: 15869; loss: 2.160; loss_rec: 23.297; loss_kl: 171.584; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▍                                    | 15869/24000 [31:31<16:18,  8.31it/s][A
iter: 15869; loss: 2.160; loss_rec: 23.297; loss_kl: 171.584; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▍                                    | 15870/24000 [31:31<16:15,  8.33it/s][A
iter: 15870; loss: 1.608; loss_rec: 17.248; loss_kl: 154.118; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▍                                    | 15870/24000 [31:31<16:15,  8.33it/s][A
iter: 15870; loss: 1.608; loss_rec: 17.248; loss_kl: 154.118; beta: 0.000:  66%|

iter: 15903; loss: 1.393; loss_rec: 18.401; loss_kl: 154.998; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▌                                    | 15904/24000 [31:35<15:33,  8.68it/s][A
iter: 15904; loss: 1.281; loss_rec: 16.258; loss_kl: 186.373; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▌                                    | 15904/24000 [31:36<15:33,  8.68it/s][A
iter: 15904; loss: 1.281; loss_rec: 16.258; loss_kl: 186.373; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▌                                    | 15905/24000 [31:36<15:33,  8.67it/s][A
iter: 15905; loss: 1.530; loss_rec: 12.239; loss_kl: 155.931; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▌                                    | 15905/24000 [31:36<15:33,  8.67it/s][A
iter: 15905; loss: 1.530; loss_rec: 12.239; loss_kl: 155.931; beta: 0.000:  66%|

iter: 15938; loss: 1.738; loss_rec: 17.487; loss_kl: 161.245; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▋                                    | 15939/24000 [31:40<16:07,  8.33it/s][A
iter: 15939; loss: 2.798; loss_rec: 33.580; loss_kl: 167.435; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▋                                    | 15939/24000 [31:40<16:07,  8.33it/s][A
iter: 15939; loss: 2.798; loss_rec: 33.580; loss_kl: 167.435; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▋                                    | 15940/24000 [31:40<16:11,  8.29it/s][A
iter: 15940; loss: 2.145; loss_rec: 22.812; loss_kl: 173.517; beta: 0.000:  66%|███████████████████████████████████████████████████████████████████████▋                                    | 15940/24000 [31:40<16:11,  8.29it/s][A
iter: 15940; loss: 2.145; loss_rec: 22.812; loss_kl: 173.517; beta: 0.000:  66%|

iter: 15973; loss: 2.539; loss_rec: 19.541; loss_kl: 150.702; beta: 0.000:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 15974/24000 [31:44<15:58,  8.37it/s][A
iter: 15974; loss: 1.636; loss_rec: 13.069; loss_kl: 156.293; beta: 0.000:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 15974/24000 [31:44<15:58,  8.37it/s][A
iter: 15974; loss: 1.636; loss_rec: 13.069; loss_kl: 156.293; beta: 0.000:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 15975/24000 [31:44<15:39,  8.54it/s][A
iter: 15975; loss: 1.849; loss_rec: 21.827; loss_kl: 164.349; beta: 0.000:  67%|███████████████████████████████████████████████████████████████████████▉                                    | 15975/24000 [31:44<15:39,  8.54it/s][A
iter: 15975; loss: 1.849; loss_rec: 21.827; loss_kl: 164.349; beta: 0.000:  67%|

iter: 16008; loss: 0.618; loss_rec: 4.329; loss_kl: 143.312; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▋                                    | 16009/24000 [31:48<15:19,  8.69it/s][A
iter: 16009; loss: 2.165; loss_rec: 25.380; loss_kl: 183.645; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████                                    | 16009/24000 [31:48<15:19,  8.69it/s][A
iter: 16009; loss: 2.165; loss_rec: 25.380; loss_kl: 183.645; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████                                    | 16010/24000 [31:48<15:20,  8.68it/s][A
iter: 16010; loss: 1.936; loss_rec: 18.055; loss_kl: 167.432; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████                                    | 16010/24000 [31:48<15:20,  8.68it/s][A
iter: 16010; loss: 1.936; loss_rec: 18.055; loss_kl: 167.432; beta: 0.000:  67%|

iter: 16043; loss: 1.164; loss_rec: 10.770; loss_kl: 167.913; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▏                                   | 16044/24000 [31:52<15:26,  8.58it/s][A
iter: 16044; loss: 1.726; loss_rec: 19.331; loss_kl: 144.433; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▏                                   | 16044/24000 [31:52<15:26,  8.58it/s][A
iter: 16044; loss: 1.726; loss_rec: 19.331; loss_kl: 144.433; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▏                                   | 16045/24000 [31:52<15:27,  8.58it/s][A
iter: 16045; loss: 1.012; loss_rec: 9.177; loss_kl: 159.191; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▊                                    | 16045/24000 [31:52<15:27,  8.58it/s][A
iter: 16045; loss: 1.012; loss_rec: 9.177; loss_kl: 159.191; beta: 0.000:  67%|█

iter: 16078; loss: 1.559; loss_rec: 13.380; loss_kl: 172.613; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▎                                   | 16079/24000 [31:56<15:46,  8.37it/s][A
iter: 16079; loss: 2.118; loss_rec: 22.463; loss_kl: 168.536; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▎                                   | 16079/24000 [31:56<15:46,  8.37it/s][A
iter: 16079; loss: 2.118; loss_rec: 22.463; loss_kl: 168.536; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▎                                   | 16080/24000 [31:56<15:41,  8.41it/s][A
iter: 16080; loss: 1.768; loss_rec: 19.322; loss_kl: 175.927; beta: 0.000:  67%|████████████████████████████████████████████████████████████████████████▎                                   | 16080/24000 [31:56<15:41,  8.41it/s][A
iter: 16080; loss: 1.768; loss_rec: 19.322; loss_kl: 175.927; beta: 0.000:  67%|

iter: 18636; loss: 2.454; loss_rec: 36.707; loss_kl: 148.140; beta: 0.000:  78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 18636/24000 [37:03<10:14,  8.73it/s][A
iter: 18636; loss: 2.454; loss_rec: 36.707; loss_kl: 148.140; beta: 0.000:  78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 18637/24000 [37:03<10:11,  8.77it/s][A
iter: 18637; loss: 2.166; loss_rec: 27.329; loss_kl: 164.026; beta: 0.000:  78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 18637/24000 [37:03<10:11,  8.77it/s][A
iter: 18637; loss: 2.166; loss_rec: 27.329; loss_kl: 164.026; beta: 0.000:  78%|███████████████████████████████████████████████████████████████████████████████████▊                        | 18638/24000 [37:03<10:09,  8.79it/s][A
iter: 18638; loss: 2.009; loss_rec: 26.136; loss_kl: 168.766; beta: 0.000:  78%|

iter: 18671; loss: 2.657; loss_rec: 39.744; loss_kl: 165.680; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████                        | 18671/24000 [37:07<10:13,  8.69it/s][A
iter: 18671; loss: 2.657; loss_rec: 39.744; loss_kl: 165.680; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████                        | 18672/24000 [37:07<10:11,  8.72it/s][A
iter: 18672; loss: 1.806; loss_rec: 13.242; loss_kl: 152.914; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████                        | 18672/24000 [37:07<10:11,  8.72it/s][A
iter: 18672; loss: 1.806; loss_rec: 13.242; loss_kl: 152.914; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████                        | 18673/24000 [37:07<10:19,  8.60it/s][A
iter: 18673; loss: 1.223; loss_rec: 7.338; loss_kl: 158.699; beta: 0.000:  78%|█

iter: 18706; loss: 1.674; loss_rec: 19.908; loss_kl: 154.911; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 18706/24000 [37:11<10:18,  8.56it/s][A
iter: 18706; loss: 1.674; loss_rec: 19.908; loss_kl: 154.911; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 18707/24000 [37:11<10:18,  8.56it/s][A
iter: 18707; loss: 2.151; loss_rec: 23.747; loss_kl: 186.208; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 18707/24000 [37:11<10:18,  8.56it/s][A
iter: 18707; loss: 2.151; loss_rec: 23.747; loss_kl: 186.208; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▏                       | 18708/24000 [37:12<10:13,  8.62it/s][A
iter: 18708; loss: 2.147; loss_rec: 30.128; loss_kl: 159.067; beta: 0.000:  78%|

iter: 18741; loss: 2.333; loss_rec: 34.294; loss_kl: 160.945; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▎                       | 18741/24000 [37:15<10:40,  8.21it/s][A
iter: 18741; loss: 2.333; loss_rec: 34.294; loss_kl: 160.945; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▎                       | 18742/24000 [37:16<10:40,  8.21it/s][A
iter: 18742; loss: 0.555; loss_rec: 3.686; loss_kl: 146.277; beta: 0.000:  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 18742/24000 [37:16<10:40,  8.21it/s][A
iter: 18742; loss: 0.555; loss_rec: 3.686; loss_kl: 146.277; beta: 0.000:  78%|█████████████████████████████████████████████████████████████████████████████████████                        | 18743/24000 [37:16<10:31,  8.33it/s][A
iter: 18743; loss: 2.138; loss_rec: 23.331; loss_kl: 165.186; beta: 0.000:  78%|

iter: 18776; loss: 2.138; loss_rec: 27.832; loss_kl: 175.972; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▍                       | 18776/24000 [37:20<10:17,  8.46it/s][A
iter: 18776; loss: 2.138; loss_rec: 27.832; loss_kl: 175.972; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▍                       | 18777/24000 [37:20<10:12,  8.53it/s][A
iter: 18777; loss: 1.967; loss_rec: 27.328; loss_kl: 171.245; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▍                       | 18777/24000 [37:20<10:12,  8.53it/s][A
iter: 18777; loss: 1.967; loss_rec: 27.328; loss_kl: 171.245; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▌                       | 18778/24000 [37:20<10:06,  8.61it/s][A
iter: 18778; loss: 2.743; loss_rec: 31.998; loss_kl: 171.153; beta: 0.000:  78%|

iter: 18811; loss: 1.937; loss_rec: 26.843; loss_kl: 175.202; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 18811/24000 [37:24<10:10,  8.50it/s][A
iter: 18811; loss: 1.937; loss_rec: 26.843; loss_kl: 175.202; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 18812/24000 [37:24<10:14,  8.44it/s][A
iter: 18812; loss: 0.974; loss_rec: 11.647; loss_kl: 171.185; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 18812/24000 [37:24<10:14,  8.44it/s][A
iter: 18812; loss: 0.974; loss_rec: 11.647; loss_kl: 171.185; beta: 0.000:  78%|████████████████████████████████████████████████████████████████████████████████████▋                       | 18813/24000 [37:24<10:25,  8.30it/s][A
iter: 18813; loss: 2.512; loss_rec: 29.067; loss_kl: 172.505; beta: 0.000:  78%|

iter: 18846; loss: 1.783; loss_rec: 18.770; loss_kl: 170.955; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▊                       | 18846/24000 [37:28<09:58,  8.61it/s][A
iter: 18846; loss: 1.783; loss_rec: 18.770; loss_kl: 170.955; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▊                       | 18847/24000 [37:28<09:55,  8.66it/s][A
iter: 18847; loss: 0.971; loss_rec: 9.709; loss_kl: 165.978; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                       | 18847/24000 [37:28<09:55,  8.66it/s][A
iter: 18847; loss: 0.971; loss_rec: 9.709; loss_kl: 165.978; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                       | 18848/24000 [37:28<10:03,  8.54it/s][A
iter: 18848; loss: 1.917; loss_rec: 25.721; loss_kl: 176.939; beta: 0.000:  79%|

iter: 18881; loss: 2.564; loss_rec: 39.174; loss_kl: 174.630; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▉                       | 18881/24000 [37:32<09:44,  8.76it/s][A
iter: 18881; loss: 2.564; loss_rec: 39.174; loss_kl: 174.630; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▉                       | 18882/24000 [37:32<10:06,  8.44it/s][A
iter: 18882; loss: 1.739; loss_rec: 20.176; loss_kl: 167.257; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▉                       | 18882/24000 [37:32<10:06,  8.44it/s][A
iter: 18882; loss: 1.739; loss_rec: 20.176; loss_kl: 167.257; beta: 0.000:  79%|████████████████████████████████████████████████████████████████████████████████████▉                       | 18883/24000 [37:32<10:15,  8.31it/s][A
iter: 18883; loss: 1.056; loss_rec: 11.760; loss_kl: 175.488; beta: 0.000:  79%|

iter: 18916; loss: 1.870; loss_rec: 19.288; loss_kl: 190.537; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████                       | 18916/24000 [37:36<10:17,  8.23it/s][A
iter: 18916; loss: 1.870; loss_rec: 19.288; loss_kl: 190.537; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▏                      | 18917/24000 [37:36<10:15,  8.26it/s][A
iter: 18917; loss: 0.089; loss_rec: 0.444; loss_kl: 132.594; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▉                       | 18917/24000 [37:36<10:15,  8.26it/s][A
iter: 18917; loss: 0.089; loss_rec: 0.444; loss_kl: 132.594; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▉                       | 18918/24000 [37:36<10:12,  8.30it/s][A
iter: 18918; loss: 2.168; loss_rec: 25.012; loss_kl: 190.113; beta: 0.000:  79%|

iter: 18951; loss: 1.488; loss_rec: 19.172; loss_kl: 168.380; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 18951/24000 [37:40<10:18,  8.16it/s][A
iter: 18951; loss: 1.488; loss_rec: 19.172; loss_kl: 168.380; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 18952/24000 [37:40<10:25,  8.07it/s][A
iter: 18952; loss: 2.606; loss_rec: 40.956; loss_kl: 160.712; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 18952/24000 [37:40<10:25,  8.07it/s][A
iter: 18952; loss: 2.606; loss_rec: 40.956; loss_kl: 160.712; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▎                      | 18953/24000 [37:40<10:32,  7.98it/s][A
iter: 18953; loss: 1.417; loss_rec: 17.298; loss_kl: 180.218; beta: 0.000:  79%|

iter: 18986; loss: 2.012; loss_rec: 27.875; loss_kl: 178.754; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▍                      | 18986/24000 [37:44<09:41,  8.62it/s][A
iter: 18986; loss: 2.012; loss_rec: 27.875; loss_kl: 178.754; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▍                      | 18987/24000 [37:44<09:49,  8.50it/s][A
iter: 18987; loss: 3.444; loss_rec: 43.173; loss_kl: 186.318; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▍                      | 18987/24000 [37:44<09:49,  8.50it/s][A
iter: 18987; loss: 3.444; loss_rec: 43.173; loss_kl: 186.318; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▍                      | 18988/24000 [37:44<09:58,  8.38it/s][A
iter: 18988; loss: 2.292; loss_rec: 17.373; loss_kl: 160.402; beta: 0.000:  79%|

iter: 19021; loss: 1.908; loss_rec: 24.550; loss_kl: 184.438; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                      | 19021/24000 [37:48<09:53,  8.39it/s][A
iter: 19021; loss: 1.908; loss_rec: 24.550; loss_kl: 184.438; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                      | 19022/24000 [37:48<09:57,  8.33it/s][A
iter: 19022; loss: 1.353; loss_rec: 17.362; loss_kl: 176.025; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                      | 19022/24000 [37:48<09:57,  8.33it/s][A
iter: 19022; loss: 1.353; loss_rec: 17.362; loss_kl: 176.025; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▌                      | 19023/24000 [37:49<09:57,  8.33it/s][A
iter: 19023; loss: 1.484; loss_rec: 19.842; loss_kl: 169.334; beta: 0.000:  79%|

iter: 19056; loss: 1.214; loss_rec: 11.191; loss_kl: 162.998; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 19056/24000 [37:53<09:58,  8.27it/s][A
iter: 19056; loss: 1.214; loss_rec: 11.191; loss_kl: 162.998; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 19057/24000 [37:53<09:56,  8.29it/s][A
iter: 19057; loss: 2.295; loss_rec: 25.416; loss_kl: 176.331; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 19057/24000 [37:53<09:56,  8.29it/s][A
iter: 19057; loss: 2.295; loss_rec: 25.416; loss_kl: 176.331; beta: 0.000:  79%|█████████████████████████████████████████████████████████████████████████████████████▊                      | 19058/24000 [37:53<09:48,  8.39it/s][A
iter: 19058; loss: 2.204; loss_rec: 28.144; loss_kl: 181.877; beta: 0.000:  79%|

iter: 19091; loss: 0.517; loss_rec: 4.279; loss_kl: 170.288; beta: 0.000:  80%|██████████████████████████████████████████████████████████████████████████████████████▋                      | 19091/24000 [37:57<09:32,  8.58it/s][A
iter: 19091; loss: 0.517; loss_rec: 4.279; loss_kl: 170.288; beta: 0.000:  80%|██████████████████████████████████████████████████████████████████████████████████████▋                      | 19092/24000 [37:57<09:30,  8.61it/s][A
iter: 19092; loss: 0.901; loss_rec: 7.840; loss_kl: 169.216; beta: 0.000:  80%|██████████████████████████████████████████████████████████████████████████████████████▋                      | 19092/24000 [37:57<09:30,  8.61it/s][A
iter: 19092; loss: 0.901; loss_rec: 7.840; loss_kl: 169.216; beta: 0.000:  80%|██████████████████████████████████████████████████████████████████████████████████████▋                      | 19093/24000 [37:57<09:28,  8.63it/s][A
iter: 19093; loss: 1.019; loss_rec: 10.577; loss_kl: 163.376; beta: 0.000:  80%|

iter: 21537; loss: 1.727; loss_rec: 19.415; loss_kl: 165.156; beta: 0.000:  90%|████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21537/24000 [42:50<04:57,  8.27it/s][A
iter: 21537; loss: 1.727; loss_rec: 19.415; loss_kl: 165.156; beta: 0.000:  90%|████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21538/24000 [42:50<04:57,  8.28it/s][A
iter: 21538; loss: 2.718; loss_rec: 43.642; loss_kl: 175.818; beta: 0.000:  90%|████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21538/24000 [42:51<04:57,  8.28it/s][A
iter: 21538; loss: 2.718; loss_rec: 43.642; loss_kl: 175.818; beta: 0.000:  90%|████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21539/24000 [42:51<04:55,  8.34it/s][A
iter: 21539; loss: 2.365; loss_rec: 33.500; loss_kl: 176.330; beta: 0.000:  90%|

iter: 21572; loss: 1.339; loss_rec: 15.676; loss_kl: 149.763; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████           | 21572/24000 [42:55<04:57,  8.16it/s][A
iter: 21572; loss: 1.339; loss_rec: 15.676; loss_kl: 149.763; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████           | 21573/24000 [42:55<04:56,  8.19it/s][A
iter: 21573; loss: 1.091; loss_rec: 8.829; loss_kl: 173.133; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21573/24000 [42:55<04:56,  8.19it/s][A
iter: 21573; loss: 1.091; loss_rec: 8.829; loss_kl: 173.133; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▉           | 21574/24000 [42:55<04:50,  8.36it/s][A
iter: 21574; loss: 0.912; loss_rec: 9.330; loss_kl: 161.199; beta: 0.000:  90%|█

iter: 21607; loss: 1.125; loss_rec: 15.857; loss_kl: 186.468; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 21607/24000 [42:59<04:52,  8.18it/s][A
iter: 21607; loss: 1.125; loss_rec: 15.857; loss_kl: 186.468; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 21608/24000 [42:59<04:52,  8.19it/s][A
iter: 21608; loss: 1.358; loss_rec: 11.121; loss_kl: 175.942; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 21608/24000 [42:59<04:52,  8.19it/s][A
iter: 21608; loss: 1.358; loss_rec: 11.121; loss_kl: 175.942; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 21609/24000 [42:59<04:54,  8.11it/s][A
iter: 21609; loss: 0.616; loss_rec: 3.694; loss_kl: 161.086; beta: 0.000:  90%|█

iter: 21642; loss: 1.833; loss_rec: 17.952; loss_kl: 162.368; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21642/24000 [43:03<04:50,  8.11it/s][A
iter: 21642; loss: 1.833; loss_rec: 17.952; loss_kl: 162.368; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21643/24000 [43:03<04:50,  8.12it/s][A
iter: 21643; loss: 2.357; loss_rec: 36.182; loss_kl: 168.240; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21643/24000 [43:03<04:50,  8.12it/s][A
iter: 21643; loss: 2.357; loss_rec: 36.182; loss_kl: 168.240; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21644/24000 [43:03<04:51,  8.09it/s][A
iter: 21644; loss: 1.595; loss_rec: 18.110; loss_kl: 161.351; beta: 0.000:  90%|

iter: 21677; loss: 1.935; loss_rec: 26.651; loss_kl: 174.587; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▌          | 21677/24000 [43:07<04:38,  8.33it/s][A
iter: 21677; loss: 1.935; loss_rec: 26.651; loss_kl: 174.587; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▌          | 21678/24000 [43:08<04:40,  8.28it/s][A
iter: 21678; loss: 0.711; loss_rec: 5.408; loss_kl: 166.339; beta: 0.000:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21678/24000 [43:08<04:40,  8.28it/s][A
iter: 21678; loss: 0.711; loss_rec: 5.408; loss_kl: 166.339; beta: 0.000:  90%|██████████████████████████████████████████████████████████████████████████████████████████████████▍          | 21679/24000 [43:08<04:38,  8.33it/s][A
iter: 21679; loss: 1.387; loss_rec: 16.032; loss_kl: 179.720; beta: 0.000:  90%|

iter: 21712; loss: 2.442; loss_rec: 33.740; loss_kl: 180.173; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▋          | 21712/24000 [43:12<04:35,  8.31it/s][A
iter: 21712; loss: 2.442; loss_rec: 33.740; loss_kl: 180.173; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▋          | 21713/24000 [43:12<04:36,  8.26it/s][A
iter: 21713; loss: 2.142; loss_rec: 35.064; loss_kl: 164.556; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▋          | 21713/24000 [43:12<04:36,  8.26it/s][A
iter: 21713; loss: 2.142; loss_rec: 35.064; loss_kl: 164.556; beta: 0.000:  90%|█████████████████████████████████████████████████████████████████████████████████████████████████▋          | 21714/24000 [43:12<04:39,  8.18it/s][A
iter: 21714; loss: 1.445; loss_rec: 13.334; loss_kl: 171.085; beta: 0.000:  90%|

iter: 21747; loss: 1.650; loss_rec: 23.759; loss_kl: 188.216; beta: 0.000:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 21747/24000 [43:16<04:37,  8.13it/s][A
iter: 21747; loss: 1.650; loss_rec: 23.759; loss_kl: 188.216; beta: 0.000:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 21748/24000 [43:16<04:37,  8.12it/s][A
iter: 21748; loss: 2.568; loss_rec: 38.522; loss_kl: 171.008; beta: 0.000:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 21748/24000 [43:16<04:37,  8.12it/s][A
iter: 21748; loss: 2.568; loss_rec: 38.522; loss_kl: 171.008; beta: 0.000:  91%|█████████████████████████████████████████████████████████████████████████████████████████████████▊          | 21749/24000 [43:16<04:34,  8.21it/s][A
iter: 21749; loss: 2.118; loss_rec: 33.114; loss_kl: 153.301; beta: 0.000:  91%|

iter: 21782; loss: 1.775; loss_rec: 19.786; loss_kl: 172.763; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 21782/24000 [43:20<04:35,  8.07it/s][A
iter: 21782; loss: 1.775; loss_rec: 19.786; loss_kl: 172.763; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 21783/24000 [43:20<04:33,  8.12it/s][A
iter: 21783; loss: 2.093; loss_rec: 21.629; loss_kl: 180.477; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 21783/24000 [43:20<04:33,  8.12it/s][A
iter: 21783; loss: 2.093; loss_rec: 21.629; loss_kl: 180.477; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████          | 21784/24000 [43:20<04:29,  8.21it/s][A
iter: 21784; loss: 1.689; loss_rec: 18.437; loss_kl: 173.174; beta: 0.000:  91%|

iter: 21817; loss: 1.825; loss_rec: 26.870; loss_kl: 171.270; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▏         | 21817/24000 [43:25<04:25,  8.23it/s][A
iter: 21817; loss: 1.825; loss_rec: 26.870; loss_kl: 171.270; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▏         | 21818/24000 [43:25<04:24,  8.25it/s][A
iter: 21818; loss: 1.548; loss_rec: 21.316; loss_kl: 167.370; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▏         | 21818/24000 [43:25<04:24,  8.25it/s][A
iter: 21818; loss: 1.548; loss_rec: 21.316; loss_kl: 167.370; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▏         | 21819/24000 [43:25<04:24,  8.23it/s][A
iter: 21819; loss: 1.168; loss_rec: 11.411; loss_kl: 164.870; beta: 0.000:  91%|

iter: 21852; loss: 1.921; loss_rec: 22.040; loss_kl: 167.302; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▎         | 21852/24000 [43:29<04:17,  8.35it/s][A
iter: 21852; loss: 1.921; loss_rec: 22.040; loss_kl: 167.302; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▎         | 21853/24000 [43:29<04:16,  8.36it/s][A
iter: 21853; loss: 1.267; loss_rec: 14.452; loss_kl: 173.754; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▎         | 21853/24000 [43:29<04:16,  8.36it/s][A
iter: 21853; loss: 1.267; loss_rec: 14.452; loss_kl: 173.754; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▎         | 21854/24000 [43:29<04:20,  8.22it/s][A
iter: 21854; loss: 1.804; loss_rec: 18.452; loss_kl: 186.535; beta: 0.000:  91%|

iter: 21887; loss: 2.129; loss_rec: 29.802; loss_kl: 185.282; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 21887/24000 [43:33<04:20,  8.12it/s][A
iter: 21887; loss: 2.129; loss_rec: 29.802; loss_kl: 185.282; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 21888/24000 [43:33<04:22,  8.04it/s][A
iter: 21888; loss: 1.689; loss_rec: 19.222; loss_kl: 176.328; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▍         | 21888/24000 [43:33<04:22,  8.04it/s][A
iter: 21888; loss: 1.689; loss_rec: 19.222; loss_kl: 176.328; beta: 0.000:  91%|██████████████████████████████████████████████████████████████████████████████████████████████████▌         | 21889/24000 [43:33<04:22,  8.05it/s][A
iter: 21889; loss: 1.242; loss_rec: 12.675; loss_kl: 183.422; beta: 0.000:  91%|

In [1]:
import os
import time
import math
import numpy as np
import random
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import datetime
# Timestamp taken when this cell runs, formatted as YYYY-MM-DD-HH-MM-SS
# (datetime.now() is called without a timezone argument).
now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

In [2]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig
from matplotlib import pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

2022-06-16 13:58:07.806306: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-16 13:58:07.806327: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Ordered label vocabulary: the string 'UNK' at position 0 followed by the
# integer labels 1..10. The position in this list becomes the class id.
label_list = ['UNK',1,2,3,4,5,6,7,8,9,10]

In [4]:
def _read_split(csv_path):
  """Read one split CSV, using its saved "Unnamed: 0" column as the index."""
  return pd.read_csv(csv_path, index_col="Unnamed: 0")


# Labeled train/test splits, the unlabeled pool, and the unlabeled train/test splits.
df_train_l = _read_split("../../yahoo/assigned/train_l.csv")
df_test_l = _read_split("../../yahoo/assigned/test_l.csv")
df_u = _read_split("../../yahoo/assigned/u.csv")
df_train_u = _read_split("../../yahoo/assigned/train_u.csv")  # optionally .head(30000)
df_test_u = _read_split("../../yahoo/assigned/test_u.csv")  # optionally .head(5000)

# Every split stacked row-wise, in the order listed.
df_all = pd.concat([df_train_l, df_test_l, df_u, df_train_u, df_test_u])

In [5]:
# Turn each frame into a list of row records (index dropped); the order of the
# names on the left matches the order of the frames on the right.
train_l, test_l, u_list, test_u, train_u = (
  list(frame.to_records(index=False))
  for frame in (df_train_l, df_test_l, df_u, df_test_u, df_train_u)
)

# Values of column "0" across all splits.
data_all = list(df_all["0"])

In [6]:
#--------------------------------
#  Transformer parameters
#--------------------------------
# Every tokenized sequence is padded/truncated to this many token ids.
max_seq_length = 64
# NOTE(review): the data-loader cells in this notebook set and read a separate
# global `batch_size`; confirm where batch_size_d is actually consumed.
batch_size_d = 12

#--------------------------------
#  GAN-BERT specific parameters
#--------------------------------
# number of hidden layers in the generator, 
# each of the size of the output space
#num_hidden_layers_g = 1; 
# number of hidden layers in the discriminator, 
# each of the size of the input space
num_hidden_layers_d = 1; 
# size of the generator's input noisy vectors
noise_size = 100
# dropout to be applied to discriminator's input vectors
out_dropout_rate = 0.2

# Replicate labeled data to balance poorly represented datasets, 
# e.g., less than 1% of labeled material
apply_balance = True

#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-5 #5e-6?
#learning_rate_generator = 5e-5
epsilon = 1e-8
num_train_epochs = 50
# When True (and CUDA is available) the transformer is wrapped in DataParallel.
multi_gpu = True
# Scheduler
apply_scheduler = False
# Fraction of the total training steps used for warm-up when the scheduler is on.
warmup_proportion = 0.1
# Print
print_each_n_step = 10

#--------------------------------
#  Adopted Transformer model
#--------------------------------
# Since this version is compatible with Huggingface transformers, you can uncomment
# (or add) transformer models compatible with GAN

model_name = "bert-base-cased"
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "albert-base-v2"
#model_name = "xlm-roberta-base"
#model_name = "amazon/bort"
#model_name="google/electra-large-discriminator"
#model_name="google/electra-small-discriminator"
#model_name="microsoft/deberta-v2-xxlarge"
#model_name="microsoft/deberta-v3-base"
#model_name = "google/electra-base-discriminator"

In [7]:
# Load the pretrained encoder and the tokenizer registered under `model_name`.
transformer = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def generate_data_loader(input_examples, label_masks, label_map, do_shuffle = False, balance_label_examples = False):
  '''
  Tokenize the given examples and wrap them in a DataLoader.

  Reads the module-level `tokenizer`, `max_seq_length` and `batch_size`.

  Args:
    input_examples: sequence of records where item[0] is the text and
      item[1] is the label (a key of `label_map`).
    label_masks: one flag per example; truthy means the example is treated
      as labeled, falsy as unlabeled.
    label_map: mapping from label to integer class id.
    do_shuffle: use a RandomSampler when true, a SequentialSampler otherwise.
    balance_label_examples: when true and only part of the data is labeled,
      each labeled example is repeated floor(log2(floor(1 / labeled_rate)))
      times (at least once) to counter a low labeled rate.

  Returns:
    A DataLoader whose batches are
    (input_ids, attention_mask, label_ids, label_mask) tensors.

  Raises:
    ValueError: if `input_examples` is empty or its length differs from
      that of `label_masks`.
  '''
  num_examples = len(input_examples)
  if num_examples == 0:
    raise ValueError("generate_data_loader() needs at least one input example")
  if len(label_masks) != num_examples:
    raise ValueError("input_examples and label_masks must have the same length")

  # Fraction of the examples that carry a usable label.
  num_labeled_examples = sum(1 for label_mask in label_masks if label_mask)
  label_mask_rate = num_labeled_examples / num_examples

  # The replication factor depends only on the labeled rate, so it is computed
  # once here instead of once per labeled example.
  replication = 1
  if balance_label_examples and 0 < label_mask_rate < 1:
    replication = max(1, int(math.log(int(1 / label_mask_rate), 2)))

  examples = []
  for index, ex in enumerate(input_examples):
    copies = replication if label_masks[index] else 1
    examples.extend([(ex, label_masks[index])] * copies)

  #-----------------------------------------------
  # Generate input examples to the Transformer
  #-----------------------------------------------
  # Padding positions are identified through the tokenizer's own pad id rather
  # than assuming it is 0 (some vocabularies use id 0 for a real token).
  pad_id = tokenizer.pad_token_id

  input_ids = []
  input_mask_array = []
  label_mask_array = []
  label_id_array = []

  for (text, label_mask) in examples:
    encoded_sent = tokenizer.encode(text[0], add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
    # Attention mask: 1 for real tokens, 0 for padding.
    input_mask_array.append([int(token_id != pad_id) for token_id in encoded_sent])
    label_id_array.append(label_map[text[1]])
    # Plain Python bools, so the mask tensor is not built from NumPy scalars.
    label_mask_array.append(bool(label_mask))

  # Conversion to tensors
  input_ids = torch.tensor(input_ids)
  input_mask_array = torch.tensor(input_mask_array)
  label_id_array = torch.tensor(label_id_array, dtype=torch.long)
  label_mask_array = torch.tensor(label_mask_array)

  dataset = TensorDataset(input_ids, input_mask_array, label_id_array, label_mask_array)

  sampler = RandomSampler if do_shuffle else SequentialSampler

  return DataLoader(
              dataset,
              sampler = sampler(dataset),
              batch_size = batch_size)

def format_time(elapsed):
    '''
    Render a duration in seconds as text.

    The value is rounded to a whole number of seconds and returned as the
    string form of a datetime.timedelta (e.g. 3661.4 -> "1:01:01").
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [9]:
def generate_data_fake(input_examples):
  '''
  Tokenize raw texts into model-ready tensors (no labels, no DataLoader).

  Reads the module-level `tokenizer` and `max_seq_length`.

  Args:
    input_examples: iterable of texts to encode.

  Returns:
    (input_ids, input_mask_array): two tensors with one row per text; the
    first holds token ids padded/truncated to `max_seq_length`, the second is
    1 for real tokens and 0 for padding.
  '''
  # Padding positions are identified through the tokenizer's own pad id rather
  # than assuming it is 0 (some vocabularies use id 0 for a real token).
  pad_id = tokenizer.pad_token_id

  input_ids = []
  input_mask_array = []

  for text in input_examples:
    encoded_sent = tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
    input_mask_array.append([int(token_id != pad_id) for token_id in encoded_sent])

  # Conversion to tensors
  input_ids = torch.tensor(input_ids)
  input_mask_array = torch.tensor(input_mask_array)

  return input_ids, input_mask_array

In [10]:
# Load the examples: alias the record lists built earlier under the names the
# data-loader cell expects.
labeled_examples = train_l      # labeled training records
unlabeled_examples = u_list     # records from the unlabeled pool (u.csv)
test_examples = test_l          # labeled test records

In [11]:
batch_size = 64

# Each label is mapped to its position in label_list.
label_map = {label: position for (position, label) in enumerate(label_list)}

#------------------------------
#   Load the train dataset
#------------------------------
# Labeled examples get mask=True; unlabeled ones (when present) are appended
# after them with mask=False.
train_examples = labeled_examples
mask_parts = [np.ones(len(labeled_examples), dtype=bool)]
if unlabeled_examples:
  train_examples = train_examples + unlabeled_examples
  mask_parts.append(np.zeros(len(unlabeled_examples), dtype=bool))
train_label_masks = np.concatenate(mask_parts)

train_dataloader = generate_data_loader(
  train_examples,
  train_label_masks,
  label_map,
  do_shuffle = True,
  balance_label_examples = apply_balance,
)

#------------------------------
#   Load the test dataset
#------------------------------
# Every test example is labeled, so the whole mask is True.
test_label_masks = np.ones(len(test_examples), dtype=bool)

test_dataloader = generate_data_loader(
  test_examples,
  test_label_masks,
  label_map,
  do_shuffle = False,
  balance_label_examples = False,
)

  label_mask_array = torch.tensor(label_mask_array)


In [12]:
#------------------------------
#   The Discriminator
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
    """MLP head that scores a representation over `num_labels + 1` classes.

    Args:
        input_size: width of the incoming representation.
        hidden_sizes: widths of the hidden layers, in order.
        num_labels: number of real classes; one extra logit is added for the
            fake/real decision.
        dropout_rate: dropout probability used on the input and after every
            hidden layer.
    """

    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
        super().__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)

        # Consecutive width pairs define the Linear layers of the hidden stack.
        widths = [input_size] + hidden_sizes
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.LeakyReLU(0.2, inplace=True))
            stack.append(nn.Dropout(dropout_rate))

        self.layers = nn.Sequential(*stack)  # hidden stack applied as one module
        # +1 output for the probability of this sample being fake/real.
        self.logit = nn.Linear(widths[-1], num_labels + 1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        """Return (last hidden representation, logits, softmax probabilities)."""
        last_rep = self.layers(self.input_dropout(input_rep))
        logits = self.logit(last_rep)
        return last_rep, logits, self.softmax(logits)

In [13]:
# The transformer's config tells us the width of the vectors it produces.
config = AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)

# One entry per discriminator hidden layer, each as wide as the encoder output.
# (The generator equivalent is disabled in this notebook.)
hidden_levels_d = [hidden_size] * num_hidden_layers_d

#-------------------------------------------------
#   Instantiate the Discriminator
#-------------------------------------------------
discriminator = Discriminator(
  input_size=hidden_size,
  hidden_sizes=hidden_levels_d,
  num_labels=len(label_list),
  dropout_rate=out_dropout_rate,
)

# Move the models to the GPU when one is available.
if torch.cuda.is_available():
  discriminator.cuda()
  transformer.cuda()
  if multi_gpu:
    transformer = torch.nn.DataParallel(transformer)

# print(config)

# print(config)

In [14]:
# Per-epoch statistics and accuracy history collected during training.
training_stats = []

accuracy_array=[]

# Measure the total training time for the whole run.
total_t0 = time.time()

# Parameters updated by the discriminator optimizer: the transformer encoder
# plus the discriminator head.
transformer_vars = list(transformer.parameters())
d_vars = transformer_vars + list(discriminator.parameters())

# optimizer -- `epsilon` from the configuration cell is passed explicitly so
# the setting is actually honored.
dis_optimizer = torch.optim.AdamW(d_vars, lr=learning_rate_discriminator, eps=epsilon)

# scheduler
if apply_scheduler:
  # Imported here because the notebook's import cell only brings in
  # AutoModel / AutoTokenizer / AutoConfig from transformers.
  from transformers import get_constant_schedule_with_warmup

  num_train_examples = len(train_examples)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

  # Only the discriminator has an optimizer in this notebook (the generator
  # optimizer is disabled), so only its scheduler is created.
  scheduler_d = get_constant_schedule_with_warmup(dis_optimizer,
                                           num_warmup_steps = num_warmup_steps)

In [None]:
#OPTAGAN
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse

import logging
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import numpy as np

from torch.autograd import Variable
from modules.gan import Generator, Critic

import copy
import math
import glob
import os
import pickle
import random

import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from func import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, BertConfig
from func import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForLatentConnector, GPT2ForLatentConnectorValueHead
from func import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from func import XLNetLMHeadModel, XLNetTokenizer
from func import TransfoXLLMHeadModel, TransfoXLTokenizer
from func import BertForLatentConnector, BertTokenizer

from collections import defaultdict
from utils import (TextDataset_Split, TextDataset_2Tokenizers, BucketingDataLoader)
import pdb
from modules.utils import (calc_blue_parallel_func, pad_seq, rollout, rollout_test)
#from transformers.modeling_utils import top_k_top_p_filtering


# Hardcoded max length to avoid infinite loop
MAX_LENGTH = 10000

# Flat tuple of every pretrained model key advertised by the listed config classes.
ALL_MODELS = tuple(
    model_key
    for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)
    for model_key in conf.pretrained_config_archive_map.keys()
)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
)
logger = logging.getLogger(__name__)

# model type -> (config class, model class, tokenizer class)
MODEL_CLASSES = dict(
    gpt2=(GPT2Config, GPT2ForLatentConnectorValueHead, GPT2Tokenizer),
    bert=(BertConfig, BertForLatentConnector, BertTokenizer),
)

def load_and_cache_examples(args, tokenizer):
    """Build the training dataset from `args.train_data_file`.

    A list of tokenizers selects TextDataset_2Tokenizers; a single tokenizer
    selects TextDataset_Split. Both receive `args.block_size` as block size.
    """
    if isinstance(tokenizer, list):
        dataset_cls = TextDataset_2Tokenizers
    else:
        dataset_cls = TextDataset_Split
    return dataset_cls(tokenizer, args, args.train_data_file, block_size=args.block_size)

def build_dataload_and_cache_examples(args, tokenizer):
    """Build the bucketing training data loader for a list of tokenizers.

    Sets `args.batch_size` to `per_gpu_train_batch_size * max(1, n_gpu)` as a
    side effect, then builds a shuffled BucketingDataLoader over
    `args.train_data_file`.

    Args:
        args: run configuration; reads per_gpu_train_batch_size, n_gpu,
            train_data_file and max_seq_length.
        tokenizer: list of tokenizers handed to BucketingDataLoader.

    Returns:
        The constructed BucketingDataLoader.

    Raises:
        NotImplementedError: if `tokenizer` is not a list. No loader is
            defined for that case; previously this fell through to an
            UnboundLocalError on the return statement.
    """
    if not isinstance(tokenizer, list):
        raise NotImplementedError(
            "build_dataload_and_cache_examples() only supports a list of tokenizers"
        )

    args.batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    dataloader = BucketingDataLoader(
        args.train_data_file,
        args.batch_size,
        args.max_seq_length,
        tokenizer,
        args,
        bucket=100,
        shuffle=True,
    )
    print(dataloader)
    return dataloader

def compute_grad_penalty(critic, real_data, fake_data):
    """Compute a gradient penalty on random interpolates of real and fake data.

    For each row a coefficient alpha in [0, 1) is drawn (from NumPy's global
    RNG), the critic is evaluated on alpha*real + (1-alpha)*fake, and the
    penalty is the mean of (||d score / d sample||_2 - 1)^2 over the batch.

    Device, dtype and the shape of the upstream gradient are taken from the
    tensors themselves, so the function does not depend on a global CUDA flag.

    Args:
        critic: callable mapping a batch of samples to scores.
        real_data: tensor of real samples; the norm is taken over dim 1.
        fake_data: tensor of generated samples, same shape as `real_data`.

    Returns:
        Scalar tensor holding the penalty; it is created with
        create_graph=True so it can be back-propagated through.
    """
    B = real_data.size(0)
    # One interpolation coefficient per row, placed next to the data.
    alpha = torch.from_numpy(np.random.random((B, 1))).to(
        device=real_data.device, dtype=real_data.dtype)
    sample = alpha*real_data + (1-alpha)*fake_data
    sample.requires_grad_(True)
    score = critic(sample)

    grads = autograd.grad(
        outputs=score,
        inputs=sample,
        # Upstream gradient of ones, matching whatever shape the critic returns.
        grad_outputs=torch.ones_like(score),
        create_graph=True,
        retain_graph=True,
        only_inputs=True)[0]
    grad_penalty = ((grads.norm(2, dim=1) - 1.) ** 2).mean()
    return grad_penalty

def train(epoch):
    """Run one epoch of latent-space GAN training.

    Uses module-level state: ``model_encoder``, ``model_decoder``,
    ``generator``, ``critic``, ``train_loader``, ``g_optimizer``,
    ``c_optimizer``, ``tokenizer_decoder``, ``rollout_test``, ``args`` and
    ``logger``.

    For every batch both the critic loss (with gradient penalty) and the
    generator loss are computed, but only one network is updated: the critic
    when ``((2 + epoch) / epoch) * r_c > r_g``, otherwise the generator, where
    ``r_c`` / ``r_g`` are the relative changes of each loss since the previous
    batch.

    Args:
        epoch: 1-based epoch index. It must be non-zero because it is used as
            a divisor in the update rule.

    Returns:
        Tuple ``(g_train_loss, c_train_loss)``: the mean generator and critic
        losses over the batches in which each network was updated.
    """
    model_encoder.eval()
    model_decoder.eval()
    generator.train()
    critic.train()
    c_train_loss = 0.
    g_train_loss = 0.
    g_batches = 0
    c_batches = 0
    # Previous-batch losses, seeded with 1 so the first relative change is defined.
    c_loss_0 = 1
    g_loss_0 = 1
    for i, x in enumerate(train_loader):
        x = x[0]
        if args.cuda:
            x = x.cuda()
        # Generate noise. Size it from the actual batch, not the configured
        # batch size: a short batch would otherwise give z_fake a different
        # number of rows than z_real and break the interpolation in
        # compute_grad_penalty.
        B = x.size(0)
        noise = torch.from_numpy(np.random.normal(0, 1, (B,
                                 args.latent_size))).float()
        if args.cuda:
            noise = noise.cuda()
        # Get original text latent embeddings (encoder is frozen here)
        with torch.no_grad():
            pooled_hidden_fea = model_encoder(x, attention_mask=(x > 0).float())[1]
            mean, logvar = model_encoder.linear(pooled_hidden_fea).chunk(2, -1)
            z_real = mean.squeeze(1)

        # Evaluate and get losses
        z_fake = generator(noise)
        real_score = critic(z_real)
        fake_score = critic(z_fake)
        grad_penalty = compute_grad_penalty(critic, z_real.data, z_fake.data)
        c_loss = -torch.mean(real_score) + torch.mean(fake_score) + \
                 args.gp_lambda*grad_penalty

        fake_score = critic(generator(noise))
        g_loss = -torch.mean(fake_score)

        # Relative change of each loss w.r.t. the previous batch; the 0.001
        # keeps the denominator away from zero.
        r_g = abs(((g_loss.item() - g_loss_0) / (g_loss_0 + 0.001)))
        r_c = abs(((c_loss.item() - c_loss_0) / (c_loss_0 + 0.001)))

        # Update critic or generator
        if ((2 + epoch) / epoch) * r_c > r_g:
            c_optimizer.zero_grad()
            c_batches += 1
            c_train_loss += c_loss.item()
            c_loss.backward()
            c_optimizer.step()
        else:
            g_optimizer.zero_grad()
            g_batches += 1
            g_train_loss += g_loss.item()
            g_loss.backward()
            g_optimizer.step()

        c_loss_0 = c_loss.item()
        g_loss_0 = g_loss.item()

        if args.interval > 0 and i % args.interval == 0:
            logger.info('Epoch: {} | Batch: {}/{} ({:.0f}%) | G Loss: {:.6f} | C Loss: {:.6f}'.format(
                epoch, args.batch_size*i, len(train_loader.dataset),
                100.*(args.batch_size*i)/len(train_loader.dataset),
                g_loss.item(), c_loss.item()
            ))
            test_noise = torch.Tensor(np.random.normal(0, 1, (1, args.latent_size))).to(args.device)
            test_new_z = generator(test_noise).data
            # Decode one generated latent vector into text as a qualitative check
            test_z = rollout_test(model_decoder, test_new_z, tokenizer_decoder, args.max_seq_length, 1, 0, 1)
            logger.info("Text: {}".format(test_z))

    # True means over the updates actually performed. max(..., 1) only guards
    # the case of zero updates; the previous "+ 1" biased every average low.
    c_train_loss /= max(c_batches, 1)
    g_train_loss /= max(g_batches, 1)
    logger.info('* (Train) Epoch: {} | G Loss: {:.4f} | C Loss: {:.4f} | Updates G: {} | Updates C: {}'.format(
        epoch, g_train_loss, c_train_loss, g_batches, c_batches
    ))
    return (g_train_loss, c_train_loss)

def str2bool(value):
    """Parse a command-line boolean option.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, so any
    non-empty value (including ``"False"``) becomes True. This parser accepts
    the usual spellings instead.

    Args:
        value: A bool, or a string such as ``"True"``, ``"false"``, ``"1"``,
            ``"no"``. The empty string maps to False, as ``bool("")`` did.

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected, got {!r}".format(value))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=15)
    parser.add_argument('--lr', type=float, default=1e-4)
    # Gradient-penalty weight: a real-valued coefficient, so accept floats
    # (integer input such as "10" still parses).
    parser.add_argument('--gp_lambda', type=float, default=10)
    parser.add_argument('--n_layers', type=int, default=20, help="Number of layers of generator and critic")
    parser.add_argument('--block_dim', type=int, default=100)
    parser.add_argument('--interval', type=int, default=10, help="Steps before logging output")
    parser.add_argument('--cuda', type=str2bool, default=torch.cuda.is_available())

    # Optimus parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--valid_data_file", default=None, type=str, required=True,
                        help="The input validation data file (a text file).")
    parser.add_argument("--checkpoint_dir", default=None, type=str, required=True,
                        help="The directory where checkpoints are saved.")
    parser.add_argument('--generator_dir', default=None, type=str, help="Directory where GAN models are saved")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--dataset", default='Snli', type=str, help="The dataset.")
    parser.add_argument("--latent_size", default=32, type=int, help="Latent space dimension.")
    ## Encoder options
    parser.add_argument("--encoder_model_type", default="bert", type=str,
                        help="The encoder model architecture to be fine-tuned.")
    parser.add_argument("--encoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The encoder model checkpoint for weights initialization.")
    parser.add_argument("--encoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--encoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    ## Decoder options
    parser.add_argument("--decoder_model_type", default="gpt2", type=str,
                        help="The decoder model architecture to be fine-tuned.")
    # NOTE(review): the default below names a BERT checkpoint while the default
    # decoder type is gpt2 -- looks like a copy-paste; confirm before relying on it.
    parser.add_argument("--decoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The decoder model checkpoint for weights initialization.")
    parser.add_argument("--decoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--decoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--per_gpu_train_batch_size", default=1, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="Optional input sequence length before tokenization. The sequence will be dropped if it is longer the max_seq_length")

    ## Variational auto-encoder(check this)
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--padding_text", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--use_philly", action='store_true',
                        help="Use Philly for computing.")
    # The option name is misspelled ("gloabl") but is kept because the rest of
    # the script reads args.gloabl_step_eval.
    parser.add_argument('--gloabl_step_eval', type=int, default=661,
                        help="Evaluate the results at the given global step")
    # Reinforcement learning parameters
    parser.add_argument('--finetune_decoder', type=str2bool, default=True)
    parser.add_argument('--epochs_rl', type=int, default=1000)
    parser.add_argument('--batch_size_rl', type=int, default=32)
    parser.add_argument('--lr_rl', type=float, default=1e-6)

    # Parse a fixed argument list (this script is run from a notebook cell, so
    # the options are spelled out here rather than read from sys.argv).
    args = parser.parse_args("--dataset EMNLP \
    --checkpoint_dir=output_dir_yahoo \
    --output_dir=output_dir_yahoo \
    --encoder_model_type=bert \
    --encoder_model_name_or_path=bert-base-cased \
    --decoder_model_type=gpt2 \
    --decoder_model_name_or_path=gpt2 \
    --train_data_file=../../yahoo/unlabelled/train.txt \
    --valid_data_file=../../yahoo/unlabelled/test.txt \
    --per_gpu_train_batch_size 12 \
    --block_size 100 \
    --max_seq_length 24 \
    --gloabl_step_eval 24000 \
    --latent_size 32 \
    --block_dim 100 \
    --n_layers 10 \
    --interval 50 \
    --epochs 200 \
    --finetune_decoder True \
    --lr_rl 1e-6 \
    --epochs_rl 100 \
    --batch_size_rl 32".split())

    print(args)

    global_step = args.gloabl_step_eval

    torch.backends.cudnn.deterministic = True
    # Pin to the first GPU when CUDA is requested; otherwise fall back to CPU
    # instead of failing on a hard-coded "cuda:0".
    args.device = torch.device("cuda:0" if args.cuda else "cpu")
    args.n_gpu = 1 if args.cuda else 0
    # Seed every RNG the script draws from. `random` is used later to sample
    # the BLEU reference sentences, so it must be seeded for repeatable runs.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
    output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
    checkpoints = [ [output_encoder_dir, output_decoder_dir] ]

    # Load a trained Encoder model and vocabulary that you have fine-tuned
    encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
    model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
    tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)

    model_encoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

    # Load a trained Decoder model and vocabulary that you have fine-tuned
    decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
    model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
    model_decoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

    # Chunyuan: Add Padding token to GPT2
    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
    logger.info('We have added {} tokens to GPT2'.format(num_added_toks))
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    assert tokenizer_decoder.pad_token == '<PAD>'

    train_loader = build_dataload_and_cache_examples(args, [tokenizer_encoder, tokenizer_decoder])
    generator = Generator(args.n_layers, args.block_dim,args.latent_size)
    critic = Critic(args.n_layers, args.block_dim,args.latent_size)

    if args.generator_dir is not None:
        logger.info("Loading generator and critic")
        # map_location lets a checkpoint saved on GPU load on the selected device.
        generator.load_state_dict(torch.load(
            os.path.join(args.generator_dir, 'generator_'+str(args.gloabl_step_eval)+'.th'),
            map_location=args.device))
        critic.load_state_dict(torch.load(
            os.path.join(args.generator_dir, 'critic_'+str(args.gloabl_step_eval)+'.th'),
            map_location=args.device))

    # Move the networks to their device before building the optimizers, as the
    # torch.optim documentation recommends.
    generator = generator.to(args.device)
    critic = critic.to(args.device)

    g_optimizer = optim.Adam(generator.parameters(), lr=args.lr, betas=(0.5, 0.999))
    c_optimizer = optim.Adam(critic.parameters(), lr=args.lr, betas=(0.5, 0.999))

    logger.info('G Parameters:{}'.format(sum([p.numel() for p in generator.parameters() if \
                                p.requires_grad])))
    logger.info('C Parameters:{}'.format(sum([p.numel() for p in critic.parameters() if \
                                p.requires_grad])))

    device = args.device

    best_bleu = 0
    # Validation sentences, one per line, used as BLEU references.
    with open(args.valid_data_file, "r") as valid:
        reference = [sents.replace("\n", "") for sents in valid]
            
    # Each epoch has two phases:
    #   1. GAN-BERT style training of `discriminator` on top of `transformer`,
    #      using sentences decoded from the latent generator as fake data,
    #      followed by evaluation on `test_dataloader`.
    #   2. One epoch of latent GAN training (`train`), a BLEU check on freshly
    #      generated sentences, and checkpointing of the best generator/critic.
    # Names such as transformer, discriminator, train_dataloader,
    # test_dataloader, dis_optimizer, scheduler_d, epsilon, label_list,
    # format_time, generate_data_fake, training_stats and accuracy_array are
    # not defined in this section and must already exist.
    for epoch in range(1, args.epochs + 1):

        #Insert GAN-BERT Code Here

        print("Train classification discriminator")
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch, args.epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        tr_g_loss = 0
        tr_d_loss = 0

        # Put the model into training mode.
        transformer.train()
        #generator.train()
        discriminator.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every print_each_n_step batches.
            if step % print_each_n_step == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_label_mask = batch[3].to(device)

            real_batch_size = b_input_ids.shape[0]

            # Encode real data in the Transformer; keep the first-token state.
            model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
            hidden_states = model_outputs.last_hidden_state[:,0,:]

            # Generate fake data: sample latent vectors from the generator,
            # decode them to sentences, then encode those sentences with the
            # same transformer as the real data.
            fixed_noise = torch.Tensor(np.random.normal(0, 1, (real_batch_size, args.latent_size))).to(args.device)
            test_z_gb = generator(fixed_noise).data
            fake_sentences = rollout_test(model_decoder, test_z_gb, tokenizer_decoder, args.max_seq_length, real_batch_size, 0, 1)

            b_input_ids_fake, b_input_mask_fake = generate_data_fake(fake_sentences)
            model_outputs_fake = transformer(b_input_ids_fake, attention_mask=b_input_mask_fake)
            hidden_states_fake = model_outputs_fake.last_hidden_state[:,0,:]

            # Generate the output of the Discriminator for real and fake data.
            # First, we put together the real and fake representations
            disciminator_input = torch.cat([hidden_states, hidden_states_fake], dim=0)
            # Then, we select the output of the discriminator
            features, logits, probs = discriminator(disciminator_input)

            # Finally, we separate the discriminator's output for the real and fake
            # data
            features_list = torch.split(features, real_batch_size)
            D_real_features = features_list[0]
            D_fake_features = features_list[1]

            logits_list = torch.split(logits, real_batch_size)
            D_real_logits = logits_list[0]
            D_fake_logits = logits_list[1]

            probs_list = torch.split(probs, real_batch_size)
            D_real_probs = probs_list[0]
            D_fake_probs = probs_list[1]

            #---------------------------------
            #  LOSS evaluation
            #---------------------------------
            # Generator's LOSS estimation
            g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + epsilon))
            g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
            g_loss = g_loss_d + g_feat_reg

            # Discriminator's LOSS estimation (last logit column is excluded)
            logits = D_real_logits[:,0:-1]
            log_probs = F.log_softmax(logits, dim=-1)
            # The discriminator provides an output for labeled and unlabeled real data
            # so the loss evaluated for unlabeled data is ignored (masked)
            label2one_hot = torch.nn.functional.one_hot(b_labels, len(label_list))
            per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
            per_example_loss = torch.masked_select(per_example_loss, b_label_mask.to(device))
            labeled_example_count = per_example_loss.type(torch.float32).numel()

            # It may be the case that a batch does not contain labeled examples,
            # so the "supervised loss" in this case is not evaluated
            if labeled_example_count == 0:
              D_L_Supervised = 0
            else:
              D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

            D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
            D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
            d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

            #---------------------------------
            #  OPTIMIZATION
            #---------------------------------
            # Avoid gradient accumulation
            #gen_optimizer.zero_grad()
            dis_optimizer.zero_grad()

            # Calculate weight updates
            # retain_graph=True is required since the underlying graph will be deleted after backward
            # NOTE(review): no generator optimizer steps here, so g_loss.backward
            # only adds its gradients to whatever dis_optimizer updates -- confirm
            # this is intended.
            g_loss.backward(retain_graph=True)
            d_loss.backward()

            # Apply modifications
            #gen_optimizer.step()
            dis_optimizer.step()

            # Save the losses to print them later
            tr_g_loss += g_loss.item()
            tr_d_loss += d_loss.item()

            # Update the learning rate with the scheduler
            if apply_scheduler:
              scheduler_d.step()
              #scheduler_g.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss_g = tr_g_loss / len(train_dataloader)
        avg_train_loss_d = tr_d_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
        print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #     TEST ON THE EVALUATION DATASET
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our test set.
        print("")
        print("Running Test...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        transformer.eval() #maybe redundant
        discriminator.eval()
        #generator.eval()

        # Tracking variables
        total_test_accuracy = 0

        total_test_loss = 0
        nb_test_steps = 0

        all_preds = []
        all_labels_ids = []

        #loss
        nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

        # Evaluate data for one epoch
        for batch in test_dataloader:

            # Unpack this evaluation batch from our dataloader.
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():
                model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
                hidden_states = model_outputs.last_hidden_state[:,0,:]
                _, logits, probs = discriminator(hidden_states)
                filtered_logits = logits[:,0:-1]
                # Accumulate the test loss.
                total_test_loss += nll_loss(filtered_logits, b_labels)

            # Accumulate the predictions and the input labels
            _, preds = torch.max(filtered_logits, 1)
            all_preds += preds.detach().cpu()
            all_labels_ids += b_labels.detach().cpu()

        # Report the final accuracy for this validation run.
        all_preds = torch.stack(all_preds).numpy()
        all_labels_ids = torch.stack(all_labels_ids).numpy()
        test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
        print("  Accuracy: {0:.3f}".format(test_accuracy))

        # Calculate the average loss over all of the batches.
        avg_test_loss = total_test_loss / len(test_dataloader)
        avg_test_loss = avg_test_loss.item()

        # Measure how long the validation run took.
        test_time = format_time(time.time() - t0)

        print("  Test Loss: {0:.3f}".format(avg_test_loss))
        print("  Test took: {:}".format(test_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                # `epoch` is already 1-based in this loop, so it is stored as-is
                # (the previous `epoch + 1` recorded 2..N+1).
                'epoch': epoch,
                'Training Loss generator': avg_train_loss_g,
                'Training Loss discriminator': avg_train_loss_d,
                'Valid. Loss': avg_test_loss,
                'Valid. Accur.': test_accuracy,
                'Training Time': training_time,
                'Test Time': test_time
            }
        )

        accuracy_array.append(test_accuracy)

        #OPTAGAN Code

        g_loss, c_loss = train(epoch)

        # Decode 2 x 250 generated latent vectors into sentences for BLEU.
        data_test = list()
        for i in range(2):
            test_noise = torch.Tensor(np.random.normal(0, 1, (250, args.latent_size))).to(args.device)
            test_z = generator(test_noise).data
            print(test_z)
            new_sent = rollout_test(model_decoder, test_z, tokenizer_decoder, args.max_seq_length, 250, 0, 1)
            data_test.extend(new_sent)

        # Score generated text against 500 sampled references, in both directions.
        p_reference = random.sample(reference, 500)
        bleu = calc_blue_parallel_func(p_reference, data_test, 2, 500)
        b_bleu = calc_blue_parallel_func(data_test, p_reference, 2, 500)
        logger.info("Bleu-2:{:0.3f} | B-Bleu-2:{:0.3f}".format(bleu, b_bleu))

        print(bleu+b_bleu)
        if (bleu+b_bleu) > best_bleu:
            best_bleu = bleu + b_bleu
            logger.info('* Saving. Best Score:{:0.3f} | Bleu-2:{:0.3f} | B-Bleu-2:{:0.3f}'.format(best_bleu, bleu, b_bleu))
            # Make sure the target directory exists before writing checkpoints.
            os.makedirs(args.output_dir, exist_ok=True)
            torch.save(generator.state_dict(), os.path.join(args.output_dir, 'generator_'+str(args.gloabl_step_eval)+'.th'))
            torch.save(critic.state_dict(), os.path.join(args.output_dir, 'critic_'+str(args.gloabl_step_eval)+'.th'))
            
        

    # Optional decoder fine-tuning. The best saved generator is reloaded and
    # kept in eval mode; each step decodes a batch of generated latents with
    # `rollout`, scores every sentence with BLEU against sampled references,
    # and fits the summed per-token values to those scores with an L1 loss.
    # From step 200 on, a log-prob * discounted-value policy term is added and
    # the optimizer is rebuilt with the --lr_rl learning rate.
    if args.finetune_decoder:
        logger.info("Loading generator")
        generator.load_state_dict(torch.load(args.output_dir+'/generator_'+str(args.gloabl_step_eval)+'.th'))

        model_decoder.train()
        generator.eval()
        dec_optimizer = optim.Adam(model_decoder.parameters(), lr=1e-4, betas=(0.5, 0.999))
        value_loss = nn.L1Loss()
        B = args.batch_size_rl
        total_scores = 0
        total_entropy = 0
        total_values = 0
        total_v_loss = 0
        # Number of steps accumulated since the last log line, so the logged
        # means are correct even for the first (single-step) window.
        steps_since_log = 0
        for epoch_ in range(args.epochs_rl):
            if epoch_ == 200:
                # Finetune decoder after training of value head
                dec_optimizer = optim.Adam(model_decoder.parameters(), lr=args.lr_rl, betas=(0.5, 0.999))
            noise = torch.from_numpy(np.random.normal(0, 1, (B, args.latent_size))).float()
            noise = noise.to(args.device)
            z_fake = generator(noise)
            sents, logprobs, values, entropy = rollout(model_decoder, z_fake, tokenizer_decoder, args.max_seq_length, B, 1)
            p_reference = random.sample(reference, 500)

            # One BLEU score per generated sentence.
            blue = [calc_blue_parallel_func(p_reference, [sent], 1, 0) for sent in sents]

            values = torch.stack(values, dim=1)
            logprobs = torch.stack(logprobs, dim=1)
            entropy = torch.stack(entropy, dim=1)

            # Get tokens and mask of batch
            # NOTE(review): 50258 / 50259 are presumably the ids of the added
            # <BOS> / <EOS> tokens -- confirm against the tokenizer.
            toks_gpt = [([50258] + tokenizer_decoder.encode(j) + [50259]) for j in sents]
            toks_gpt, mask = pad_seq(toks_gpt, tokenizer_decoder.encode("<PAD>")[0], values.size(1)+1)
            toks_gpt = torch.tensor(toks_gpt).to(args.device)
            mask = torch.tensor(mask).to(args.device)

            # Zero out the positions the mask marks as padding.
            values = values * mask[:,1:]
            logprobs = logprobs * mask[:,1:]
            entropy = entropy * mask[:,1:]
            scores = torch.tensor(blue).to(args.device)
            # Get value loss
            v_loss = value_loss(torch.sum(values, dim=1), scores)

            if epoch_ >= 200:
                # Discount future rewards back to the present using gamma.
                # Convert to Python lists once instead of once per row.
                value_rows = values.tolist()
                rewards = []
                for row in value_rows:
                    R = 0
                    batch_rewards = []
                    for r in reversed(row):
                        R = r + 0.99 * R
                        batch_rewards.append(R)
                    batch_rewards.reverse()
                    rewards.append(batch_rewards)

                # Penalizing low entropy states
                rewards = torch.FloatTensor(rewards).to(args.device)
                rewards = rewards + torch.log(torch.clamp(entropy,0.2,1))
                # Calculate loss
                d_loss = torch.sum(torch.mul(logprobs, rewards.detach()).mul(-1))
            else:
                # Value-head warm-up: no policy term yet.
                d_loss = torch.tensor(0)

            # Backpropagate losses
            loss = v_loss + d_loss
            dec_optimizer.zero_grad()
            loss.backward()
            dec_optimizer.step()

            total_scores += torch.mean(scores).item()
            total_values += torch.mean(torch.sum(values,-1)).item()
            total_v_loss += v_loss.item()
            total_entropy += torch.mean(torch.mean(entropy,dim=1)).item()
            steps_since_log += 1
            # Same `interval > 0` guard as train(), so --interval 0 disables
            # logging instead of raising ZeroDivisionError.
            if args.interval > 0 and (epoch_ % args.interval) == 0:
                logger.info("Epoch {}/{} | Value Loss:{} | Mean values:{} | Mean BLEU scores:{} | Mean Entropy: {}".format(epoch_,
                args.epochs_rl, total_v_loss/steps_since_log, total_values/steps_since_log, total_scores/steps_since_log, total_entropy/steps_since_log))
                total_scores = 0
                total_values = 0
                total_v_loss = 0
                total_entropy = 0
                steps_since_log = 0
        logger.info("Saving decoder")
        output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
        os.makedirs(output_decoder_dir, exist_ok=True)
        model_decoder.save_pretrained(output_decoder_dir)
        torch.save(args, os.path.join(output_decoder_dir, 'training_encoder_args.bin'))

[nltk_data] Downloading package punkt to /home/harry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


06/16/2022 13:58:29 - INFO - func.configuration_utils -   loading configuration file output_dir_yahoo/checkpoint-encoder-24000/config.json
06/16/2022 13:58:29 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

06/16/2022 13:58:29 - INFO - func.modeling_utils -   loading weights file output_dir_yahoo/checkpoint-encoder-24000/pytorch_model.bin


Namespace(batch_size_rl=32, block_dim=100, block_size=100, checkpoint_dir='output_dir_yahoo', cuda=True, dataset='EMNLP', decoder_config_name='', decoder_model_name_or_path='gpt2', decoder_model_type='gpt2', decoder_tokenizer_name='', do_lower_case=False, encoder_config_name='', encoder_model_name_or_path='bert-base-cased', encoder_model_type='bert', encoder_tokenizer_name='', epochs=200, epochs_rl=100, finetune_decoder=True, generator_dir=None, gloabl_step_eval=24000, gp_lambda=10, interval=50, latent_size=32, length=20, lr=0.0001, lr_rl=1e-06, max_seq_length=24, n_layers=10, output_dir='output_dir_yahoo', padding_text='', per_gpu_train_batch_size=12, prompt='', seed=0, train_data_file='../../yahoo/unlabelled/train.txt', use_philly=False, valid_data_file='../../yahoo/unlabelled/test.txt')


06/16/2022 13:58:32 - INFO - func.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/harry/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
06/16/2022 13:58:32 - INFO - func.configuration_utils -   loading configuration file output_dir_yahoo/checkpoint-decoder-24000/config.json
06/16/2022 13:58:32 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "latent_size": 32,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "sum

<utils.BucketingDataLoader object at 0x7fcc64e017c0>
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:20.
  Batch    20  of    172.    Elapsed: 0:00:39.
  Batch    30  of    172.    Elapsed: 0:00:57.
  Batch    40  of    172.    Elapsed: 0:01:15.
  Batch    50  of    172.    Elapsed: 0:01:33.
  Batch    60  of    172.    Elapsed: 0:01:52.
  Batch    70  of    172.    Elapsed: 0:02:09.
  Batch    80  of    172.    Elapsed: 0:02:27.
  Batch    90  of    172.    Elapsed: 0:02:45.
  Batch   100  of    172.    Elapsed: 0:03:03.
  Batch   170  of    172.    Elapsed: 0:05:09.

  Average training loss generetor: 0.707
  Average training loss discriminator: 2.228
  Training epcoh took: 0:05:13

Running Test...


06/16/2022 14:03:52 - INFO - __main__ -   Epoch: 1 | Batch: 0/120000 (0%) | G Loss: 2.980527 | C Loss: 0.571864
06/16/2022 14:03:52 - INFO - __main__ -   Text: ['in women sexual development in Sanada queina?']


  Accuracy: 0.422
  Test Loss: 1.864
  Test took: 0:00:01


06/16/2022 14:03:54 - INFO - __main__ -   Epoch: 1 | Batch: 600/120000 (0%) | G Loss: 11.048007 | C Loss: -9.614460
06/16/2022 14:03:54 - INFO - __main__ -   Text: ['in female sex in a bisexual life in the male?']
06/16/2022 14:03:55 - INFO - __main__ -   Epoch: 1 | Batch: 1200/120000 (1%) | G Loss: 6.290960 | C Loss: -7.688527
06/16/2022 14:03:55 - INFO - __main__ -   Text: ['in the sexual virity of young African in world life..??']
06/16/2022 14:03:57 - INFO - __main__ -   Epoch: 1 | Batch: 1800/120000 (2%) | G Loss: 2.615803 | C Loss: -4.154205
06/16/2022 14:03:57 - INFO - __main__ -   Text: ['what is theosis in mississippi woman??']
06/16/2022 14:03:58 - INFO - __main__ -   Epoch: 1 | Batch: 2400/120000 (2%) | G Loss: -1.330492 | C Loss: -1.301166
06/16/2022 14:03:58 - INFO - __main__ -   Text: ['What does it all mean to be yo religion godmother?']
06/16/2022 14:03:59 - INFO - __main__ -   Epoch: 1 | Batch: 3000/120000 (2%) | G Loss: -0.612423 | C Loss: -2.432378
06/16/2022 14:03:5

06/16/2022 14:04:48 - INFO - __main__ -   Text: ['Has anyone seen History and built a statue?']
06/16/2022 14:04:49 - INFO - __main__ -   Epoch: 1 | Batch: 24600/120000 (20%) | G Loss: -1.688596 | C Loss: -1.010111
06/16/2022 14:04:49 - INFO - __main__ -   Text: ['Making up my lungs?']
06/16/2022 14:04:51 - INFO - __main__ -   Epoch: 1 | Batch: 25200/120000 (21%) | G Loss: 1.764206 | C Loss: -1.532498
06/16/2022 14:04:51 - INFO - __main__ -   Text: ['What is it worth in the century saturn?']
06/16/2022 14:04:52 - INFO - __main__ -   Epoch: 1 | Batch: 25800/120000 (22%) | G Loss: 1.518203 | C Loss: -2.148867
06/16/2022 14:04:52 - INFO - __main__ -   Text: ['wat is awe quotes?']
06/16/2022 14:04:53 - INFO - __main__ -   Epoch: 1 | Batch: 26400/120000 (22%) | G Loss: -1.930583 | C Loss: -1.114608
06/16/2022 14:04:54 - INFO - __main__ -   Text: ['What are the starting resources for an Essay student?']
06/16/2022 14:04:55 - INFO - __main__ -   Epoch: 1 | Batch: 27000/120000 (22%) | G Loss: 

06/16/2022 14:05:42 - INFO - __main__ -   Text: ['Can I start an individual tissue in 1986?']
06/16/2022 14:05:44 - INFO - __main__ -   Epoch: 1 | Batch: 48000/120000 (40%) | G Loss: -0.676107 | C Loss: -0.910606
06/16/2022 14:05:44 - INFO - __main__ -   Text: ['Who know of this Poolrunner??']
06/16/2022 14:05:45 - INFO - __main__ -   Epoch: 1 | Batch: 48600/120000 (40%) | G Loss: -0.013287 | C Loss: -0.250920
06/16/2022 14:05:45 - INFO - __main__ -   Text: ['At what age do you deserve to know guys are their BEST?']
06/16/2022 14:05:46 - INFO - __main__ -   Epoch: 1 | Batch: 49200/120000 (41%) | G Loss: 0.313873 | C Loss: -1.230368
06/16/2022 14:05:47 - INFO - __main__ -   Text: ['what doesakura ogame wake up from one night stand??']
06/16/2022 14:05:48 - INFO - __main__ -   Epoch: 1 | Batch: 49800/120000 (42%) | G Loss: 0.330857 | C Loss: -1.021791
06/16/2022 14:05:48 - INFO - __main__ -   Text: ['tilt your hand if getting into serious stuff?']
06/16/2022 14:05:49 - INFO - __main__ - 

06/16/2022 14:06:39 - INFO - __main__ -   Epoch: 1 | Batch: 71400/120000 (60%) | G Loss: -0.432461 | C Loss: -0.637807
06/16/2022 14:06:39 - INFO - __main__ -   Text: ['Could I learn from Chinese Imperial Explorer Sentries to Easton?']
06/16/2022 14:06:40 - INFO - __main__ -   Epoch: 1 | Batch: 72000/120000 (60%) | G Loss: 3.197965 | C Loss: -1.486767
06/16/2022 14:06:41 - INFO - __main__ -   Text: ['who doesnclear information?']
06/16/2022 14:06:42 - INFO - __main__ -   Epoch: 1 | Batch: 72600/120000 (60%) | G Loss: -1.119115 | C Loss: -0.644078
06/16/2022 14:06:42 - INFO - __main__ -   Text: ['What should I ask other Humans.?']
06/16/2022 14:06:43 - INFO - __main__ -   Epoch: 1 | Batch: 73200/120000 (61%) | G Loss: -1.835280 | C Loss: -0.766771
06/16/2022 14:06:43 - INFO - __main__ -   Text: ['what is bad Ann Coulter music about?']
06/16/2022 14:06:45 - INFO - __main__ -   Epoch: 1 | Batch: 73800/120000 (62%) | G Loss: -2.377273 | C Loss: -0.617347
06/16/2022 14:06:45 - INFO - __main

06/16/2022 14:07:35 - INFO - __main__ -   Text: ['What type of STOP is the september 2 feeding cans?']
06/16/2022 14:07:36 - INFO - __main__ -   Epoch: 1 | Batch: 95400/120000 (80%) | G Loss: 0.216676 | C Loss: -0.236606
06/16/2022 14:07:36 - INFO - __main__ -   Text: ['where is the dower y get and?']
06/16/2022 14:07:37 - INFO - __main__ -   Epoch: 1 | Batch: 96000/120000 (80%) | G Loss: 2.627959 | C Loss: -1.714558
06/16/2022 14:07:37 - INFO - __main__ -   Text: ['If I can get some Guys?']
06/16/2022 14:07:39 - INFO - __main__ -   Epoch: 1 | Batch: 96600/120000 (80%) | G Loss: -0.452325 | C Loss: -0.567764
06/16/2022 14:07:39 - INFO - __main__ -   Text: ["What's the best World Cup feeling to you?"]
06/16/2022 14:07:40 - INFO - __main__ -   Epoch: 1 | Batch: 97200/120000 (81%) | G Loss: 3.544294 | C Loss: -1.901793
06/16/2022 14:07:40 - INFO - __main__ -   Text: ['How do I recover from Oprah??']
06/16/2022 14:07:41 - INFO - __main__ -   Epoch: 1 | Batch: 97800/120000 (82%) | G Loss: -

06/16/2022 14:08:30 - INFO - __main__ -   Text: ["how many r you child's top 2 get this year?"]
06/16/2022 14:08:31 - INFO - __main__ -   Epoch: 1 | Batch: 118800/120000 (99%) | G Loss: -0.776697 | C Loss: -1.160295
06/16/2022 14:08:31 - INFO - __main__ -   Text: ['Was u sure as !?']
06/16/2022 14:08:32 - INFO - __main__ -   Epoch: 1 | Batch: 119400/120000 (100%) | G Loss: -0.602786 | C Loss: -0.735098
06/16/2022 14:08:33 - INFO - __main__ -   Text: ['where does i need my ads for free in pc?']
06/16/2022 14:08:34 - INFO - __main__ -   * (Train) Epoch: 1 | G Loss: -0.0350 | C Loss: -1.2542 | Updates G: 2035 | Updates C: 7965


tensor([[-0.1809,  0.4851,  0.2624,  ..., -0.1999, -0.0361,  0.1617],
        [ 0.5997,  0.0539,  0.1853,  ..., -0.8519,  1.1720, -0.3175],
        [-0.9002,  0.0969,  0.4832,  ...,  0.4666,  0.2755,  0.0935],
        ...,
        [ 0.1907,  0.0785,  0.5279,  ...,  1.2622,  0.5928, -0.8267],
        [-0.2517, -1.8863, -0.4236,  ...,  0.1106, -0.3282,  0.6933],
        [-0.3633,  0.2909, -0.4032,  ...,  1.6263, -0.6954,  0.1861]],
       device='cuda:0')
tensor([[-1.2645,  0.0242, -1.1121,  ...,  0.5345, -0.7096, -0.4492],
        [ 0.2689,  0.0913, -0.4995,  ...,  1.1069, -1.1632, -0.1163],
        [-0.6734, -0.2141,  0.4097,  ...,  0.0440,  1.5259, -0.0029],
        ...,
        [-1.4852,  0.0820, -1.2612,  ...,  0.6655,  0.0902, -0.7081],
        [ 1.0144,  0.3194, -0.0371,  ..., -0.0303, -0.6635, -0.1417],
        [-0.7203, -0.5807,  2.3402,  ..., -0.6814,  1.6594, -0.0157]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

06/16/2022 14:08:40 - INFO - __main__ -   Bleu-2:0.326 | B-Bleu-2:0.367
06/16/2022 14:08:40 - INFO - __main__ -   * Saving. Best Score:0.693 | Bleu-2:0.326 | B-Bleu-2:0.367


0.692808299172532
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:17.
  Batch    20  of    172.    Elapsed: 0:00:33.
  Batch    30  of    172.    Elapsed: 0:00:48.
  Batch    40  of    172.    Elapsed: 0:01:04.
  Batch    50  of    172.    Elapsed: 0:01:20.
  Batch    60  of    172.    Elapsed: 0:01:36.
  Batch    70  of    172.    Elapsed: 0:01:52.
  Batch    80  of    172.    Elapsed: 0:02:08.
  Batch    90  of    172.    Elapsed: 0:02:24.
  Batch   100  of    172.    Elapsed: 0:02:41.
  Batch   110  of    172.    Elapsed: 0:02:57.
  Batch   120  of    172.    Elapsed: 0:03:14.
  Batch   130  of    172.    Elapsed: 0:03:31.
  Batch   140  of    172.    Elapsed: 0:03:47.
  Batch   150  of    172.    Elapsed: 0:04:05.
  Batch   160  of    172.    Elapsed: 0:04:21.
  Batch   170  of    172.    Elapsed: 0:04:37.

  Average training loss generetor: 0.513
  Average training loss discriminator: 1.437
  Training epcoh took: 0:04:40

Running Test...


06/16/2022 14:13:21 - INFO - __main__ -   Epoch: 2 | Batch: 0/120000 (0%) | G Loss: -0.553460 | C Loss: -0.817488
06/16/2022 14:13:21 - INFO - __main__ -   Text: ['What companies will be my Booker?']


  Accuracy: 0.393
  Test Loss: 1.946
  Test took: 0:00:01


06/16/2022 14:13:22 - INFO - __main__ -   Epoch: 2 | Batch: 600/120000 (0%) | G Loss: 2.397423 | C Loss: -0.870881
06/16/2022 14:13:22 - INFO - __main__ -   Text: ['Going electric car replacement in main doors faster?']
06/16/2022 14:13:24 - INFO - __main__ -   Epoch: 2 | Batch: 1200/120000 (1%) | G Loss: 0.417593 | C Loss: -1.151418
06/16/2022 14:13:24 - INFO - __main__ -   Text: ['What is the best level to obtain the data?']
06/16/2022 14:13:25 - INFO - __main__ -   Epoch: 2 | Batch: 1800/120000 (2%) | G Loss: -1.787995 | C Loss: 0.360184
06/16/2022 14:13:25 - INFO - __main__ -   Text: ['Are insects very dangerous in a human race?']
06/16/2022 14:13:27 - INFO - __main__ -   Epoch: 2 | Batch: 2400/120000 (2%) | G Loss: 4.695592 | C Loss: -0.572437
06/16/2022 14:13:27 - INFO - __main__ -   Text: ['Have you ever married another little boy but letting your dog throw me in the dark?']
06/16/2022 14:13:28 - INFO - __main__ -   Epoch: 2 | Batch: 3000/120000 (2%) | G Loss: 2.983103 | C Loss:

06/16/2022 14:14:17 - INFO - __main__ -   Text: ["speed Error dropping math c'trples?"]
06/16/2022 14:14:19 - INFO - __main__ -   Epoch: 2 | Batch: 24600/120000 (20%) | G Loss: -1.091478 | C Loss: -0.813490
06/16/2022 14:14:19 - INFO - __main__ -   Text: ['What moral is the globe ganger?']
06/16/2022 14:14:20 - INFO - __main__ -   Epoch: 2 | Batch: 25200/120000 (21%) | G Loss: 2.978771 | C Loss: -0.500775
06/16/2022 14:14:20 - INFO - __main__ -   Text: ['Listering beach site?']
06/16/2022 14:14:22 - INFO - __main__ -   Epoch: 2 | Batch: 25800/120000 (22%) | G Loss: -1.183906 | C Loss: -0.327748
06/16/2022 14:14:22 - INFO - __main__ -   Text: ['where want i rent out discounting my porny center ?']
06/16/2022 14:14:23 - INFO - __main__ -   Epoch: 2 | Batch: 26400/120000 (22%) | G Loss: 3.837474 | C Loss: -0.868104
06/16/2022 14:14:23 - INFO - __main__ -   Text: ['is anyone from the US voting republican?']
06/16/2022 14:14:25 - INFO - __main__ -   Epoch: 2 | Batch: 27000/120000 (22%) | G 

06/16/2022 14:15:15 - INFO - __main__ -   Epoch: 2 | Batch: 48000/120000 (40%) | G Loss: 3.994416 | C Loss: -1.103137
06/16/2022 14:15:15 - INFO - __main__ -   Text: ["how to do when God doesn't work?"]
06/16/2022 14:15:16 - INFO - __main__ -   Epoch: 2 | Batch: 48600/120000 (40%) | G Loss: 0.851043 | C Loss: -0.926834
06/16/2022 14:15:17 - INFO - __main__ -   Text: ['How could udder Pronom binopes be participated in?']
06/16/2022 14:15:18 - INFO - __main__ -   Epoch: 2 | Batch: 49200/120000 (41%) | G Loss: -1.107704 | C Loss: -0.936876
06/16/2022 14:15:18 - INFO - __main__ -   Text: ['Why does the world have a negative purpose?']
06/16/2022 14:15:19 - INFO - __main__ -   Epoch: 2 | Batch: 49800/120000 (42%) | G Loss: 1.012654 | C Loss: -1.089078
06/16/2022 14:15:19 - INFO - __main__ -   Text: ['how do i am a fan of captain cricket?']
06/16/2022 14:15:21 - INFO - __main__ -   Epoch: 2 | Batch: 50400/120000 (42%) | G Loss: 1.655448 | C Loss: -0.393721
06/16/2022 14:15:21 - INFO - __main

06/16/2022 14:16:12 - INFO - __main__ -   Epoch: 2 | Batch: 72000/120000 (60%) | G Loss: 4.590646 | C Loss: -0.772807
06/16/2022 14:16:12 - INFO - __main__ -   Text: ['How shall do this racing stops which come back?']
06/16/2022 14:16:13 - INFO - __main__ -   Epoch: 2 | Batch: 72600/120000 (60%) | G Loss: 4.861483 | C Loss: -2.092507
06/16/2022 14:16:13 - INFO - __main__ -   Text: ['why is the load pulled from different methods?']
06/16/2022 14:16:15 - INFO - __main__ -   Epoch: 2 | Batch: 73200/120000 (61%) | G Loss: -3.357095 | C Loss: -1.143167
06/16/2022 14:16:15 - INFO - __main__ -   Text: ['how do u ask an e-fiction?']
06/16/2022 14:16:16 - INFO - __main__ -   Epoch: 2 | Batch: 73800/120000 (62%) | G Loss: -0.501074 | C Loss: -0.425337
06/16/2022 14:16:16 - INFO - __main__ -   Text: ['italian puiseworthy ear and a neck?']
06/16/2022 14:16:18 - INFO - __main__ -   Epoch: 2 | Batch: 74400/120000 (62%) | G Loss: 1.412008 | C Loss: -1.177427
06/16/2022 14:16:18 - INFO - __main__ -   

06/16/2022 14:17:08 - INFO - __main__ -   Text: ["Whang is that used to mean thing? and aren't?"]
06/16/2022 14:17:10 - INFO - __main__ -   Epoch: 2 | Batch: 96000/120000 (80%) | G Loss: -1.454218 | C Loss: -0.579423
06/16/2022 14:17:10 - INFO - __main__ -   Text: ['How long can Pink?']
06/16/2022 14:17:11 - INFO - __main__ -   Epoch: 2 | Batch: 96600/120000 (80%) | G Loss: -0.311008 | C Loss: -0.341201
06/16/2022 14:17:11 - INFO - __main__ -   Text: ['how will it feel to get a good holiday at work?']
06/16/2022 14:17:13 - INFO - __main__ -   Epoch: 2 | Batch: 97200/120000 (81%) | G Loss: 2.453820 | C Loss: -0.829272
06/16/2022 14:17:13 - INFO - __main__ -   Text: ['What printer name has 256 letters ’ datapro’ (a)?']
06/16/2022 14:17:14 - INFO - __main__ -   Epoch: 2 | Batch: 97800/120000 (82%) | G Loss: 0.022175 | C Loss: -0.586808
06/16/2022 14:17:14 - INFO - __main__ -   Text: ['what are the integral functions?']
06/16/2022 14:17:16 - INFO - __main__ -   Epoch: 2 | Batch: 98400/1200

06/16/2022 14:18:05 - INFO - __main__ -   Text: ['What does it take to YOUR expires ?']
06/16/2022 14:18:06 - INFO - __main__ -   Epoch: 2 | Batch: 119400/120000 (100%) | G Loss: 3.329620 | C Loss: -0.505288
06/16/2022 14:18:07 - INFO - __main__ -   Text: ['tell me what i am a little girl scared of?']
06/16/2022 14:18:08 - INFO - __main__ -   * (Train) Epoch: 2 | G Loss: 0.2691 | C Loss: -0.7679 | Updates G: 1650 | Updates C: 8350


tensor([[-0.4255, -1.5460,  1.2001,  ..., -0.4040, -0.9995, -0.4735],
        [-0.2868, -0.1329, -0.4028,  ...,  0.4644, -0.6778, -0.4481],
        [ 1.5984, -0.5684, -0.5609,  ..., -0.4853, -0.8015, -0.2660],
        ...,
        [-0.0268,  0.3430,  0.4511,  ...,  0.4447, -1.7681,  0.6871],
        [-0.2243,  0.8861,  0.0234,  ...,  0.0982, -1.3944, -0.4630],
        [ 2.4371, -0.6525, -1.4260,  ..., -0.9246,  0.0900,  0.1241]],
       device='cuda:0')
tensor([[ 0.9379, -0.1197, -0.5416,  ...,  1.0833, -1.6124,  0.2594],
        [-1.1773,  0.4008, -0.5584,  ...,  0.2546, -0.1619, -0.5293],
        [-0.0473, -0.1295, -0.1380,  ..., -0.4747,  0.2341, -0.9833],
        ...,
        [-0.0739, -0.1380,  0.1899,  ..., -0.0770,  0.3154, -0.7454],
        [-1.0067, -0.2315,  0.3619,  ..., -0.1387, -0.4976, -0.0440],
        [-0.8132, -0.0835,  0.7796,  ..., -0.1652,  1.3362, -1.7761]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

06/16/2022 14:18:17 - INFO - __main__ -   Bleu-2:0.373 | B-Bleu-2:0.403
06/16/2022 14:18:17 - INFO - __main__ -   * Saving. Best Score:0.776 | Bleu-2:0.373 | B-Bleu-2:0.403


0.7760117999379044
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:18.
  Batch    20  of    172.    Elapsed: 0:00:34.
  Batch    30  of    172.    Elapsed: 0:00:50.
  Batch    40  of    172.    Elapsed: 0:01:06.
  Batch    50  of    172.    Elapsed: 0:01:22.
  Batch    60  of    172.    Elapsed: 0:01:38.
  Batch    70  of    172.    Elapsed: 0:01:53.
  Batch    80  of    172.    Elapsed: 0:02:10.
  Batch    90  of    172.    Elapsed: 0:02:27.
  Batch   100  of    172.    Elapsed: 0:02:44.
  Batch   110  of    172.    Elapsed: 0:03:00.
  Batch   120  of    172.    Elapsed: 0:03:17.
  Batch   130  of    172.    Elapsed: 0:03:33.
  Batch   140  of    172.    Elapsed: 0:03:49.
  Batch   150  of    172.    Elapsed: 0:04:06.
  Batch   160  of    172.    Elapsed: 0:04:23.
  Batch   170  of    172.    Elapsed: 0:04:41.

  Average training loss generetor: 0.535
  Average training loss discriminator: 1.242
  Training epcoh took: 0:04:44

Running Test...

06/16/2022 14:23:02 - INFO - __main__ -   Epoch: 3 | Batch: 0/120000 (0%) | G Loss: 0.856994 | C Loss: -0.733540
06/16/2022 14:23:02 - INFO - __main__ -   Text: ["name a really good one person that's going to raise your friends?"]


  Accuracy: 0.417
  Test Loss: 1.770
  Test took: 0:00:01


06/16/2022 14:23:03 - INFO - __main__ -   Epoch: 3 | Batch: 600/120000 (0%) | G Loss: -2.041191 | C Loss: -0.554485
06/16/2022 14:23:03 - INFO - __main__ -   Text: ['How I need to read a year of within?']
06/16/2022 14:23:05 - INFO - __main__ -   Epoch: 3 | Batch: 1200/120000 (1%) | G Loss: -0.298803 | C Loss: -0.292789
06/16/2022 14:23:05 - INFO - __main__ -   Text: ['how do those "donkeys kick it to." words?']
06/16/2022 14:23:06 - INFO - __main__ -   Epoch: 3 | Batch: 1800/120000 (2%) | G Loss: 2.265612 | C Loss: -0.360829
06/16/2022 14:23:06 - INFO - __main__ -   Text: ['what is the " Effexorcert"?']
06/16/2022 14:23:07 - INFO - __main__ -   Epoch: 3 | Batch: 2400/120000 (2%) | G Loss: 10.303062 | C Loss: -6.274389
06/16/2022 14:23:08 - INFO - __main__ -   Text: ['where is the point how can i get people to know the for the test?']
06/16/2022 14:23:09 - INFO - __main__ -   Epoch: 3 | Batch: 3000/120000 (2%) | G Loss: -1.566651 | C Loss: -0.060996
06/16/2022 14:23:09 - INFO - __main_

06/16/2022 14:24:01 - INFO - __main__ -   Epoch: 3 | Batch: 24600/120000 (20%) | G Loss: -3.021806 | C Loss: -0.539924
06/16/2022 14:24:02 - INFO - __main__ -   Text: ['what was the host to show for the weeklies?']
06/16/2022 14:24:03 - INFO - __main__ -   Epoch: 3 | Batch: 25200/120000 (21%) | G Loss: 5.577167 | C Loss: -0.408099
06/16/2022 14:24:03 - INFO - __main__ -   Text: ['Do u?!?']
06/16/2022 14:24:04 - INFO - __main__ -   Epoch: 3 | Batch: 25800/120000 (22%) | G Loss: 6.006683 | C Loss: -0.875365
06/16/2022 14:24:04 - INFO - __main__ -   Text: ["Whr think Officer's?"]
06/16/2022 14:24:06 - INFO - __main__ -   Epoch: 3 | Batch: 26400/120000 (22%) | G Loss: 5.102259 | C Loss: -1.070178
06/16/2022 14:24:06 - INFO - __main__ -   Text: ['Why sentences brnder?']
06/16/2022 14:24:07 - INFO - __main__ -   Epoch: 3 | Batch: 27000/120000 (22%) | G Loss: 1.535164 | C Loss: -0.536294
06/16/2022 14:24:08 - INFO - __main__ -   Text: ['What is the duty to unease rearview?']
06/16/2022 14:24:

06/16/2022 14:24:57 - INFO - __main__ -   Text: ['do u think the future leader can be located?']
06/16/2022 14:24:59 - INFO - __main__ -   Epoch: 3 | Batch: 48000/120000 (40%) | G Loss: -1.795484 | C Loss: 0.074731
06/16/2022 14:24:59 - INFO - __main__ -   Text: ['how is it possible that over-eating have a loss of storage?']
06/16/2022 14:25:00 - INFO - __main__ -   Epoch: 3 | Batch: 48600/120000 (40%) | G Loss: -4.329081 | C Loss: -0.221348
06/16/2022 14:25:00 - INFO - __main__ -   Text: ['what does barcode means ?']
06/16/2022 14:25:02 - INFO - __main__ -   Epoch: 3 | Batch: 49200/120000 (41%) | G Loss: 0.544824 | C Loss: -0.127038
06/16/2022 14:25:02 - INFO - __main__ -   Text: ['How many Muslims believe?']
06/16/2022 14:25:03 - INFO - __main__ -   Epoch: 3 | Batch: 49800/120000 (42%) | G Loss: 5.096611 | C Loss: -0.278727
06/16/2022 14:25:03 - INFO - __main__ -   Text: ['Who knows the man kingdom???']
06/16/2022 14:25:05 - INFO - __main__ -   Epoch: 3 | Batch: 50400/120000 (42%) | 

06/16/2022 14:25:55 - INFO - __main__ -   Text: ['population of souls means perfect?']
06/16/2022 14:25:56 - INFO - __main__ -   Epoch: 3 | Batch: 71400/120000 (60%) | G Loss: -3.986520 | C Loss: -0.131828
06/16/2022 14:25:56 - INFO - __main__ -   Text: ['looking basussies of telephone world map?']
06/16/2022 14:25:58 - INFO - __main__ -   Epoch: 3 | Batch: 72000/120000 (60%) | G Loss: -3.508492 | C Loss: -0.391704
06/16/2022 14:25:58 - INFO - __main__ -   Text: ['How I find if DNA is?']
06/16/2022 14:25:59 - INFO - __main__ -   Epoch: 3 | Batch: 72600/120000 (60%) | G Loss: -2.793053 | C Loss: -0.450411
06/16/2022 14:25:59 - INFO - __main__ -   Text: ['How many wise people break the law here?']
06/16/2022 14:26:01 - INFO - __main__ -   Epoch: 3 | Batch: 73200/120000 (61%) | G Loss: -0.742774 | C Loss: -0.157546
06/16/2022 14:26:01 - INFO - __main__ -   Text: ['where can you give me sight of wildome penguins?']
06/16/2022 14:26:02 - INFO - __main__ -   Epoch: 3 | Batch: 73800/120000 (6

06/16/2022 14:26:52 - INFO - __main__ -   Text: ['are there currently qualified military surgeon?']
06/16/2022 14:26:54 - INFO - __main__ -   Epoch: 3 | Batch: 94800/120000 (79%) | G Loss: -4.945535 | C Loss: -1.203471
06/16/2022 14:26:54 - INFO - __main__ -   Text: ['fave idea to?']
06/16/2022 14:26:55 - INFO - __main__ -   Epoch: 3 | Batch: 95400/120000 (80%) | G Loss: -4.282823 | C Loss: 0.076789
06/16/2022 14:26:55 - INFO - __main__ -   Text: ['How do they get this Yahoo fan quiz?']
06/16/2022 14:26:57 - INFO - __main__ -   Epoch: 3 | Batch: 96000/120000 (80%) | G Loss: -2.912485 | C Loss: -0.287813
06/16/2022 14:26:57 - INFO - __main__ -   Text: ['what\'s "move away from" market?']
06/16/2022 14:26:58 - INFO - __main__ -   Epoch: 3 | Batch: 96600/120000 (80%) | G Loss: -4.643416 | C Loss: -0.376815
06/16/2022 14:26:58 - INFO - __main__ -   Text: ['where in the world buy plastic gun?']
06/16/2022 14:27:00 - INFO - __main__ -   Epoch: 3 | Batch: 97200/120000 (81%) | G Loss: -5.39814

06/16/2022 14:27:52 - INFO - __main__ -   Epoch: 3 | Batch: 118200/120000 (98%) | G Loss: 3.622264 | C Loss: 0.166461
06/16/2022 14:27:52 - INFO - __main__ -   Text: ['A question about the "Girl Power"?']
06/16/2022 14:27:54 - INFO - __main__ -   Epoch: 3 | Batch: 118800/120000 (99%) | G Loss: 4.288147 | C Loss: -1.045565
06/16/2022 14:27:54 - INFO - __main__ -   Text: ['I could like this first name...?']
06/16/2022 14:27:55 - INFO - __main__ -   Epoch: 3 | Batch: 119400/120000 (100%) | G Loss: 3.566059 | C Loss: -0.414379
06/16/2022 14:27:55 - INFO - __main__ -   Text: ["What do you think a least specific place to get job's?"]
06/16/2022 14:27:57 - INFO - __main__ -   * (Train) Epoch: 3 | G Loss: 0.3504 | C Loss: -0.6239 | Updates G: 945 | Updates C: 9055


tensor([[ 1.0155, -0.4540, -0.1085,  ...,  0.2785, -1.0737, -0.6045],
        [ 0.7785, -0.4059, -0.6252,  ...,  1.4169, -1.4788, -0.6419],
        [ 0.6646, -0.7891,  0.8736,  ...,  0.1176, -0.0739, -0.0183],
        ...,
        [-0.6725, -0.0551,  1.7606,  ...,  0.4268, -1.1975, -0.7485],
        [-0.8832,  0.1338,  0.9026,  ...,  0.5354, -1.0887, -1.7120],
        [ 0.5893,  1.1778,  1.2983,  ...,  0.9107,  1.6315, -0.4155]],
       device='cuda:0')
tensor([[ 0.1142, -0.4579, -0.8276,  ...,  2.2146, -0.4863, -0.2949],
        [ 0.5958, -0.3035, -1.4614,  ...,  0.6290,  0.2932, -0.0970],
        [-1.1089,  0.9852,  0.7755,  ...,  1.1605,  0.1326, -1.6802],
        ...,
        [-1.1327, -0.4231,  0.7789,  ..., -1.0690,  0.4702,  0.3192],
        [ 1.0669, -0.7762, -0.7414,  ...,  0.7191, -0.0251,  0.6091],
        [ 0.2497, -0.1685,  0.3121,  ...,  0.8271, -0.3195,  1.5161]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

06/16/2022 14:28:06 - INFO - __main__ -   Bleu-2:0.383 | B-Bleu-2:0.427
06/16/2022 14:28:06 - INFO - __main__ -   * Saving. Best Score:0.810 | Bleu-2:0.383 | B-Bleu-2:0.427


0.8095846948441869
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:17.
  Batch    20  of    172.    Elapsed: 0:00:34.
  Batch    30  of    172.    Elapsed: 0:00:50.
  Batch    40  of    172.    Elapsed: 0:01:08.
  Batch    50  of    172.    Elapsed: 0:01:25.
  Batch    60  of    172.    Elapsed: 0:01:42.
  Batch    70  of    172.    Elapsed: 0:01:58.
  Batch    80  of    172.    Elapsed: 0:02:15.
  Batch    90  of    172.    Elapsed: 0:02:33.
  Batch   100  of    172.    Elapsed: 0:02:50.
  Batch   110  of    172.    Elapsed: 0:03:08.
  Batch   120  of    172.    Elapsed: 0:03:26.
  Batch   130  of    172.    Elapsed: 0:03:43.
  Batch   140  of    172.    Elapsed: 0:04:00.
  Batch   150  of    172.    Elapsed: 0:04:18.
  Batch   160  of    172.    Elapsed: 0:04:36.
  Batch   170  of    172.    Elapsed: 0:04:53.

  Average training loss generetor: 0.554
  Average training loss discriminator: 1.182
  Training epcoh took: 0:04:56

Running Test...

06/16/2022 14:33:02 - INFO - __main__ -   Epoch: 4 | Batch: 0/120000 (0%) | G Loss: 3.911754 | C Loss: -0.374810


  Accuracy: 0.372
  Test Loss: 1.914
  Test took: 0:00:01


06/16/2022 14:33:03 - INFO - __main__ -   Text: ['so it i can']
06/16/2022 14:33:04 - INFO - __main__ -   Epoch: 4 | Batch: 600/120000 (0%) | G Loss: 1.798149 | C Loss: 0.132212
06/16/2022 14:33:04 - INFO - __main__ -   Text: ['What do you have for boys little brother?']
06/16/2022 14:33:05 - INFO - __main__ -   Epoch: 4 | Batch: 1200/120000 (1%) | G Loss: -1.018763 | C Loss: 0.785877
06/16/2022 14:33:05 - INFO - __main__ -   Text: ['Does most basketball players in the adult country win?']
06/16/2022 14:33:07 - INFO - __main__ -   Epoch: 4 | Batch: 1800/120000 (2%) | G Loss: 2.000783 | C Loss: -1.527114
06/16/2022 14:33:07 - INFO - __main__ -   Text: ['Does mccarthy ?']
06/16/2022 14:33:08 - INFO - __main__ -   Epoch: 4 | Batch: 2400/120000 (2%) | G Loss: 2.879915 | C Loss: -1.147798
06/16/2022 14:33:08 - INFO - __main__ -   Text: ['Who is Electromerally?']
06/16/2022 14:33:10 - INFO - __main__ -   Epoch: 4 | Batch: 3000/120000 (2%) | G Loss: 5.219800 | C Loss: -1.161673
06/16/2022 14:

06/16/2022 14:34:02 - INFO - __main__ -   Epoch: 4 | Batch: 24000/120000 (20%) | G Loss: -5.167961 | C Loss: -0.717907
06/16/2022 14:34:02 - INFO - __main__ -   Text: ['I don what to do....?']
06/16/2022 14:34:04 - INFO - __main__ -   Epoch: 4 | Batch: 24600/120000 (20%) | G Loss: -5.849594 | C Loss: 0.589109
06/16/2022 14:34:04 - INFO - __main__ -   Text: ['Why are both Albert Einstein and Billy Wicca different?']
06/16/2022 14:34:05 - INFO - __main__ -   Epoch: 4 | Batch: 25200/120000 (21%) | G Loss: -5.206393 | C Loss: 0.160552
06/16/2022 14:34:05 - INFO - __main__ -   Text: ['How to prevent the use Systematic Learning?']
06/16/2022 14:34:07 - INFO - __main__ -   Epoch: 4 | Batch: 25800/120000 (22%) | G Loss: -3.233719 | C Loss: 0.252497
06/16/2022 14:34:07 - INFO - __main__ -   Text: ['Whatotype of Children live in Louisiana?']
06/16/2022 14:34:08 - INFO - __main__ -   Epoch: 4 | Batch: 26400/120000 (22%) | G Loss: -3.775784 | C Loss: -0.786216
06/16/2022 14:34:08 - INFO - __main__

06/16/2022 14:35:00 - INFO - __main__ -   Text: ['Cal distributors nice read?']
06/16/2022 14:35:01 - INFO - __main__ -   Epoch: 4 | Batch: 48000/120000 (40%) | G Loss: 9.240629 | C Loss: -2.880401
06/16/2022 14:35:01 - INFO - __main__ -   Text: ['Choice between you or your dad?']
06/16/2022 14:35:03 - INFO - __main__ -   Epoch: 4 | Batch: 48600/120000 (40%) | G Loss: 6.290718 | C Loss: -2.368457
06/16/2022 14:35:03 - INFO - __main__ -   Text: ['Be extra or a girl? help:?']
06/16/2022 14:35:04 - INFO - __main__ -   Epoch: 4 | Batch: 49200/120000 (41%) | G Loss: 2.407939 | C Loss: -0.109583
06/16/2022 14:35:04 - INFO - __main__ -   Text: ['is the situation for another pacific sentership in america?']
06/16/2022 14:35:05 - INFO - __main__ -   Epoch: 4 | Batch: 49800/120000 (42%) | G Loss: -5.976686 | C Loss: 1.234613
06/16/2022 14:35:06 - INFO - __main__ -   Text: ['list of pictures from the common area of www.?']
06/16/2022 14:35:07 - INFO - __main__ -   Epoch: 4 | Batch: 50400/120000 (

06/16/2022 14:35:59 - INFO - __main__ -   Epoch: 4 | Batch: 71400/120000 (60%) | G Loss: 6.056443 | C Loss: -0.821827
06/16/2022 14:35:59 - INFO - __main__ -   Text: ['are they a list to invest in 3,4,5?']
06/16/2022 14:36:00 - INFO - __main__ -   Epoch: 4 | Batch: 72000/120000 (60%) | G Loss: 3.970547 | C Loss: -1.094795
06/16/2022 14:36:00 - INFO - __main__ -   Text: ['Can God be nice reporter to everyone Who speaks French physically ?']
06/16/2022 14:36:02 - INFO - __main__ -   Epoch: 4 | Batch: 72600/120000 (60%) | G Loss: 4.896680 | C Loss: -0.758911
06/16/2022 14:36:02 - INFO - __main__ -   Text: ['Earthworms 2 own him?']
06/16/2022 14:36:03 - INFO - __main__ -   Epoch: 4 | Batch: 73200/120000 (61%) | G Loss: -0.299493 | C Loss: 0.997895
06/16/2022 14:36:03 - INFO - __main__ -   Text: ['Whats the good bumfic or pork farts?']
06/16/2022 14:36:05 - INFO - __main__ -   Epoch: 4 | Batch: 73800/120000 (62%) | G Loss: -0.996203 | C Loss: -1.114386
06/16/2022 14:36:05 - INFO - __main__ 

06/16/2022 14:36:57 - INFO - __main__ -   Text: ['I looking for ways to hate my blog in 2007.?']
06/16/2022 14:36:59 - INFO - __main__ -   Epoch: 4 | Batch: 95400/120000 (80%) | G Loss: 4.325963 | C Loss: -0.995670
06/16/2022 14:36:59 - INFO - __main__ -   Text: ['how to many places i dont go to sleep at?']
06/16/2022 14:37:00 - INFO - __main__ -   Epoch: 4 | Batch: 96000/120000 (80%) | G Loss: 7.101429 | C Loss: -1.085864
06/16/2022 14:37:00 - INFO - __main__ -   Text: ['What could be caused by my pts.?']
06/16/2022 14:37:02 - INFO - __main__ -   Epoch: 4 | Batch: 96600/120000 (80%) | G Loss: 5.719032 | C Loss: -0.315982
06/16/2022 14:37:02 - INFO - __main__ -   Text: ["any response,rieved by 'the public visit'?"]
06/16/2022 14:37:03 - INFO - __main__ -   Epoch: 4 | Batch: 97200/120000 (81%) | G Loss: 4.886604 | C Loss: -0.017026
06/16/2022 14:37:03 - INFO - __main__ -   Text: ['Sports detailed on San Antonio Area Busses?']
06/16/2022 14:37:05 - INFO - __main__ -   Epoch: 4 | Batch: 9

06/16/2022 14:37:56 - INFO - __main__ -   Epoch: 4 | Batch: 118800/120000 (99%) | G Loss: 4.931567 | C Loss: -1.605721
06/16/2022 14:37:57 - INFO - __main__ -   Text: ['How r u guess you hit a y rampan in arabic?']
06/16/2022 14:37:58 - INFO - __main__ -   Epoch: 4 | Batch: 119400/120000 (100%) | G Loss: 5.219709 | C Loss: -1.619510
06/16/2022 14:37:58 - INFO - __main__ -   Text: ['who is bigger muscular lonsdale or john madden?']
06/16/2022 14:38:00 - INFO - __main__ -   * (Train) Epoch: 4 | G Loss: 0.9158 | C Loss: -0.6235 | Updates G: 803 | Updates C: 9197


tensor([[ 0.4015, -0.3651, -0.0689,  ..., -0.2665, -1.0400,  0.7542],
        [ 0.5106, -0.2336,  0.3519,  ...,  0.2509, -0.1720,  0.1812],
        [ 0.1344, -0.4439, -0.0238,  ..., -0.5911,  0.0907,  0.1871],
        ...,
        [-1.0015,  1.8831, -1.1817,  ..., -0.0573, -0.8332, -0.0500],
        [ 0.2737,  0.8029,  0.0337,  ..., -0.3535,  0.0282, -0.4440],
        [-0.7138,  0.5384, -0.2717,  ...,  0.5851, -1.2649,  1.9086]],
       device='cuda:0')
tensor([[-0.1205, -0.5679, -0.4078,  ..., -0.8757, -0.9072, -0.0350],
        [ 0.5263,  0.8644,  0.2037,  ...,  0.6009,  1.3641,  1.1251],
        [ 0.0706,  0.1394,  0.0526,  ...,  0.6237,  0.6351,  0.1201],
        ...,
        [-1.1535,  1.0421, -0.8340,  ..., -0.2519, -0.5118,  0.8176],
        [ 1.2870,  1.4821,  1.2124,  ..., -0.9740, -1.7782,  0.9480],
        [ 0.3364,  0.1350,  1.1223,  ...,  0.5582, -1.1220,  0.5709]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

06/16/2022 14:38:09 - INFO - __main__ -   Bleu-2:0.361 | B-Bleu-2:0.392


0.7524002065067474
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:15.
  Batch    20  of    172.    Elapsed: 0:00:33.
  Batch    30  of    172.    Elapsed: 0:00:49.
  Batch    40  of    172.    Elapsed: 0:01:05.
  Batch    50  of    172.    Elapsed: 0:01:21.
  Batch    60  of    172.    Elapsed: 0:01:38.
  Batch    70  of    172.    Elapsed: 0:01:55.
  Batch    80  of    172.    Elapsed: 0:02:12.
  Batch    90  of    172.    Elapsed: 0:02:28.
  Batch   100  of    172.    Elapsed: 0:02:45.
  Batch   110  of    172.    Elapsed: 0:03:03.
  Batch   120  of    172.    Elapsed: 0:03:20.
  Batch   130  of    172.    Elapsed: 0:03:38.
  Batch   140  of    172.    Elapsed: 0:03:54.
  Batch   150  of    172.    Elapsed: 0:04:10.
  Batch   160  of    172.    Elapsed: 0:04:26.
  Batch   170  of    172.    Elapsed: 0:04:43.

  Average training loss generetor: 0.589
  Average training loss discriminator: 1.069
  Training epcoh took: 0:04:46

Running Test...

06/16/2022 14:42:55 - INFO - __main__ -   Epoch: 5 | Batch: 0/120000 (0%) | G Loss: 4.198859 | C Loss: -1.011007
06/16/2022 14:42:55 - INFO - __main__ -   Text: ['hey no that somerealsters here oh hey?']


  Accuracy: 0.388
  Test Loss: 1.920
  Test took: 0:00:01


06/16/2022 14:42:57 - INFO - __main__ -   Epoch: 5 | Batch: 600/120000 (0%) | G Loss: 2.566281 | C Loss: 0.147154
06/16/2022 14:42:57 - INFO - __main__ -   Text: ['What player will get out of this game better!!!?']
06/16/2022 14:42:58 - INFO - __main__ -   Epoch: 5 | Batch: 1200/120000 (1%) | G Loss: 3.242732 | C Loss: -0.470685
06/16/2022 14:42:58 - INFO - __main__ -   Text: ['what was great german fighter peter young and the second son?']
06/16/2022 14:43:00 - INFO - __main__ -   Epoch: 5 | Batch: 1800/120000 (2%) | G Loss: 1.335001 | C Loss: -0.767449
06/16/2022 14:43:00 - INFO - __main__ -   Text: ['Flevel facts/tips?? any tips.?']
06/16/2022 14:43:01 - INFO - __main__ -   Epoch: 5 | Batch: 2400/120000 (2%) | G Loss: -6.997699 | C Loss: -0.372705
06/16/2022 14:43:01 - INFO - __main__ -   Text: ['Why do ski people get hiked?']
06/16/2022 14:43:03 - INFO - __main__ -   Epoch: 5 | Batch: 3000/120000 (2%) | G Loss: -5.236919 | C Loss: -1.379792
06/16/2022 14:43:03 - INFO - __main__ -  

06/16/2022 14:43:55 - INFO - __main__ -   Text: ['Name your "gorgeous kiss.""?']
06/16/2022 14:43:57 - INFO - __main__ -   Epoch: 5 | Batch: 24600/120000 (20%) | G Loss: 6.296363 | C Loss: -2.081990
06/16/2022 14:43:57 - INFO - __main__ -   Text: ['What is your famous criminals?']
06/16/2022 14:43:58 - INFO - __main__ -   Epoch: 5 | Batch: 25200/120000 (21%) | G Loss: 4.703847 | C Loss: -0.684463
06/16/2022 14:43:58 - INFO - __main__ -   Text: ['Flag for a State of the UK?']
06/16/2022 14:43:59 - INFO - __main__ -   Epoch: 5 | Batch: 25800/120000 (22%) | G Loss: 3.802742 | C Loss: -0.734095
06/16/2022 14:44:00 - INFO - __main__ -   Text: ['how can you be vans licensed to straight <BOS>cars?']
06/16/2022 14:44:01 - INFO - __main__ -   Epoch: 5 | Batch: 26400/120000 (22%) | G Loss: 1.539874 | C Loss: -0.920209
06/16/2022 14:44:01 - INFO - __main__ -   Text: ['We can AD padded pictures?']
06/16/2022 14:44:02 - INFO - __main__ -   Epoch: 5 | Batch: 27000/120000 (22%) | G Loss: -3.017448 | 

06/16/2022 14:44:53 - INFO - __main__ -   Text: ['Are telegans the next sisters ok universel shall we white?']
06/16/2022 14:44:54 - INFO - __main__ -   Epoch: 5 | Batch: 48000/120000 (40%) | G Loss: 5.896580 | C Loss: -1.482739
06/16/2022 14:44:54 - INFO - __main__ -   Text: ['which child names else originate Mexicans?']
06/16/2022 14:44:56 - INFO - __main__ -   Epoch: 5 | Batch: 48600/120000 (40%) | G Loss: 5.145657 | C Loss: -0.788697
06/16/2022 14:44:56 - INFO - __main__ -   Text: ['why do u predict the forest hawkers?']
06/16/2022 14:44:57 - INFO - __main__ -   Epoch: 5 | Batch: 49200/120000 (41%) | G Loss: 4.375791 | C Loss: -0.637788
06/16/2022 14:44:57 - INFO - __main__ -   Text: ['E-K and marchivism?']
06/16/2022 14:44:59 - INFO - __main__ -   Epoch: 5 | Batch: 49800/120000 (42%) | G Loss: 1.432813 | C Loss: -0.032912
06/16/2022 14:44:59 - INFO - __main__ -   Text: ["Is Macy's all alone?"]
06/16/2022 14:45:00 - INFO - __main__ -   Epoch: 5 | Batch: 50400/120000 (42%) | G Loss:

06/16/2022 14:45:52 - INFO - __main__ -   Epoch: 5 | Batch: 71400/120000 (60%) | G Loss: 4.555421 | C Loss: -0.685161
06/16/2022 14:45:52 - INFO - __main__ -   Text: ['how do the stories the internet came by?']
06/16/2022 14:45:53 - INFO - __main__ -   Epoch: 5 | Batch: 72000/120000 (60%) | G Loss: 4.758101 | C Loss: -0.416529
06/16/2022 14:45:53 - INFO - __main__ -   Text: ["How do you handle MGA's?"]
06/16/2022 14:45:55 - INFO - __main__ -   Epoch: 5 | Batch: 72600/120000 (60%) | G Loss: 3.855542 | C Loss: -0.585722
06/16/2022 14:45:55 - INFO - __main__ -   Text: ['What is a good age on I wasson State?']
06/16/2022 14:45:56 - INFO - __main__ -   Epoch: 5 | Batch: 73200/120000 (61%) | G Loss: -5.657402 | C Loss: 0.122204
06/16/2022 14:45:56 - INFO - __main__ -   Text: ['this one disappointment vacuums?']
06/16/2022 14:45:58 - INFO - __main__ -   Epoch: 5 | Batch: 73800/120000 (62%) | G Loss: -3.485361 | C Loss: -1.609683
06/16/2022 14:45:58 - INFO - __main__ -   Text: ['i was getting 

06/16/2022 14:46:47 - INFO - __main__ -   Text: ['how do i get the 2nd row altitudeograph?']
06/16/2022 14:46:49 - INFO - __main__ -   Epoch: 5 | Batch: 94800/120000 (79%) | G Loss: 7.080129 | C Loss: -0.845715
06/16/2022 14:46:49 - INFO - __main__ -   Text: ['what computerare like?']
06/16/2022 14:46:50 - INFO - __main__ -   Epoch: 5 | Batch: 95400/120000 (80%) | G Loss: 3.705060 | C Loss: -0.824456
06/16/2022 14:46:50 - INFO - __main__ -   Text: ['Where did an effective SocietyOn TV get used?']
06/16/2022 14:46:52 - INFO - __main__ -   Epoch: 5 | Batch: 96000/120000 (80%) | G Loss: -2.408789 | C Loss: -0.287681
06/16/2022 14:46:52 - INFO - __main__ -   Text: ['how do u solve a wato-like sentence for correct?']
06/16/2022 14:46:53 - INFO - __main__ -   Epoch: 5 | Batch: 96600/120000 (80%) | G Loss: -8.502948 | C Loss: -0.710285
06/16/2022 14:46:53 - INFO - __main__ -   Text: ['what animal is the right to live ?']
06/16/2022 14:46:55 - INFO - __main__ -   Epoch: 5 | Batch: 97200/120000

06/16/2022 14:47:47 - INFO - __main__ -   Epoch: 5 | Batch: 118200/120000 (98%) | G Loss: -0.403861 | C Loss: 0.409530
06/16/2022 14:47:47 - INFO - __main__ -   Text: ['or think you a word like u a racist?']
06/16/2022 14:47:48 - INFO - __main__ -   Epoch: 5 | Batch: 118800/120000 (99%) | G Loss: 10.826754 | C Loss: -1.309575
06/16/2022 14:47:48 - INFO - __main__ -   Text: ['what are Myspace ?']
06/16/2022 14:47:50 - INFO - __main__ -   Epoch: 5 | Batch: 119400/120000 (100%) | G Loss: 5.806362 | C Loss: -0.881662
06/16/2022 14:47:50 - INFO - __main__ -   Text: ['how to save a newly stuck site?']
06/16/2022 14:47:51 - INFO - __main__ -   * (Train) Epoch: 5 | G Loss: 0.8856 | C Loss: -0.6392 | Updates G: 825 | Updates C: 9175


tensor([[-0.9757, -1.3439, -1.9671,  ..., -1.2149, -0.5453,  1.7896],
        [-0.7482,  0.6583,  1.4413,  ...,  0.4099,  0.6198, -1.1021],
        [ 1.3118,  0.0412,  0.6310,  ...,  0.7580, -0.2837,  0.6456],
        ...,
        [ 0.4210, -0.2572, -0.1237,  ..., -0.1677,  0.5927, -0.5592],
        [-0.2313,  0.6569,  0.3217,  ...,  0.3355,  0.9164,  0.3463],
        [ 0.3737, -0.4145, -0.2835,  ...,  0.2496, -1.1397, -1.4527]],
       device='cuda:0')
  Batch    70  of    172.    Elapsed: 0:01:56.
  Batch    80  of    172.    Elapsed: 0:02:14.
  Batch    90  of    172.    Elapsed: 0:02:31.
  Batch   100  of    172.    Elapsed: 0:02:47.
  Batch   110  of    172.    Elapsed: 0:03:03.
  Batch   120  of    172.    Elapsed: 0:03:19.
  Batch   130  of    172.    Elapsed: 0:03:37.
  Batch   140  of    172.    Elapsed: 0:03:54.
  Batch   150  of    172.    Elapsed: 0:04:11.
  Batch   160  of    172.    Elapsed: 0:04:29.
  Batch   170  of    172.    Elapsed: 0:04:47.

  Average training loss 

06/16/2022 14:52:51 - INFO - __main__ -   Epoch: 6 | Batch: 0/120000 (0%) | G Loss: 8.487090 | C Loss: -0.481099
06/16/2022 14:52:51 - INFO - __main__ -   Text: ['Why is my opinion that white guys itch?']


  Accuracy: 0.415
  Test Loss: 1.849
  Test took: 0:00:01


06/16/2022 14:52:52 - INFO - __main__ -   Epoch: 6 | Batch: 600/120000 (0%) | G Loss: 1.368679 | C Loss: -0.194718
06/16/2022 14:52:52 - INFO - __main__ -   Text: ['Which Bodybuilder did try to recover from Guillotine?']
06/16/2022 14:52:53 - INFO - __main__ -   Epoch: 6 | Batch: 1200/120000 (1%) | G Loss: -2.094006 | C Loss: -0.785654
06/16/2022 14:52:53 - INFO - __main__ -   Text: ['What do i think should this question?']
06/16/2022 14:52:54 - INFO - __main__ -   Epoch: 6 | Batch: 1800/120000 (2%) | G Loss: 1.574116 | C Loss: -0.056738
06/16/2022 14:52:54 - INFO - __main__ -   Text: ['Could I see Super Mario? !?']
06/16/2022 14:52:54 - INFO - __main__ -   Epoch: 6 | Batch: 2400/120000 (2%) | G Loss: 8.726427 | C Loss: -2.108954
06/16/2022 14:52:55 - INFO - __main__ -   Text: ['how chicago setup last mile?']
06/16/2022 14:52:55 - INFO - __main__ -   Epoch: 6 | Batch: 3000/120000 (2%) | G Loss: 8.659826 | C Loss: -0.868798
06/16/2022 14:52:55 - INFO - __main__ -   Text: ['what have i d

06/16/2022 14:53:27 - INFO - __main__ -   Text: ['what can you think of yahoo euro online game?']
06/16/2022 14:53:28 - INFO - __main__ -   Epoch: 6 | Batch: 24600/120000 (20%) | G Loss: 5.909184 | C Loss: -0.782424
06/16/2022 14:53:28 - INFO - __main__ -   Text: ['Anyone here wants a talkative address?']
06/16/2022 14:53:30 - INFO - __main__ -   Epoch: 6 | Batch: 25200/120000 (21%) | G Loss: 6.056873 | C Loss: -0.607426
06/16/2022 14:53:30 - INFO - __main__ -   Text: ['i have this question about your friends and reactions?']
06/16/2022 14:53:31 - INFO - __main__ -   Epoch: 6 | Batch: 25800/120000 (22%) | G Loss: 5.720383 | C Loss: 0.018527
06/16/2022 14:53:31 - INFO - __main__ -   Text: ['cant I call the office of any yention?']
06/16/2022 14:53:33 - INFO - __main__ -   Epoch: 6 | Batch: 26400/120000 (22%) | G Loss: 5.698375 | C Loss: -0.299080
06/16/2022 14:53:33 - INFO - __main__ -   Text: ['Which kind of science teacher please!?']
06/16/2022 14:53:34 - INFO - __main__ -   Epoch: 6 

06/16/2022 14:54:26 - INFO - __main__ -   Epoch: 6 | Batch: 47400/120000 (40%) | G Loss: 6.010033 | C Loss: 0.065390
06/16/2022 14:54:26 - INFO - __main__ -   Text: ['How Will synonyms work in ib Gallery?']
06/16/2022 14:54:27 - INFO - __main__ -   Epoch: 6 | Batch: 48000/120000 (40%) | G Loss: 5.295091 | C Loss: -0.159711
06/16/2022 14:54:27 - INFO - __main__ -   Text: ['How are I given the qualifications for any two in college?']
06/16/2022 14:54:29 - INFO - __main__ -   Epoch: 6 | Batch: 48600/120000 (40%) | G Loss: 4.999176 | C Loss: -0.347885
06/16/2022 14:54:29 - INFO - __main__ -   Text: ['why marine mammal is a beasty?']
06/16/2022 14:54:30 - INFO - __main__ -   Epoch: 6 | Batch: 49200/120000 (41%) | G Loss: 4.286487 | C Loss: -0.695135
06/16/2022 14:54:30 - INFO - __main__ -   Text: ['Hey what does some girl look like?']
06/16/2022 14:54:32 - INFO - __main__ -   Epoch: 6 | Batch: 49800/120000 (42%) | G Loss: 4.416941 | C Loss: -0.764150
06/16/2022 14:54:32 - INFO - __main__ - 

06/16/2022 14:55:25 - INFO - __main__ -   Text: ['Does our american leagues really work?']
06/16/2022 14:55:26 - INFO - __main__ -   Epoch: 6 | Batch: 71400/120000 (60%) | G Loss: 4.922670 | C Loss: -0.234257
06/16/2022 14:55:26 - INFO - __main__ -   Text: ['How did starting an outpatient affect your desire?']
06/16/2022 14:55:28 - INFO - __main__ -   Epoch: 6 | Batch: 72000/120000 (60%) | G Loss: 5.220086 | C Loss: -0.288162
06/16/2022 14:55:28 - INFO - __main__ -   Text: ['would it be to flip off a ultralife window?']
06/16/2022 14:55:29 - INFO - __main__ -   Epoch: 6 | Batch: 72600/120000 (60%) | G Loss: 4.226901 | C Loss: -0.953832
06/16/2022 14:55:29 - INFO - __main__ -   Text: ['who knows why']
06/16/2022 14:55:31 - INFO - __main__ -   Epoch: 6 | Batch: 73200/120000 (61%) | G Loss: 4.846610 | C Loss: -0.465505
06/16/2022 14:55:31 - INFO - __main__ -   Text: ['Thenating white shoes?']
06/16/2022 14:55:32 - INFO - __main__ -   Epoch: 6 | Batch: 73800/120000 (62%) | G Loss: 4.654813

06/16/2022 14:56:24 - INFO - __main__ -   Text: ['Where has the latest TORTURE method?']
06/16/2022 14:56:25 - INFO - __main__ -   Epoch: 6 | Batch: 94800/120000 (79%) | G Loss: 3.554205 | C Loss: -0.680518
06/16/2022 14:56:25 - INFO - __main__ -   Text: ['A flirt. Would it cost you a week in business.?']
06/16/2022 14:56:27 - INFO - __main__ -   Epoch: 6 | Batch: 95400/120000 (80%) | G Loss: 3.493542 | C Loss: -0.618145
06/16/2022 14:56:27 - INFO - __main__ -   Text: ['What is the most typing website?']
06/16/2022 14:56:28 - INFO - __main__ -   Epoch: 6 | Batch: 96000/120000 (80%) | G Loss: 4.335287 | C Loss: -0.710161
06/16/2022 14:56:28 - INFO - __main__ -   Text: ["I'm MAILURD! I send ambry.?"]
06/16/2022 14:56:30 - INFO - __main__ -   Epoch: 6 | Batch: 96600/120000 (80%) | G Loss: 5.834590 | C Loss: -0.410288
06/16/2022 14:56:30 - INFO - __main__ -   Text: ['relation of atmosphere solution samples?']
06/16/2022 14:56:31 - INFO - __main__ -   Epoch: 6 | Batch: 97200/120000 (81%) | 

06/16/2022 14:57:23 - INFO - __main__ -   Text: ["I don't think this?"]
06/16/2022 14:57:24 - INFO - __main__ -   Epoch: 6 | Batch: 118200/120000 (98%) | G Loss: 5.984839 | C Loss: -0.602642
06/16/2022 14:57:25 - INFO - __main__ -   Text: ['How to stay in the motor learners insurance?']
06/16/2022 14:57:26 - INFO - __main__ -   Epoch: 6 | Batch: 118800/120000 (99%) | G Loss: 6.779926 | C Loss: -0.332323
06/16/2022 14:57:26 - INFO - __main__ -   Text: ['Secretariat of Switzerland Please: can all of our imaging be located?']
06/16/2022 14:57:28 - INFO - __main__ -   Epoch: 6 | Batch: 119400/120000 (100%) | G Loss: 7.335844 | C Loss: -0.417674
06/16/2022 14:57:28 - INFO - __main__ -   Text: ['How do you get that phrase swimsuit?']
06/16/2022 14:57:29 - INFO - __main__ -   * (Train) Epoch: 6 | G Loss: 4.0963 | C Loss: -0.5350 | Updates G: 326 | Updates C: 9674


tensor([[ 1.0432, -0.0717, -0.2277,  ..., -0.3528, -0.8228, -0.5747],
        [ 1.0533, -0.0339, -1.3454,  ..., -0.8602,  0.3058,  0.8424],
        [-1.3239, -0.6592, -1.0399,  ...,  0.2965, -0.1460, -0.2216],
        ...,
        [-0.4176,  0.3925,  1.1319,  ..., -0.8540, -0.6450, -0.9428],
        [ 1.5018, -0.8705,  1.1487,  ..., -2.0212, -0.2019, -0.1653],
        [-0.5196, -0.2534,  1.4419,  ..., -0.3963,  0.1139, -0.0817]],
       device='cuda:0')
tensor([[ 0.7070, -0.7058, -0.1787,  ..., -0.7809,  1.2587,  0.8616],
        [-0.4512,  0.1903, -1.0642,  ..., -0.0074, -0.7352, -1.2638],
        [ 0.1793,  0.0511, -0.6756,  ..., -1.4029, -0.4046,  0.2964],
        ...,
        [-0.2230, -0.8869, -0.1366,  ...,  0.3138, -0.4194,  0.2537],
        [-1.3566,  0.7376,  0.9829,  ...,  0.2070, -1.0215,  0.3472],
        [ 0.2926,  1.0751,  0.8774,  ...,  0.9166,  0.4116, -1.2841]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly 

06/16/2022 14:57:38 - INFO - __main__ -   Bleu-2:0.362 | B-Bleu-2:0.394


0.7552175502236298
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:16.
  Batch    20  of    172.    Elapsed: 0:00:32.
  Batch    30  of    172.    Elapsed: 0:00:50.
  Batch    40  of    172.    Elapsed: 0:01:06.
  Batch    50  of    172.    Elapsed: 0:01:23.
  Batch    60  of    172.    Elapsed: 0:01:39.
  Batch    70  of    172.    Elapsed: 0:01:57.
  Batch    80  of    172.    Elapsed: 0:02:13.
  Batch    90  of    172.    Elapsed: 0:02:30.
  Batch   100  of    172.    Elapsed: 0:02:46.
  Batch   110  of    172.    Elapsed: 0:03:03.
  Batch   120  of    172.    Elapsed: 0:03:20.
  Batch   130  of    172.    Elapsed: 0:03:38.
  Batch   140  of    172.    Elapsed: 0:03:54.
  Batch   150  of    172.    Elapsed: 0:04:11.
  Batch   160  of    172.    Elapsed: 0:04:29.
  Batch   170  of    172.    Elapsed: 0:04:46.

  Average training loss generetor: 0.611
  Average training loss discriminator: 1.009
  Training epcoh took: 0:04:49

Running Test...

06/16/2022 15:02:28 - INFO - __main__ -   Epoch: 7 | Batch: 0/120000 (0%) | G Loss: 7.604188 | C Loss: -0.772121
06/16/2022 15:02:28 - INFO - __main__ -   Text: ['Should I do a debt get messed up?']


  Accuracy: 0.417
  Test Loss: 1.873
  Test took: 0:00:01


06/16/2022 15:02:30 - INFO - __main__ -   Epoch: 7 | Batch: 600/120000 (0%) | G Loss: 5.708263 | C Loss: -0.414847
06/16/2022 15:02:30 - INFO - __main__ -   Text: ["When fighting David Grappler's name?"]
06/16/2022 15:02:31 - INFO - __main__ -   Epoch: 7 | Batch: 1200/120000 (1%) | G Loss: 6.018260 | C Loss: -0.573789
06/16/2022 15:02:31 - INFO - __main__ -   Text: ['what Itness of flies is.?']
06/16/2022 15:02:33 - INFO - __main__ -   Epoch: 7 | Batch: 1800/120000 (2%) | G Loss: 5.627157 | C Loss: -0.705122
06/16/2022 15:02:33 - INFO - __main__ -   Text: ['why do i like to have an eating contest?']
06/16/2022 15:02:34 - INFO - __main__ -   Epoch: 7 | Batch: 2400/120000 (2%) | G Loss: 4.842817 | C Loss: -0.674223
06/16/2022 15:02:34 - INFO - __main__ -   Text: ['Question chemist?']
06/16/2022 15:02:36 - INFO - __main__ -   Epoch: 7 | Batch: 3000/120000 (2%) | G Loss: 5.693073 | C Loss: -0.650244
06/16/2022 15:02:36 - INFO - __main__ -   Text: ['Which Zoloft are you so happy will conver

06/16/2022 15:03:29 - INFO - __main__ -   Text: ["What is The Hulk's real name?"]
06/16/2022 15:03:31 - INFO - __main__ -   Epoch: 7 | Batch: 24600/120000 (20%) | G Loss: 7.470135 | C Loss: -0.936298
06/16/2022 15:03:31 - INFO - __main__ -   Text: ['How do you asiliarly know the quotes for "Courcy"?']
06/16/2022 15:03:32 - INFO - __main__ -   Epoch: 7 | Batch: 25200/120000 (21%) | G Loss: 7.113710 | C Loss: -0.856687
06/16/2022 15:03:32 - INFO - __main__ -   Text: ['Do they make changes to watching DVD?']
06/16/2022 15:03:34 - INFO - __main__ -   Epoch: 7 | Batch: 25800/120000 (22%) | G Loss: 7.041022 | C Loss: -0.351662
06/16/2022 15:03:34 - INFO - __main__ -   Text: ['What are the mostaunting quick hitters reaction time?']
06/16/2022 15:03:35 - INFO - __main__ -   Epoch: 7 | Batch: 26400/120000 (22%) | G Loss: 7.748270 | C Loss: -0.760269
06/16/2022 15:03:35 - INFO - __main__ -   Text: ['what should we presist on individual systems education suddenly?']
06/16/2022 15:03:37 - INFO - _

06/16/2022 15:04:30 - INFO - __main__ -   Epoch: 7 | Batch: 48000/120000 (40%) | G Loss: 10.022865 | C Loss: -0.751188
06/16/2022 15:04:30 - INFO - __main__ -   Text: ['No Ducatsys?']
06/16/2022 15:04:32 - INFO - __main__ -   Epoch: 7 | Batch: 48600/120000 (40%) | G Loss: 8.837230 | C Loss: -0.414700
06/16/2022 15:04:32 - INFO - __main__ -   Text: ["whos gonna get ouro's 5th #1?"]
06/16/2022 15:04:33 - INFO - __main__ -   Epoch: 7 | Batch: 49200/120000 (41%) | G Loss: 7.362466 | C Loss: -0.571192
06/16/2022 15:04:33 - INFO - __main__ -   Text: ['Who thinks every shiver was great one blow out of Germany?']
06/16/2022 15:04:35 - INFO - __main__ -   Epoch: 7 | Batch: 49800/120000 (42%) | G Loss: 7.218907 | C Loss: -0.668732
06/16/2022 15:04:35 - INFO - __main__ -   Text: ['how old anyone in world famous election result?']
06/16/2022 15:04:36 - INFO - __main__ -   Epoch: 7 | Batch: 50400/120000 (42%) | G Loss: 6.590036 | C Loss: -0.951593
06/16/2022 15:04:36 - INFO - __main__ -   Text: ['W

06/16/2022 15:05:29 - INFO - __main__ -   Epoch: 7 | Batch: 71400/120000 (60%) | G Loss: 8.764857 | C Loss: -0.683614
06/16/2022 15:05:29 - INFO - __main__ -   Text: ['what has personannar on?']
06/16/2022 15:05:31 - INFO - __main__ -   Epoch: 7 | Batch: 72000/120000 (60%) | G Loss: 7.015049 | C Loss: -0.753548
06/16/2022 15:05:31 - INFO - __main__ -   Text: ['How do I pay for aimtaker service.?']
06/16/2022 15:05:32 - INFO - __main__ -   Epoch: 7 | Batch: 72600/120000 (60%) | G Loss: 6.964502 | C Loss: -0.605337
06/16/2022 15:05:32 - INFO - __main__ -   Text: ['how to rig a Khaki-field water body?']
06/16/2022 15:05:34 - INFO - __main__ -   Epoch: 7 | Batch: 73200/120000 (61%) | G Loss: 6.033883 | C Loss: -0.446717
06/16/2022 15:05:34 - INFO - __main__ -   Text: ['which hip guys will be pretty re-forming red hot in the nfl?']
06/16/2022 15:05:35 - INFO - __main__ -   Epoch: 7 | Batch: 73800/120000 (62%) | G Loss: 8.437647 | C Loss: -0.651550
06/16/2022 15:05:35 - INFO - __main__ -   T

06/16/2022 15:06:17 - INFO - __main__ -   Epoch: 7 | Batch: 94800/120000 (79%) | G Loss: 10.022482 | C Loss: -0.435791
06/16/2022 15:06:17 - INFO - __main__ -   Text: ['WHICH does my friend work vacation?!?']
06/16/2022 15:06:18 - INFO - __main__ -   Epoch: 7 | Batch: 95400/120000 (80%) | G Loss: 8.809443 | C Loss: -0.372837
06/16/2022 15:06:18 - INFO - __main__ -   Text: ['What does it need to be true Love?']
06/16/2022 15:06:18 - INFO - __main__ -   Epoch: 7 | Batch: 96000/120000 (80%) | G Loss: 9.480570 | C Loss: -0.687695
06/16/2022 15:06:18 - INFO - __main__ -   Text: ['what is the name of carets in a bunch of airplanes?']
06/16/2022 15:06:19 - INFO - __main__ -   Epoch: 7 | Batch: 96600/120000 (80%) | G Loss: 8.975932 | C Loss: -0.161056
06/16/2022 15:06:19 - INFO - __main__ -   Text: ['what the purpose of the chemical is?']
06/16/2022 15:06:20 - INFO - __main__ -   Epoch: 7 | Batch: 97200/120000 (81%) | G Loss: 8.763569 | C Loss: -0.947756
06/16/2022 15:06:20 - INFO - __main__ -

06/16/2022 15:07:10 - INFO - __main__ -   Text: ['How do clocks work...?']
06/16/2022 15:07:12 - INFO - __main__ -   Epoch: 7 | Batch: 118800/120000 (99%) | G Loss: 9.458498 | C Loss: -0.497549
06/16/2022 15:07:12 - INFO - __main__ -   Text: ['What weather factors do you blame out?']
06/16/2022 15:07:13 - INFO - __main__ -   Epoch: 7 | Batch: 119400/120000 (100%) | G Loss: 9.062660 | C Loss: -0.539343
06/16/2022 15:07:13 - INFO - __main__ -   Text: ['I mean to judge.......?']
06/16/2022 15:07:15 - INFO - __main__ -   * (Train) Epoch: 7 | G Loss: 7.5692 | C Loss: -0.5927 | Updates G: 213 | Updates C: 9787


tensor([[-0.5031, -1.1173,  0.8005,  ..., -0.1654,  0.3277,  0.9365],
        [-1.8848, -0.1531,  0.2076,  ...,  0.0103, -0.1707,  0.0459],
        [-0.4273, -0.1989, -0.3946,  ...,  1.7799, -0.2682,  0.0238],
        ...,
        [-0.4754, -1.0571, -1.4019,  ...,  0.3194,  0.0679,  0.5349],
        [-0.4274, -0.1329,  0.1419,  ..., -2.7928, -1.3807,  0.4307],
        [-1.8595, -1.2137, -0.0822,  ..., -0.7506, -1.2825,  0.2858]],
       device='cuda:0')
tensor([[-0.0086, -0.2129,  1.0610,  ..., -0.9515, -0.4011, -0.6551],
        [ 1.2459,  0.0183,  0.1797,  ..., -0.2060,  0.2820,  0.6630],
        [ 0.2186, -0.2151,  0.1970,  ...,  1.2738,  0.7736, -0.4156],
        ...,
        [ 1.5080, -0.9964, -0.1262,  ..., -0.4217, -0.6567, -0.2176],
        [ 0.0853,  0.1871, -0.0911,  ...,  0.3238, -1.6254,  2.3996],
        [-0.6087,  0.4466,  0.0524,  ...,  0.3019, -0.1113, -0.3812]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid de

06/16/2022 15:07:24 - INFO - __main__ -   Bleu-2:0.371 | B-Bleu-2:0.398


0.7685975665624898
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:16.
  Batch    20  of    172.    Elapsed: 0:00:31.
  Batch    30  of    172.    Elapsed: 0:00:49.
  Batch    40  of    172.    Elapsed: 0:01:05.
  Batch    50  of    172.    Elapsed: 0:01:22.
  Batch    60  of    172.    Elapsed: 0:01:38.
  Batch    70  of    172.    Elapsed: 0:01:55.
  Batch    80  of    172.    Elapsed: 0:02:12.
  Batch    90  of    172.    Elapsed: 0:02:28.
  Batch   100  of    172.    Elapsed: 0:02:44.
  Batch   110  of    172.    Elapsed: 0:03:00.
  Batch   120  of    172.    Elapsed: 0:03:17.
  Batch   130  of    172.    Elapsed: 0:03:34.
  Batch   140  of    172.    Elapsed: 0:03:50.
  Batch   150  of    172.    Elapsed: 0:04:07.
  Batch   160  of    172.    Elapsed: 0:04:23.
  Batch   170  of    172.    Elapsed: 0:04:39.

  Average training loss generetor: 0.629
  Average training loss discriminator: 0.956
  Training epcoh took: 0:04:42

Running Test...

06/16/2022 15:12:07 - INFO - __main__ -   Epoch: 8 | Batch: 0/120000 (0%) | G Loss: 9.068911 | C Loss: -0.488692
06/16/2022 15:12:07 - INFO - __main__ -   Text: ['Tell all the lesbians that love in your life bad guys?']


  Accuracy: 0.385
  Test Loss: 1.995
  Test took: 0:00:01


06/16/2022 15:12:08 - INFO - __main__ -   Epoch: 8 | Batch: 600/120000 (0%) | G Loss: 8.081953 | C Loss: -0.821019
06/16/2022 15:12:08 - INFO - __main__ -   Text: ['Like music, for a lawyer who is revenue neutral?']
06/16/2022 15:12:10 - INFO - __main__ -   Epoch: 8 | Batch: 1200/120000 (1%) | G Loss: 8.629542 | C Loss: -0.681709
06/16/2022 15:12:10 - INFO - __main__ -   Text: ['How is the Dallas County gangel known for their black men?']
06/16/2022 15:12:11 - INFO - __main__ -   Epoch: 8 | Batch: 1800/120000 (2%) | G Loss: 8.322558 | C Loss: -0.550281
06/16/2022 15:12:11 - INFO - __main__ -   Text: ['Could someone tell me plant TN: 4 requestable countries?']
06/16/2022 15:12:13 - INFO - __main__ -   Epoch: 8 | Batch: 2400/120000 (2%) | G Loss: 8.767656 | C Loss: -0.461239
06/16/2022 15:12:13 - INFO - __main__ -   Text: ['to obtain wealth of anyone?']
06/16/2022 15:12:14 - INFO - __main__ -   Epoch: 8 | Batch: 3000/120000 (2%) | G Loss: 8.742920 | C Loss: -0.514966
06/16/2022 15:12:14 

06/16/2022 15:13:03 - INFO - __main__ -   Epoch: 8 | Batch: 24000/120000 (20%) | G Loss: 10.143167 | C Loss: -0.148994
06/16/2022 15:13:03 - INFO - __main__ -   Text: ['Did your space time go away for my single Eyes?']
06/16/2022 15:13:04 - INFO - __main__ -   Epoch: 8 | Batch: 24600/120000 (20%) | G Loss: 11.597346 | C Loss: -0.945939
06/16/2022 15:13:04 - INFO - __main__ -   Text: ['is chrispunky between other courtsville?']
06/16/2022 15:13:06 - INFO - __main__ -   Epoch: 8 | Batch: 25200/120000 (21%) | G Loss: 11.270068 | C Loss: 0.250457
06/16/2022 15:13:06 - INFO - __main__ -   Text: ['Do you really rate good Photoshop projects?']
06/16/2022 15:13:07 - INFO - __main__ -   Epoch: 8 | Batch: 25800/120000 (22%) | G Loss: 11.341181 | C Loss: -1.225690
06/16/2022 15:13:07 - INFO - __main__ -   Text: ['At a moment Earthquake, is it a scary reaction?']
06/16/2022 15:13:09 - INFO - __main__ -   Epoch: 8 | Batch: 26400/120000 (22%) | G Loss: 10.473789 | C Loss: -0.819390
06/16/2022 15:13:

06/16/2022 15:14:02 - INFO - __main__ -   Text: ['How do we need to move in the stock exchange at some point in time?']
06/16/2022 15:14:04 - INFO - __main__ -   Epoch: 8 | Batch: 48000/120000 (40%) | G Loss: 9.649878 | C Loss: -0.490726
06/16/2022 15:14:04 - INFO - __main__ -   Text: ['How could Darwin leave a valley?']
06/16/2022 15:14:05 - INFO - __main__ -   Epoch: 8 | Batch: 48600/120000 (40%) | G Loss: 9.607611 | C Loss: -0.595971
06/16/2022 15:14:05 - INFO - __main__ -   Text: ['I walk word to 14 and my editor will not remember?']
06/16/2022 15:14:07 - INFO - __main__ -   Epoch: 8 | Batch: 49200/120000 (41%) | G Loss: 9.231526 | C Loss: -0.480920
06/16/2022 15:14:07 - INFO - __main__ -   Text: ['What question can a US in army in?']
06/16/2022 15:14:08 - INFO - __main__ -   Epoch: 8 | Batch: 49800/120000 (42%) | G Loss: 8.870667 | C Loss: -0.924738
06/16/2022 15:14:08 - INFO - __main__ -   Text: ['any kind of cruelty or everyday activities to help people?']
06/16/2022 15:14:10 - 

06/16/2022 15:15:02 - INFO - __main__ -   Text: ['Is there a straight thing you can do to track a dangerous shark?']
06/16/2022 15:15:03 - INFO - __main__ -   Epoch: 8 | Batch: 71400/120000 (60%) | G Loss: 11.037437 | C Loss: -1.439124
06/16/2022 15:15:03 - INFO - __main__ -   Text: ['Imagine, how you would handle the meeting, for the United States?']
06/16/2022 15:15:05 - INFO - __main__ -   Epoch: 8 | Batch: 72000/120000 (60%) | G Loss: 9.815172 | C Loss: 0.028507
06/16/2022 15:15:05 - INFO - __main__ -   Text: ['What does Ah Cats phase of Chris Carter look like?']
06/16/2022 15:15:06 - INFO - __main__ -   Epoch: 8 | Batch: 72600/120000 (60%) | G Loss: 10.357792 | C Loss: -1.080277
06/16/2022 15:15:07 - INFO - __main__ -   Text: ['Any question about smart phones and bridges?']
06/16/2022 15:15:08 - INFO - __main__ -   Epoch: 8 | Batch: 73200/120000 (61%) | G Loss: 8.806114 | C Loss: -0.400784
06/16/2022 15:15:08 - INFO - __main__ -   Text: ['How can you preorder Microsoft media cards

06/16/2022 15:16:02 - INFO - __main__ -   Epoch: 8 | Batch: 94200/120000 (78%) | G Loss: 10.142456 | C Loss: -0.971058
06/16/2022 15:16:02 - INFO - __main__ -   Text: ['adding the restona in full time ?']
06/16/2022 15:16:03 - INFO - __main__ -   Epoch: 8 | Batch: 94800/120000 (79%) | G Loss: 10.513362 | C Loss: -0.874496
06/16/2022 15:16:03 - INFO - __main__ -   Text: ['anyone in large:face to face test of aerodynamics?']
06/16/2022 15:16:05 - INFO - __main__ -   Epoch: 8 | Batch: 95400/120000 (80%) | G Loss: 11.147057 | C Loss: -0.664342
06/16/2022 15:16:05 - INFO - __main__ -   Text: ['What is the very shape of?']
06/16/2022 15:16:06 - INFO - __main__ -   Epoch: 8 | Batch: 96000/120000 (80%) | G Loss: 10.483530 | C Loss: -0.551381
06/16/2022 15:16:06 - INFO - __main__ -   Text: ['where can i find free music videos for 2 days?']
06/16/2022 15:16:08 - INFO - __main__ -   Epoch: 8 | Batch: 96600/120000 (80%) | G Loss: 9.694077 | C Loss: -0.898630
06/16/2022 15:16:08 - INFO - __main__ -

06/16/2022 15:17:01 - INFO - __main__ -   Epoch: 8 | Batch: 117600/120000 (98%) | G Loss: 10.068811 | C Loss: -0.539640
06/16/2022 15:17:01 - INFO - __main__ -   Text: ['Does ribbons tip, how old do they remain?']
06/16/2022 15:17:03 - INFO - __main__ -   Epoch: 8 | Batch: 118200/120000 (98%) | G Loss: 10.337418 | C Loss: -1.076146
06/16/2022 15:17:03 - INFO - __main__ -   Text: ['Can Tendulkar Nation (please help NOT Lakes under Measures)?']
06/16/2022 15:17:04 - INFO - __main__ -   Epoch: 8 | Batch: 118800/120000 (99%) | G Loss: 11.131836 | C Loss: -0.704307
06/16/2022 15:17:04 - INFO - __main__ -   Text: ['what does my webcam play?']
06/16/2022 15:17:06 - INFO - __main__ -   Epoch: 8 | Batch: 119400/120000 (100%) | G Loss: 10.244967 | C Loss: -0.809713
06/16/2022 15:17:06 - INFO - __main__ -   Text: ['what i have done to be upset about my friend?']
06/16/2022 15:17:07 - INFO - __main__ -   * (Train) Epoch: 8 | G Loss: 9.4647 | C Loss: -0.6810 | Updates G: 172 | Updates C: 9828


tensor([[ 0.0979, -0.3358,  0.1574,  ...,  1.8730,  0.3622,  0.8369],
        [-0.0660, -0.0262, -0.4089,  ...,  0.2391, -1.8512,  1.0379],
        [-0.1434, -0.3114,  0.3669,  ...,  1.1432,  0.2561,  0.0085],
        ...,
        [-0.0924, -0.7968, -1.1073,  ...,  1.2635,  0.1141,  0.5347],
        [-1.1086, -1.9933,  0.2399,  ...,  1.0377, -0.3061,  1.4029],
        [-0.6264, -0.4072,  0.4883,  ...,  0.3528, -0.0386,  0.4703]],
       device='cuda:0')
tensor([[-1.4789,  0.0484, -1.4496,  ...,  0.6058, -0.6666,  0.8847],
        [-0.2856,  0.3492, -0.3505,  ...,  0.8662,  0.4440,  0.2606],
        [ 0.4930, -1.0136,  0.2463,  ..., -0.7046,  0.3404, -0.8081],
        ...,
        [ 0.2112,  0.3946,  1.2156,  ..., -0.2268,  1.5965,  0.3084],
        [-0.9294, -0.4178,  0.8305,  ...,  0.3077,  0.1402, -0.1665],
        [ 0.6073,  0.9612,  1.0150,  ...,  1.1308, -0.4732,  1.3178]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly 

06/16/2022 15:17:15 - INFO - __main__ -   Bleu-2:0.376 | B-Bleu-2:0.410


0.7862966617293465
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:16.
  Batch    20  of    172.    Elapsed: 0:00:34.
  Batch    30  of    172.    Elapsed: 0:00:50.
  Batch    40  of    172.    Elapsed: 0:01:07.
  Batch    50  of    172.    Elapsed: 0:01:23.
  Batch    60  of    172.    Elapsed: 0:01:40.
  Batch    70  of    172.    Elapsed: 0:01:56.
  Batch    80  of    172.    Elapsed: 0:02:12.
  Batch    90  of    172.    Elapsed: 0:02:29.
  Batch   100  of    172.    Elapsed: 0:02:46.
  Batch   110  of    172.    Elapsed: 0:03:02.
  Batch   120  of    172.    Elapsed: 0:03:18.
  Batch   130  of    172.    Elapsed: 0:03:35.
  Batch   140  of    172.    Elapsed: 0:03:51.
  Batch   150  of    172.    Elapsed: 0:04:08.
  Batch   160  of    172.    Elapsed: 0:04:24.


06/16/2022 15:23:29 - INFO - __main__ -   Epoch: 9 | Batch: 35400/120000 (30%) | G Loss: 10.286275 | C Loss: -0.379460
06/16/2022 15:23:29 - INFO - __main__ -   Text: ['What is Piper Lopter?']
06/16/2022 15:23:30 - INFO - __main__ -   Epoch: 9 | Batch: 36000/120000 (30%) | G Loss: 10.455324 | C Loss: -0.719861
06/16/2022 15:23:31 - INFO - __main__ -   Text: ['Find out your top five goals teachers?']
06/16/2022 15:23:32 - INFO - __main__ -   Epoch: 9 | Batch: 36600/120000 (30%) | G Loss: 10.522500 | C Loss: -0.840463
06/16/2022 15:23:32 - INFO - __main__ -   Text: ['How do you trace up internal noises?']
06/16/2022 15:23:33 - INFO - __main__ -   Epoch: 9 | Batch: 37200/120000 (31%) | G Loss: 10.531681 | C Loss: -1.199916
06/16/2022 15:23:34 - INFO - __main__ -   Text: ['What were these refunds for food frauds?']
06/16/2022 15:23:35 - INFO - __main__ -   Epoch: 9 | Batch: 37800/120000 (32%) | G Loss: 10.225986 | C Loss: -0.508518
06/16/2022 15:23:35 - INFO - __main__ -   Text: ['Who else

06/16/2022 15:24:29 - INFO - __main__ -   Text: ['am I of expectancy in having an affair, in the media?']
06/16/2022 15:24:30 - INFO - __main__ -   Epoch: 9 | Batch: 59400/120000 (50%) | G Loss: 9.345072 | C Loss: -0.696424
06/16/2022 15:24:30 - INFO - __main__ -   Text: ['how to take charge of gastronomy clinic?']
06/16/2022 15:24:31 - INFO - __main__ -   Epoch: 9 | Batch: 60000/120000 (50%) | G Loss: 9.973163 | C Loss: -0.723404
06/16/2022 15:24:31 - INFO - __main__ -   Text: ['Why do Y Examples???']
06/16/2022 15:24:33 - INFO - __main__ -   Epoch: 9 | Batch: 60600/120000 (50%) | G Loss: 10.698751 | C Loss: -0.466732
06/16/2022 15:24:33 - INFO - __main__ -   Text: ['Should i have to get it here from soya cows?']
06/16/2022 15:24:34 - INFO - __main__ -   Epoch: 9 | Batch: 61200/120000 (51%) | G Loss: 10.438055 | C Loss: -1.697806
06/16/2022 15:24:35 - INFO - __main__ -   Text: ["Do you really buy the hottest woman's husbands?"]
06/16/2022 15:24:36 - INFO - __main__ -   Epoch: 9 | Batc

06/16/2022 15:25:28 - INFO - __main__ -   Epoch: 9 | Batch: 82200/120000 (68%) | G Loss: 9.200955 | C Loss: -0.663581
06/16/2022 15:25:28 - INFO - __main__ -   Text: ['how do you increase the datez?']
06/16/2022 15:25:29 - INFO - __main__ -   Epoch: 9 | Batch: 82800/120000 (69%) | G Loss: 10.186130 | C Loss: -1.081436
06/16/2022 15:25:29 - INFO - __main__ -   Text: ['Where mailer was found missing?']
06/16/2022 15:25:31 - INFO - __main__ -   Epoch: 9 | Batch: 83400/120000 (70%) | G Loss: 11.370834 | C Loss: -0.816215
06/16/2022 15:25:31 - INFO - __main__ -   Text: ['which can be used in removing cement discharge?']
06/16/2022 15:25:32 - INFO - __main__ -   Epoch: 9 | Batch: 84000/120000 (70%) | G Loss: 10.825722 | C Loss: -0.848532
06/16/2022 15:25:33 - INFO - __main__ -   Text: ['how can you really be personalised if there are nothing?']
06/16/2022 15:25:34 - INFO - __main__ -   Epoch: 9 | Batch: 84600/120000 (70%) | G Loss: 10.181910 | C Loss: -0.359266
06/16/2022 15:25:34 - INFO - _

06/16/2022 15:26:27 - INFO - __main__ -   Epoch: 9 | Batch: 105600/120000 (88%) | G Loss: 9.115412 | C Loss: -0.499633
06/16/2022 15:26:27 - INFO - __main__ -   Text: ['What has been your longest love with each other in the first 8 month?']
06/16/2022 15:26:29 - INFO - __main__ -   Epoch: 9 | Batch: 106200/120000 (88%) | G Loss: 9.818523 | C Loss: -0.451161
06/16/2022 15:26:29 - INFO - __main__ -   Text: ["What does purple's height?"]
06/16/2022 15:26:30 - INFO - __main__ -   Epoch: 9 | Batch: 106800/120000 (89%) | G Loss: 9.822145 | C Loss: -0.544237
06/16/2022 15:26:30 - INFO - __main__ -   Text: ['I hope I workè di Gomez attractes french actress?']
06/16/2022 15:26:32 - INFO - __main__ -   Epoch: 9 | Batch: 107400/120000 (90%) | G Loss: 9.908713 | C Loss: -0.368301
06/16/2022 15:26:32 - INFO - __main__ -   Text: ["who's the sackling horsesmen. best Croat or Celtic?"]
06/16/2022 15:26:33 - INFO - __main__ -   Epoch: 9 | Batch: 108000/120000 (90%) | G Loss: 9.638634 | C Loss: -1.00886

tensor([[-0.3606, -0.8612,  0.1888,  ...,  0.4487,  0.9730, -1.1553],
        [ 0.0569,  0.7842,  0.0285,  ...,  0.9781, -2.0229,  1.3682],
        [ 0.7829,  0.5047,  0.5960,  ..., -1.0880, -0.4322,  0.9228],
        ...,
        [ 0.0079,  0.7597, -0.1770,  ...,  1.6416, -1.5353,  1.2034],
        [-0.1712, -1.0424, -0.6387,  ...,  0.0730, -0.9450,  0.5419],
        [ 1.3813, -0.2367, -0.9124,  ...,  1.2731,  0.7727,  0.1177]],
       device='cuda:0')
tensor([[ 1.2135,  0.0351,  1.3428,  ..., -0.1681, -0.7409, -0.9074],
        [-0.5875,  1.3770, -0.0342,  ...,  0.8027, -0.3295, -0.9743],
        [-1.6497,  0.1832, -1.7979,  ..., -1.4298,  0.3637, -0.8585],
        ...,
        [-0.6483,  0.6778,  0.3580,  ...,  1.1775,  0.6137, -0.6934],
        [-0.0588, -0.6928,  0.2210,  ..., -1.2063, -0.0928, -0.1999],
        [-0.2986, -0.9396, -0.6344,  ..., -0.1841,  0.1292, -1.0783]],
       device='cuda:0')
huggingface/tokenizers: The current process just got forked, after parallelism has a

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/to

06/16/2022 15:27:13 - INFO - __main__ -   Bleu-2:0.364 | B-Bleu-2:0.400


0.7643963737077377
Train classification discriminator

Training...
  Batch    10  of    172.    Elapsed: 0:00:16.
  Batch    20  of    172.    Elapsed: 0:00:33.
  Batch    30  of    172.    Elapsed: 0:00:49.
  Batch    40  of    172.    Elapsed: 0:01:05.
  Batch    50  of    172.    Elapsed: 0:01:21.
  Batch    60  of    172.    Elapsed: 0:01:37.
  Batch    70  of    172.    Elapsed: 0:01:53.
  Batch    80  of    172.    Elapsed: 0:02:09.
  Batch    90  of    172.    Elapsed: 0:02:26.
  Batch   100  of    172.    Elapsed: 0:02:42.
  Batch   110  of    172.    Elapsed: 0:02:59.


In [5]:
#Generating Sentences
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse

import logging
import torch
import torch.nn as nn
import numpy as np

from modules.gan import Generator

import glob
import os
import pickle
import random

import torch.nn.functional as F
from tqdm import tqdm, trange

from func import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, BertConfig
from func import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForLatentConnector, GPT2ForLatentConnectorValueHead
from func import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from func import XLNetLMHeadModel, XLNetTokenizer
from func import TransfoXLLMHeadModel, TransfoXLTokenizer
from func import BertForLatentConnector, BertTokenizer

from collections import defaultdict
import pdb
from modules.utils import rollout_test

# Hard upper bound on sequence length so a generation loop can never run forever.
MAX_LENGTH = 10000

# Every pretrained model name known to the listed config classes, flattened
# into a single tuple.
ALL_MODELS = tuple(
    model_name
    for config_cls in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)
    for model_name in config_cls.pretrained_config_archive_map.keys()
)

# Timestamped INFO-level logging for the whole process.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
logger = logging.getLogger(__name__)

# Maps a model-type string to its (config class, model class, tokenizer class).
MODEL_CLASSES = dict(
    gpt2=(GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer),
    bert=(BertConfig, BertForLatentConnector, BertTokenizer),
    gpt2v=(GPT2Config, GPT2ForLatentConnectorValueHead, GPT2Tokenizer),
)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--new_sent', type=int, default=1, help="Number of sentences to generate")
    parser.add_argument('--n_layers', type=int, default=20, help="Number of layers of generator")
    parser.add_argument('--block_dim', type=int, default=100)
    parser.add_argument('--interval', type=int, default=10)
    parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
    parser.add_argument('--generator_dir', default=None, type=str, required=True, help="Directory of GAN model checkpoint")
    parser.add_argument("--checkpoint_dir", default=None, type=str, required=True,
                        help="The directory where checkpoints are saved.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--save", default=False, type=bool, help="Save results to file.")
    parser.add_argument("--latent_size", default=32, type=int, help="Latent space dimension.")
    parser.add_argument("--output_name", default="results", type=str, help="File name of output")
    parser.add_argument("--batch_size", default=100, type=int, help="Batch size to generate outputs")
    ## Encoder options
    parser.add_argument("--encoder_model_type", default="bert", type=str,
                        help="The encoder model architecture to be fine-tuned.")
    parser.add_argument("--encoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The encoder model checkpoint for weights initialization.")
    parser.add_argument("--encoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--encoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    ## Decoder options
    parser.add_argument("--decoder_model_type", default="gpt2", type=str,
                        help="The decoder model architecture to be fine-tuned.")
    parser.add_argument("--decoder_model_name_or_path", default="gpt2", type=str,
                        help="The decoder model checkpoint for weights initialization.")
    parser.add_argument("--decoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--decoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="Optional input sequence length before tokenization. The sequence will be dropped if it is longer the max_seq_length")
    parser.add_argument("--finetune_decoder", default=False, type=bool,
                        help="Uses finetuned decoder in output dir if true.")

    ## Variational auto-encoder(check this)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument("--top_p", type=float, default=1.0)
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--padding_text", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--use_philly", action='store_true',
                        help="Use Philly for computing.")
    parser.add_argument('--gloabl_step_eval', type=int, default=508523,
                        help="Evaluate the results at the given global step")

    # Load a trained Encoder model and vocabulary that you have fine-tuned
    args = parser.parse_args("--checkpoint_dir=output_dir_yahoo \
    --output_dir=output_dir_yahoo \
    --generator_dir=output_dir_yahoo \
    --block_size 100 \
    --max_seq_length 60 \
    --gloabl_step_eval 24000 \
    --latent_size 32 \
    --block_dim 100 \
    --new_sent 100 \
    --n_layers 10 \
    --top_p 0.9 \
    --output_name=results \
    --save True".split())
    global_step = args.gloabl_step_eval

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True
    args.device = torch.device("cuda" if args.cuda else "cpu")
    args.n_gpu = torch.cuda.device_count()
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)       
    
    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
    output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
    if not args.finetune_decoder:
        output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
    else:
         output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
    checkpoints = [ [output_encoder_dir, output_decoder_dir] ]

    # Load a trained Encoder model and vocabulary that you have fine-tuned
    encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
    model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
    tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)

    model_encoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

    # Load a trained Decoder model and vocabulary that you have fine-tuned
    if not args.finetune_decoder:
        decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
    else:
        decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES["gpt2v"]
    model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
    model_decoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

    # Chunyuan: Add Padding token to GPT2
    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
    logger.info('We have added {} tokens to GPT2'.format(num_added_toks))
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    assert tokenizer_decoder.pad_token == '<PAD>'
    
    generator = Generator(args.n_layers, args.block_dim, args.latent_size)

    if args.cuda:
        generator = generator.cuda()

    generator.load_state_dict(torch.load(args.generator_dir+'/generator_'+str(args.gloabl_step_eval)+'.th'))
    generator.eval()
    model_decoder.eval()
    model_encoder.eval()
    if args.save:
        if not os.path.exists(args.output_dir+"/{}.txt".format(args.output_name)):
            with open(args.output_dir+"/{}.txt".format(args.output_name), 'w'): 
                pass

    for i in range(int(args.new_sent/args.batch_size)):
        # sample noise
        noise = torch.Tensor(np.random.normal(0, 1, (args.batch_size, args.latent_size))).to(args.device)
        new_z = generator(noise).data

        # create new sent
        sents = rollout_test(model_decoder, new_z, tokenizer_decoder, args.max_seq_length, args.batch_size, args.top_k, args.top_p)

        if args.save:
            with open(args.output_dir+"/{}.txt".format(args.output_name), 'a') as file:
                for i in sents:
                    file.write(i+"\n")
        else:
            for i in sents:
                logger.info(i)


06/15/2022 22:58:43 - INFO - func.configuration_utils -   loading configuration file output_dir_yahoo/checkpoint-encoder-24000/config.json
06/15/2022 22:58:43 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

06/15/2022 22:58:43 - INFO - func.modeling_utils -   loading weights file output_dir_yahoo/checkpoint-encoder-24000/pytorch_model.bin
06/15/2022 22:58:48 - INFO - func.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingfac