# 指令微调

In [1]:
"""
The main program for finetuning LLMs with Huggingface Transformers Library.

ALL SECTIONS WHERE CODE POSSIBLY NEEDS TO BE FILLED IN ARE MARKED AS TODO.
"""

import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import datasets
from peft import LoraConfig, TaskType, get_peft_model
print("done")

  from .autonotebook import tqdm as notebook_tqdm


done


In [2]:
# Define the arguments required for the main program.
# NOTE: You can customize any arguments you need to pass in.
@dataclass
class ModelArguments:
    """Options that select the language model and its load-time precision.

    The class is handed to ``HfArgumentParser`` in ``finetune``; both
    attributes default to ``None`` when not supplied.
    """

    # Checkpoint location: a local directory or a Hugging Face Hub identifier.
    model_name_or_path: Optional[str] = field(
        metadata={"help": "The path to the LLM to fine-tune or its name on the Hugging Face Hub."},
        default=None,
    )

    # Name of the dtype used when loading the weights; restricted to the
    # values listed under "choices".
    torch_dtype: Optional[str] = field(
        metadata={
            "help": "Override the default `torch.dtype` and load the model under this dtype.",
            "choices": ["bfloat16", "float16", "float32"],
        },
        default=None,
    )

    # TODO: add your model arguments here


@dataclass
class DataArguments:
    """Options that identify the fine-tuning dataset.

    The class is handed to ``HfArgumentParser`` in ``finetune``.
    """

    # Dataset location: a local directory or a Hugging Face Hub identifier.
    dataset_path: Optional[str] = field(
        metadata={"help": "The path to the fine-tuning dataset or its name on the Hugging Face Hub."},
        default=None,
    )

    # TODO: add your data arguments here
    
@dataclass
class TrainingArguments(TrainingArguments):
    """Training options with project-specific default values.

    Subclasses the ``TrainingArguments`` imported from ``transformers`` and
    reuses its name, so later references in this file resolve to this class.
    Each attribute below presumably overrides the same-named field of the
    base class (the base definition is not visible here -- TODO confirm).

    NOTE(review): because the class rebinds the name it inherits from,
    re-running this cell makes the class derive from its previous
    definition rather than from the library class.
    """

    # Where checkpoints and logs are written; must be supplied at run time.
    output_dir: str = None
    per_device_train_batch_size: Optional[int] = 1

    # Checkpointing cadence.
    save_strategy: Optional[str] = "steps"
    save_steps: Optional[int] = 10000

    # Logging.
    report_to: Optional[str] = "tensorboard"
    logging_steps: Optional[int] = 300

    # Optimisation schedule.
    num_train_epochs: Optional[int] = 1
    learning_rate: Optional[float] = 1e-5
    warmup_ratio: Optional[float] = 0.03
    lr_scheduler_type: Optional[str] = "cosine"
    weight_decay: Optional[float] = 0.003
    # NOTE(review): recent transformers releases appear to flag "adamw_hf"
    # as deprecated -- confirm before upgrading the library.
    optim: Optional[str] = "adamw_hf"

    # Runs once while the class body is evaluated; acts as a
    # "cell finished" marker in the notebook output.
    print("done")

done


In [8]:
# The main function
# NOTE You can customize some logs to monitor your program.
def finetune():
    """Fine-tune a causal language model with LoRA adapters.

    Steps:
      1. Parse ``ModelArguments``, ``DataArguments`` and ``TrainingArguments``
         from ``sys.argv`` with ``HfArgumentParser``.
      2. Load the tokenizer and the model, then wrap the model with LoRA.
      3. Load the ``train`` split of the dataset.
      4. Tokenise every sample once up front (prompt tokens are masked out of
         the loss) and define a collator that pads each batch.
      5. Build the ``Trainer``.
      6. Train, then save the final weights to ``training_args.output_dir``.

    Takes no parameters and returns ``None``; progress markers are printed
    after each step.
    """
    # Label value used to exclude positions (prompt tokens and padding) from
    # the loss.
    ignore_index = -100

    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1
    )

    # Step 1: parse the three argument groups from the command line.
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/internal/trainer_utils#transformers.HfArgumentParser
    #   * https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/trainer#transformers.TrainingArguments
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print("step1 done")
    print(training_args)

    # Step 2: load tokenizer and model. A half-precision ``torch_dtype``
    # (float16 / bfloat16) reduces GPU memory use during training.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        torch_dtype=model_args.torch_dtype,
        device_map="cuda:0",
    )
    model = get_peft_model(model, lora_config)
    print("step2 done")

    # The pad token is used both to terminate each response and to pad
    # batches. Fall back to the EOS token when the tokenizer defines no pad
    # token, so that ``None`` never ends up inside ``input_ids``.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    # NOTE(review): the response is terminated with the pad token rather than
    # an explicit EOS / <|im_end|> token -- confirm this matches the stop
    # token used at inference time.
    end_token_id = pad_token_id

    # Step 3: load the dataset and keep only its training split.
    dataset = datasets.load_dataset(data_args.dataset_path)
    dataset = dataset["train"]
    print("step3 done")

    # Step 4: tokenise the samples and define the batch collator.
    def preprocess(dataset, max_len=1024):
        """Tokenise one batch of raw samples.

        ``dataset`` is a batch in column form with ``instruction``, ``input``
        and ``output`` columns. Returns a dict of lists with ``input_ids``,
        ``attention_mask`` and ``labels``; every sequence is truncated to at
        most ``max_len`` tokens.
        """
        question_template = '<|im_start|>system\nYou are a knowledgeable assistant. Please give a detailed response to the following question.<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n'
        model_inputs = {'input_ids': [], 'attention_mask': [], 'labels': []}
        samples = zip(dataset['instruction'], dataset['input'], dataset['output'])
        for instruct, extra_input, output in samples:
            # Some samples have no instruction or input (None); treat as ''.
            question = (instruct or '') + '\n' + (extra_input or '')
            prompt = question_template.format(question=question)
            prompt_enc = tokenizer(prompt, add_special_tokens=False)
            response_enc = tokenizer(output or '', add_special_tokens=False)

            input_ids = prompt_enc['input_ids'] + response_enc['input_ids'] + [end_token_id]
            attention_mask = prompt_enc['attention_mask'] + response_enc['attention_mask'] + [1]
            # Only the response (and its terminator) contributes to the loss.
            labels = [ignore_index] * len(prompt_enc['input_ids']) + response_enc['input_ids'] + [end_token_id]

            # Slicing is a no-op for sequences already within the limit.
            model_inputs['input_ids'].append(input_ids[:max_len])
            model_inputs['attention_mask'].append(attention_mask[:max_len])
            model_inputs['labels'].append(labels[:max_len])
        return model_inputs

    # Tokenise once before training instead of inside the collator, which
    # saves time in the training loop. The raw text columns are dropped so
    # that only model inputs remain.
    dataset = dataset.map(
        preprocess,
        batched=True,
        num_proc=1,
        remove_columns=dataset.column_names,
    )
    print("step4 done")

    def data_collator(batch: List[Dict]):
        """
        batch: list of dict, each dict of the list is a tokenised sample.

        Right-pads every sample to the longest sequence in the batch and
        returns ``input_ids``, ``attention_mask`` and ``labels`` as
        ``torch.long`` tensors. Padding positions get attention 0 and label
        ``ignore_index`` so they contribute neither to attention nor to the
        loss.
        """
        longest = max(len(sample['input_ids']) for sample in batch)
        input_ids, attention_mask, labels = [], [], []
        for sample in batch:
            missing = longest - len(sample['input_ids'])
            input_ids.append(list(sample['input_ids']) + [pad_token_id] * missing)
            attention_mask.append(list(sample['attention_mask']) + [0] * missing)
            labels.append(list(sample['labels']) + [ignore_index] * missing)
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

    # Step 5: build the Trainer.
    # No train/eval split is made here; evaluation is done later on a
    # separate dataset.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    print("step5 done")

    # Step 6: train, then persist the final weights. Periodic checkpoints
    # only land on multiples of ``save_steps``, so without this the steps
    # after the last checkpoint would be lost.
    trainer.train()
    trainer.save_model()
print("done")

done


In [9]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
# Build the fake command line from an ordered option table: each entry
# expands to the pair "--<name>", "<value>" after the program name.
sys.argv = ["notebook"] + [
    token
    for name, value in {
        "model_name_or_path": "E:/Homework/NLP/NLP_Project_2/Qwen2.5-1.5B",
        "torch_dtype": "bfloat16",  # bfloat16 lowers the GPU memory requirement
        "dataset_path": "E:/Homework/NLP/NLP_Project_2/alpaca-cleaned",
        "output_dir": "E:/Homework/NLP/NLP_Project_2/output",
        "per_device_train_batch_size": "1",  # a batch size of 4 runs out of memory on a 16 GB GPU
        "save_strategy": "steps",
        "save_steps": "10000",
        "report_to": "tensorboard",
        "warmup_ratio": "0.03",
        "logging_steps": "100",
        "num_train_epochs": "1",
        "learning_rate": "1e-5",
        "optim": "adamw_hf",
        "weight_decay": "1e-3",
        "lr_scheduler_type": "cosine",
    }.items()
    for token in ("--" + name, value)
]
finetune()

step1 done
TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.

  trainer = Trainer(
  0%|          | 100/51760 [00:26<3:33:38,  4.03it/s]

{'loss': 1.4311, 'grad_norm': 7.603107929229736, 'learning_rate': 6.439150032195751e-07, 'epoch': 0.0}


  0%|          | 200/51760 [00:51<3:31:39,  4.06it/s]

{'loss': 1.268, 'grad_norm': 1.0809446573257446, 'learning_rate': 1.2878300064391501e-06, 'epoch': 0.0}


  1%|          | 300/51760 [01:16<3:20:31,  4.28it/s]

{'loss': 1.424, 'grad_norm': 85.70658874511719, 'learning_rate': 1.931745009658725e-06, 'epoch': 0.01}


  1%|          | 400/51760 [01:41<3:28:02,  4.11it/s]

{'loss': 1.2325, 'grad_norm': 6.559329986572266, 'learning_rate': 2.5756600128783003e-06, 'epoch': 0.01}


  1%|          | 500/51760 [02:07<3:20:49,  4.25it/s]

{'loss': 1.1595, 'grad_norm': 7.458324432373047, 'learning_rate': 3.2195750160978756e-06, 'epoch': 0.01}


  1%|          | 600/51760 [02:31<3:15:32,  4.36it/s]

{'loss': 1.0259, 'grad_norm': 2.943985939025879, 'learning_rate': 3.86349001931745e-06, 'epoch': 0.01}


  1%|▏         | 700/51760 [02:55<3:40:51,  3.85it/s]

{'loss': 1.1547, 'grad_norm': 0.8907306790351868, 'learning_rate': 4.507405022537025e-06, 'epoch': 0.01}


  2%|▏         | 800/51760 [03:20<3:15:14,  4.35it/s]

{'loss': 1.1319, 'grad_norm': 6.659772872924805, 'learning_rate': 5.1513200257566006e-06, 'epoch': 0.02}


  2%|▏         | 900/51760 [03:45<3:37:25,  3.90it/s]

{'loss': 1.0421, 'grad_norm': 1.1354748010635376, 'learning_rate': 5.795235028976176e-06, 'epoch': 0.02}


  2%|▏         | 1000/51760 [04:09<3:25:48,  4.11it/s]

{'loss': 1.0946, 'grad_norm': 0.839602530002594, 'learning_rate': 6.439150032195751e-06, 'epoch': 0.02}


  2%|▏         | 1100/51760 [04:33<3:20:42,  4.21it/s]

{'loss': 1.1698, 'grad_norm': 4.355932235717773, 'learning_rate': 7.083065035415326e-06, 'epoch': 0.02}


  2%|▏         | 1200/51760 [04:59<3:27:19,  4.06it/s]

{'loss': 1.1069, 'grad_norm': 1.3508834838867188, 'learning_rate': 7.7269800386349e-06, 'epoch': 0.02}


  3%|▎         | 1300/51760 [05:23<3:08:33,  4.46it/s]

{'loss': 1.0474, 'grad_norm': 0.6084761023521423, 'learning_rate': 8.370895041854476e-06, 'epoch': 0.03}


  3%|▎         | 1400/51760 [05:50<3:15:47,  4.29it/s]

{'loss': 1.0844, 'grad_norm': 4.3988847732543945, 'learning_rate': 9.01481004507405e-06, 'epoch': 0.03}


  3%|▎         | 1500/51760 [06:14<3:10:03,  4.41it/s]

{'loss': 1.0786, 'grad_norm': 16.650375366210938, 'learning_rate': 9.658725048293625e-06, 'epoch': 0.03}


  3%|▎         | 1600/51760 [06:39<3:34:55,  3.89it/s]

{'loss': 1.0917, 'grad_norm': 4.0969390869140625, 'learning_rate': 9.999978377464785e-06, 'epoch': 0.03}


  3%|▎         | 1700/51760 [07:05<3:34:52,  3.88it/s]

{'loss': 1.0481, 'grad_norm': 2.5894858837127686, 'learning_rate': 9.999788484198321e-06, 'epoch': 0.03}


  3%|▎         | 1800/51760 [07:31<3:24:55,  4.06it/s]

{'loss': 1.118, 'grad_norm': 16.353038787841797, 'learning_rate': 9.999402832076712e-06, 'epoch': 0.03}


  4%|▎         | 1900/51760 [07:55<3:10:38,  4.36it/s]

{'loss': 1.125, 'grad_norm': 3.1539552211761475, 'learning_rate': 9.998821436199566e-06, 'epoch': 0.04}


  4%|▍         | 2000/51760 [08:21<3:11:56,  4.32it/s]

{'loss': 1.0128, 'grad_norm': 3.1738100051879883, 'learning_rate': 9.998044319330522e-06, 'epoch': 0.04}


  4%|▍         | 2100/51760 [08:45<3:02:10,  4.54it/s]

{'loss': 1.077, 'grad_norm': 4.52039098739624, 'learning_rate': 9.99707151189637e-06, 'epoch': 0.04}


  4%|▍         | 2200/51760 [09:11<3:18:47,  4.16it/s]

{'loss': 1.0457, 'grad_norm': 0.8639129996299744, 'learning_rate': 9.995903051985851e-06, 'epoch': 0.04}


  4%|▍         | 2300/51760 [09:35<3:04:16,  4.47it/s]

{'loss': 1.0185, 'grad_norm': 3.166156053543091, 'learning_rate': 9.99453898534818e-06, 'epoch': 0.04}


  5%|▍         | 2400/51760 [10:00<3:09:18,  4.35it/s]

{'loss': 1.072, 'grad_norm': 9.20949649810791, 'learning_rate': 9.99297936539124e-06, 'epoch': 0.05}


  5%|▍         | 2500/51760 [10:27<3:26:04,  3.98it/s]

{'loss': 1.0729, 'grad_norm': 3.4868617057800293, 'learning_rate': 9.991224253179497e-06, 'epoch': 0.05}


  5%|▌         | 2600/51760 [10:51<3:27:54,  3.94it/s]

{'loss': 0.9868, 'grad_norm': 5.811098575592041, 'learning_rate': 9.989273717431606e-06, 'epoch': 0.05}


  5%|▌         | 2700/51760 [11:16<3:37:17,  3.76it/s]

{'loss': 0.9329, 'grad_norm': 39.46692657470703, 'learning_rate': 9.987127834517734e-06, 'epoch': 0.05}


  5%|▌         | 2800/51760 [11:41<3:02:23,  4.47it/s]

{'loss': 1.0115, 'grad_norm': 6.971906661987305, 'learning_rate': 9.984786688456546e-06, 'epoch': 0.05}


  6%|▌         | 2900/51760 [12:07<3:30:46,  3.86it/s]

{'loss': 0.999, 'grad_norm': 1.812668800354004, 'learning_rate': 9.982250370911937e-06, 'epoch': 0.06}


  6%|▌         | 3000/51760 [12:32<3:38:38,  3.72it/s]

{'loss': 1.0962, 'grad_norm': 4.0207719802856445, 'learning_rate': 9.979518981189431e-06, 'epoch': 0.06}


  6%|▌         | 3100/51760 [13:00<3:31:12,  3.84it/s] 

{'loss': 1.0564, 'grad_norm': 1.0977280139923096, 'learning_rate': 9.976592626232297e-06, 'epoch': 0.06}


  6%|▌         | 3200/51760 [13:26<3:06:00,  4.35it/s]

{'loss': 1.0408, 'grad_norm': 80.55439758300781, 'learning_rate': 9.97347142061736e-06, 'epoch': 0.06}


  6%|▋         | 3300/51760 [13:51<3:23:35,  3.97it/s]

{'loss': 0.9902, 'grad_norm': 2.838087797164917, 'learning_rate': 9.97015548655052e-06, 'epoch': 0.06}


  7%|▋         | 3400/51760 [14:17<3:04:11,  4.38it/s]

{'loss': 1.0601, 'grad_norm': 9.380800247192383, 'learning_rate': 9.966644953861955e-06, 'epoch': 0.07}


  7%|▋         | 3500/51760 [14:43<3:30:04,  3.83it/s]

{'loss': 1.0886, 'grad_norm': 5.691338062286377, 'learning_rate': 9.96293996000106e-06, 'epoch': 0.07}


  7%|▋         | 3600/51760 [15:09<3:28:01,  3.86it/s]

{'loss': 0.9666, 'grad_norm': 2.681460380554199, 'learning_rate': 9.959040650031034e-06, 'epoch': 0.07}


  7%|▋         | 3700/51760 [15:34<3:38:02,  3.67it/s]

{'loss': 1.03, 'grad_norm': 1.0905815362930298, 'learning_rate': 9.954947176623233e-06, 'epoch': 0.07}


  7%|▋         | 3800/51760 [15:59<3:13:25,  4.13it/s]

{'loss': 1.0711, 'grad_norm': 1.5670876502990723, 'learning_rate': 9.950659700051168e-06, 'epoch': 0.07}


  8%|▊         | 3900/51760 [16:24<3:04:00,  4.33it/s]

{'loss': 0.9955, 'grad_norm': 3.338470220565796, 'learning_rate': 9.946178388184244e-06, 'epoch': 0.08}


  8%|▊         | 4000/51760 [16:49<3:18:12,  4.02it/s]

{'loss': 1.0217, 'grad_norm': 2.1431655883789062, 'learning_rate': 9.941503416481175e-06, 'epoch': 0.08}


  8%|▊         | 4100/51760 [17:14<3:17:45,  4.02it/s]

{'loss': 1.0286, 'grad_norm': 1.45197594165802, 'learning_rate': 9.93663496798313e-06, 'epoch': 0.08}


  8%|▊         | 4200/51760 [17:40<3:19:42,  3.97it/s]

{'loss': 1.042, 'grad_norm': 1.4288161993026733, 'learning_rate': 9.931573233306551e-06, 'epoch': 0.08}


  8%|▊         | 4300/51760 [18:06<3:43:09,  3.54it/s]

{'loss': 1.0859, 'grad_norm': 2.109675407409668, 'learning_rate': 9.9263184106357e-06, 'epoch': 0.08}


  9%|▊         | 4400/51760 [18:32<3:54:09,  3.37it/s]

{'loss': 0.9645, 'grad_norm': 1.2657238245010376, 'learning_rate': 9.920870705714895e-06, 'epoch': 0.09}


  9%|▊         | 4500/51760 [18:58<3:12:08,  4.10it/s]

{'loss': 1.0391, 'grad_norm': 1.6978950500488281, 'learning_rate': 9.915230331840454e-06, 'epoch': 0.09}


  9%|▉         | 4600/51760 [19:25<4:10:57,  3.13it/s]

{'loss': 1.0409, 'grad_norm': 5.9620561599731445, 'learning_rate': 9.909397509852347e-06, 'epoch': 0.09}


  9%|▉         | 4700/51760 [19:51<3:43:50,  3.50it/s]

{'loss': 1.056, 'grad_norm': 0.9778143167495728, 'learning_rate': 9.903372468125544e-06, 'epoch': 0.09}


  9%|▉         | 4800/51760 [20:16<3:16:52,  3.98it/s]

{'loss': 1.0771, 'grad_norm': 1.4948114156723022, 'learning_rate': 9.897155442561079e-06, 'epoch': 0.09}


  9%|▉         | 4900/51760 [20:44<3:07:44,  4.16it/s]

{'loss': 1.0117, 'grad_norm': 4.37048864364624, 'learning_rate': 9.890746676576812e-06, 'epoch': 0.09}


 10%|▉         | 5000/51760 [21:10<3:04:58,  4.21it/s]

{'loss': 1.1686, 'grad_norm': 13.912140846252441, 'learning_rate': 9.884146421097895e-06, 'epoch': 0.1}


 10%|▉         | 5100/51760 [21:34<2:56:13,  4.41it/s]

{'loss': 0.9954, 'grad_norm': 5.71315336227417, 'learning_rate': 9.877354934546952e-06, 'epoch': 0.1}


 10%|█         | 5200/51760 [21:58<2:54:44,  4.44it/s]

{'loss': 1.1083, 'grad_norm': 1.9882193803787231, 'learning_rate': 9.870372482833958e-06, 'epoch': 0.1}


 10%|█         | 5300/51760 [22:23<3:14:12,  3.99it/s]

{'loss': 0.9633, 'grad_norm': 1.3576009273529053, 'learning_rate': 9.863199339345828e-06, 'epoch': 0.1}


 10%|█         | 5400/51760 [22:50<3:10:57,  4.05it/s]

{'loss': 1.1131, 'grad_norm': 1.3824787139892578, 'learning_rate': 9.855835784935717e-06, 'epoch': 0.1}


 11%|█         | 5500/51760 [23:14<3:20:32,  3.84it/s]

{'loss': 1.0468, 'grad_norm': 6.962819576263428, 'learning_rate': 9.848282107912015e-06, 'epoch': 0.11}


 11%|█         | 5600/51760 [23:39<3:17:41,  3.89it/s]

{'loss': 0.9556, 'grad_norm': 2.429352045059204, 'learning_rate': 9.840538604027066e-06, 'epoch': 0.11}


 11%|█         | 5700/51760 [24:05<3:02:13,  4.21it/s]

{'loss': 1.0681, 'grad_norm': 5.06346321105957, 'learning_rate': 9.832605576465589e-06, 'epoch': 0.11}


 11%|█         | 5800/51760 [24:30<2:57:30,  4.32it/s]

{'loss': 0.9967, 'grad_norm': 9.446369171142578, 'learning_rate': 9.824483335832801e-06, 'epoch': 0.11}


 11%|█▏        | 5900/51760 [24:54<3:20:04,  3.82it/s]

{'loss': 1.0216, 'grad_norm': 1.1553014516830444, 'learning_rate': 9.816172200142262e-06, 'epoch': 0.11}


 12%|█▏        | 6000/51760 [25:19<3:13:48,  3.94it/s]

{'loss': 1.0977, 'grad_norm': 1.4115875959396362, 'learning_rate': 9.807672494803416e-06, 'epoch': 0.12}


 12%|█▏        | 6100/51760 [25:44<2:54:12,  4.37it/s]

{'loss': 1.0186, 'grad_norm': 2.0021255016326904, 'learning_rate': 9.79898455260886e-06, 'epoch': 0.12}


 12%|█▏        | 6200/51760 [26:09<3:53:47,  3.25it/s]

{'loss': 1.0667, 'grad_norm': 6.782876491546631, 'learning_rate': 9.790108713721311e-06, 'epoch': 0.12}


 12%|█▏        | 6300/51760 [26:33<3:19:30,  3.80it/s]

{'loss': 1.0023, 'grad_norm': 1.1985487937927246, 'learning_rate': 9.781045325660277e-06, 'epoch': 0.12}


 12%|█▏        | 6400/51760 [26:57<2:51:05,  4.42it/s]

{'loss': 0.9816, 'grad_norm': 23.6894474029541, 'learning_rate': 9.771794743288466e-06, 'epoch': 0.12}


 13%|█▎        | 6500/51760 [27:24<2:56:39,  4.27it/s]

{'loss': 0.9857, 'grad_norm': 2.846268892288208, 'learning_rate': 9.762357328797884e-06, 'epoch': 0.13}


 13%|█▎        | 6600/51760 [27:49<3:00:11,  4.18it/s]

{'loss': 1.0373, 'grad_norm': 5.522106170654297, 'learning_rate': 9.752733451695651e-06, 'epoch': 0.13}


 13%|█▎        | 6700/51760 [28:13<3:00:46,  4.15it/s]

{'loss': 0.932, 'grad_norm': 1.7722041606903076, 'learning_rate': 9.742923488789541e-06, 'epoch': 0.13}


 13%|█▎        | 6800/51760 [28:39<2:53:28,  4.32it/s]

{'loss': 1.0569, 'grad_norm': 3.756227493286133, 'learning_rate': 9.732927824173224e-06, 'epoch': 0.13}


 13%|█▎        | 6900/51760 [29:04<2:59:22,  4.17it/s]

{'loss': 1.1039, 'grad_norm': 1.2546463012695312, 'learning_rate': 9.722746849211228e-06, 'epoch': 0.13}


 14%|█▎        | 7000/51760 [29:30<3:04:42,  4.04it/s]

{'loss': 1.0922, 'grad_norm': 1.2800402641296387, 'learning_rate': 9.712380962523617e-06, 'epoch': 0.14}


 14%|█▎        | 7100/51760 [29:55<2:55:39,  4.24it/s]

{'loss': 1.012, 'grad_norm': 4.3302178382873535, 'learning_rate': 9.701830569970382e-06, 'epoch': 0.14}


 14%|█▍        | 7200/51760 [30:19<3:08:45,  3.93it/s]

{'loss': 1.0493, 'grad_norm': 1.277601718902588, 'learning_rate': 9.691096084635551e-06, 'epoch': 0.14}


 14%|█▍        | 7300/51760 [30:45<2:54:57,  4.24it/s]

{'loss': 1.0269, 'grad_norm': 2.6814322471618652, 'learning_rate': 9.680177926811014e-06, 'epoch': 0.14}


 14%|█▍        | 7400/51760 [31:09<2:51:29,  4.31it/s]

{'loss': 1.0299, 'grad_norm': 1.6315338611602783, 'learning_rate': 9.669076523980075e-06, 'epoch': 0.14}


 14%|█▍        | 7500/51760 [31:33<3:07:30,  3.93it/s]

{'loss': 1.0179, 'grad_norm': 1.1566284894943237, 'learning_rate': 9.657792310800699e-06, 'epoch': 0.14}


 15%|█▍        | 7600/51760 [31:58<2:54:56,  4.21it/s]

{'loss': 1.0706, 'grad_norm': 1.608838677406311, 'learning_rate': 9.646325729088508e-06, 'epoch': 0.15}


 15%|█▍        | 7700/51760 [32:23<2:48:05,  4.37it/s]

{'loss': 1.0078, 'grad_norm': 3.3435263633728027, 'learning_rate': 9.634677227799475e-06, 'epoch': 0.15}


 15%|█▌        | 7800/51760 [32:48<2:50:08,  4.31it/s]

{'loss': 1.0762, 'grad_norm': 2.4295310974121094, 'learning_rate': 9.622847263012349e-06, 'epoch': 0.15}


 15%|█▌        | 7900/51760 [33:13<2:46:03,  4.40it/s]

{'loss': 1.0536, 'grad_norm': 3.7548630237579346, 'learning_rate': 9.6108362979108e-06, 'epoch': 0.15}


 15%|█▌        | 8000/51760 [33:38<2:46:56,  4.37it/s]

{'loss': 1.0629, 'grad_norm': 3.638592004776001, 'learning_rate': 9.598644802765273e-06, 'epoch': 0.15}


 16%|█▌        | 8100/51760 [34:02<2:48:13,  4.33it/s]

{'loss': 1.0081, 'grad_norm': 1.4970066547393799, 'learning_rate': 9.586273254914589e-06, 'epoch': 0.16}


 16%|█▌        | 8200/51760 [34:29<4:34:02,  2.65it/s]

{'loss': 1.0078, 'grad_norm': 0.8870835900306702, 'learning_rate': 9.573722138747247e-06, 'epoch': 0.16}


 16%|█▌        | 8300/51760 [34:54<3:13:12,  3.75it/s]

{'loss': 0.9896, 'grad_norm': 1.1705814599990845, 'learning_rate': 9.560991945682466e-06, 'epoch': 0.16}


 16%|█▌        | 8400/51760 [35:22<2:46:40,  4.34it/s] 

{'loss': 1.0647, 'grad_norm': 5.251091480255127, 'learning_rate': 9.54808317415093e-06, 'epoch': 0.16}


 16%|█▋        | 8500/51760 [35:48<3:11:50,  3.76it/s]

{'loss': 1.0282, 'grad_norm': 0.9667012095451355, 'learning_rate': 9.53499632957529e-06, 'epoch': 0.16}


 17%|█▋        | 8600/51760 [36:15<3:44:47,  3.20it/s]

{'loss': 0.9934, 'grad_norm': 5.220272541046143, 'learning_rate': 9.521731924350364e-06, 'epoch': 0.17}


 17%|█▋        | 8700/51760 [36:41<3:23:50,  3.52it/s]

{'loss': 1.0101, 'grad_norm': 1.6452807188034058, 'learning_rate': 9.508290477823079e-06, 'epoch': 0.17}


 17%|█▋        | 8800/51760 [37:10<3:01:11,  3.95it/s]

{'loss': 1.0277, 'grad_norm': 4.229984283447266, 'learning_rate': 9.494672516272132e-06, 'epoch': 0.17}


 17%|█▋        | 8900/51760 [37:37<3:16:29,  3.64it/s]

{'loss': 1.0018, 'grad_norm': 1.792514443397522, 'learning_rate': 9.480878572887392e-06, 'epoch': 0.17}


 17%|█▋        | 9000/51760 [38:03<2:56:18,  4.04it/s]

{'loss': 1.0369, 'grad_norm': 1.214766502380371, 'learning_rate': 9.466909187749022e-06, 'epoch': 0.17}


 18%|█▊        | 9100/51760 [38:28<2:44:06,  4.33it/s]

{'loss': 1.0296, 'grad_norm': 3.1684491634368896, 'learning_rate': 9.452764907806326e-06, 'epoch': 0.18}


 18%|█▊        | 9200/51760 [38:57<3:32:14,  3.34it/s] 

{'loss': 1.0589, 'grad_norm': 1.7605228424072266, 'learning_rate': 9.43844628685634e-06, 'epoch': 0.18}


 18%|█▊        | 9300/51760 [39:25<3:28:35,  3.39it/s]

{'loss': 1.0, 'grad_norm': 1.3227975368499756, 'learning_rate': 9.423953885522153e-06, 'epoch': 0.18}


 18%|█▊        | 9400/51760 [39:50<2:55:49,  4.02it/s]

{'loss': 1.1423, 'grad_norm': 1.4003175497055054, 'learning_rate': 9.409288271230941e-06, 'epoch': 0.18}


 18%|█▊        | 9500/51760 [40:17<3:22:15,  3.48it/s]

{'loss': 1.0334, 'grad_norm': 1.0144197940826416, 'learning_rate': 9.394450018191774e-06, 'epoch': 0.18}


 19%|█▊        | 9600/51760 [40:45<2:59:55,  3.91it/s]

{'loss': 1.0631, 'grad_norm': 1.1752545833587646, 'learning_rate': 9.37943970737311e-06, 'epoch': 0.19}


 19%|█▊        | 9700/51760 [41:11<2:51:26,  4.09it/s]

{'loss': 0.9579, 'grad_norm': 2.9859821796417236, 'learning_rate': 9.364257926480066e-06, 'epoch': 0.19}


 19%|█▉        | 9800/51760 [41:38<2:46:41,  4.20it/s]

{'loss': 1.0556, 'grad_norm': 1.8670536279678345, 'learning_rate': 9.348905269931394e-06, 'epoch': 0.19}


 19%|█▉        | 9900/51760 [42:04<3:33:01,  3.28it/s]

{'loss': 1.2316, 'grad_norm': 0.9996125102043152, 'learning_rate': 9.333382338836218e-06, 'epoch': 0.19}


 19%|█▉        | 10000/51760 [42:29<3:26:33,  3.37it/s]

{'loss': 1.0409, 'grad_norm': 3.5763559341430664, 'learning_rate': 9.317689740970498e-06, 'epoch': 0.19}


 20%|█▉        | 10100/51760 [42:56<2:44:48,  4.21it/s]

{'loss': 1.0184, 'grad_norm': 1.444985032081604, 'learning_rate': 9.301828090753214e-06, 'epoch': 0.2}


 20%|█▉        | 10200/51760 [43:21<2:43:22,  4.24it/s]

{'loss': 1.0786, 'grad_norm': 1.731100082397461, 'learning_rate': 9.285798009222344e-06, 'epoch': 0.2}


 20%|█▉        | 10300/51760 [43:48<3:36:34,  3.19it/s]

{'loss': 1.1189, 'grad_norm': 3.9121477603912354, 'learning_rate': 9.269600124010516e-06, 'epoch': 0.2}


 20%|██        | 10400/51760 [44:13<2:49:59,  4.06it/s]

{'loss': 1.0115, 'grad_norm': 1.4093693494796753, 'learning_rate': 9.253235069320454e-06, 'epoch': 0.2}


 20%|██        | 10500/51760 [44:39<2:57:28,  3.87it/s]

{'loss': 0.9683, 'grad_norm': 8.88468074798584, 'learning_rate': 9.236703485900136e-06, 'epoch': 0.2}


 20%|██        | 10600/51760 [45:06<3:40:28,  3.11it/s]

{'loss': 1.0021, 'grad_norm': 1.8616397380828857, 'learning_rate': 9.220006021017713e-06, 'epoch': 0.2}


 21%|██        | 10700/51760 [45:32<2:59:27,  3.81it/s]

{'loss': 1.016, 'grad_norm': 1.424534797668457, 'learning_rate': 9.203143328436164e-06, 'epoch': 0.21}


 21%|██        | 10800/51760 [45:59<2:47:25,  4.08it/s]

{'loss': 1.0384, 'grad_norm': 2.422452211380005, 'learning_rate': 9.186116068387696e-06, 'epoch': 0.21}


 21%|██        | 10900/51760 [46:25<2:41:03,  4.23it/s]

{'loss': 1.0954, 'grad_norm': 2.897380828857422, 'learning_rate': 9.168924907547898e-06, 'epoch': 0.21}


 21%|██▏       | 11000/51760 [46:52<2:52:12,  3.94it/s]

{'loss': 0.919, 'grad_norm': 5.081881523132324, 'learning_rate': 9.15157051900964e-06, 'epoch': 0.21}


 21%|██▏       | 11100/51760 [47:19<2:49:48,  3.99it/s]

{'loss': 1.0098, 'grad_norm': 1.4248385429382324, 'learning_rate': 9.134053582256712e-06, 'epoch': 0.21}


 22%|██▏       | 11200/51760 [47:44<2:56:10,  3.84it/s]

{'loss': 0.9226, 'grad_norm': 1.16222083568573, 'learning_rate': 9.11637478313722e-06, 'epoch': 0.22}


 22%|██▏       | 11300/51760 [48:11<2:54:31,  3.86it/s]

{'loss': 0.9863, 'grad_norm': 1.2566906213760376, 'learning_rate': 9.098534813836746e-06, 'epoch': 0.22}


 22%|██▏       | 11400/51760 [48:37<3:03:10,  3.67it/s]

{'loss': 0.9577, 'grad_norm': 2.0047106742858887, 'learning_rate': 9.080534372851229e-06, 'epoch': 0.22}


 22%|██▏       | 11500/51760 [49:06<2:43:43,  4.10it/s] 

{'loss': 1.1209, 'grad_norm': 3.8695132732391357, 'learning_rate': 9.062374164959628e-06, 'epoch': 0.22}


 22%|██▏       | 11600/51760 [49:33<4:26:37,  2.51it/s]

{'loss': 1.0128, 'grad_norm': 0.9144818186759949, 'learning_rate': 9.044054901196325e-06, 'epoch': 0.22}


 23%|██▎       | 11700/51760 [49:59<2:47:42,  3.98it/s]

{'loss': 0.9303, 'grad_norm': 1.2337989807128906, 'learning_rate': 9.025577298823279e-06, 'epoch': 0.23}


 23%|██▎       | 11800/51760 [50:26<4:39:06,  2.39it/s]

{'loss': 0.995, 'grad_norm': 0.8845030069351196, 'learning_rate': 9.006942081301953e-06, 'epoch': 0.23}


 23%|██▎       | 11900/51760 [50:52<3:16:01,  3.39it/s]

{'loss': 1.1598, 'grad_norm': 2.3374006748199463, 'learning_rate': 8.988149978264983e-06, 'epoch': 0.23}


 23%|██▎       | 12000/51760 [51:18<2:48:15,  3.94it/s]

{'loss': 1.0971, 'grad_norm': 2.2450053691864014, 'learning_rate': 8.969201725487612e-06, 'epoch': 0.23}


 23%|██▎       | 12100/51760 [51:43<2:56:42,  3.74it/s]

{'loss': 1.0388, 'grad_norm': 1.0502005815505981, 'learning_rate': 8.950098064858875e-06, 'epoch': 0.23}


 24%|██▎       | 12200/51760 [52:10<2:31:57,  4.34it/s]

{'loss': 0.9837, 'grad_norm': 2.556234121322632, 'learning_rate': 8.930839744352562e-06, 'epoch': 0.24}


 24%|██▍       | 12300/51760 [52:36<2:42:41,  4.04it/s]

{'loss': 0.9971, 'grad_norm': 1.4610170125961304, 'learning_rate': 8.911427517997926e-06, 'epoch': 0.24}


 24%|██▍       | 12400/51760 [53:01<2:38:15,  4.15it/s]

{'loss': 1.0327, 'grad_norm': 5.629349708557129, 'learning_rate': 8.891862145850159e-06, 'epoch': 0.24}


 24%|██▍       | 12500/51760 [53:27<3:23:11,  3.22it/s]

{'loss': 1.0073, 'grad_norm': 1.7364583015441895, 'learning_rate': 8.87214439396064e-06, 'epoch': 0.24}


 24%|██▍       | 12600/51760 [53:54<3:05:04,  3.53it/s]

{'loss': 1.0602, 'grad_norm': 1.0231504440307617, 'learning_rate': 8.852275034346935e-06, 'epoch': 0.24}


 25%|██▍       | 12700/51760 [54:19<2:55:07,  3.72it/s]

{'loss': 1.0487, 'grad_norm': 2.138759136199951, 'learning_rate': 8.832254844962571e-06, 'epoch': 0.25}


 25%|██▍       | 12800/51760 [54:45<2:35:20,  4.18it/s]

{'loss': 1.1868, 'grad_norm': 7.712104797363281, 'learning_rate': 8.812084609666578e-06, 'epoch': 0.25}


 25%|██▍       | 12900/51760 [55:10<2:32:42,  4.24it/s]

{'loss': 1.0538, 'grad_norm': 3.2621448040008545, 'learning_rate': 8.7917651181928e-06, 'epoch': 0.25}


 25%|██▌       | 13000/51760 [55:36<2:34:53,  4.17it/s]

{'loss': 1.0696, 'grad_norm': 1.360893726348877, 'learning_rate': 8.771297166118967e-06, 'epoch': 0.25}


 25%|██▌       | 13100/51760 [56:04<3:17:26,  3.26it/s]

{'loss': 1.0127, 'grad_norm': 0.8232952356338501, 'learning_rate': 8.750681554835556e-06, 'epoch': 0.25}


 26%|██▌       | 13200/51760 [56:30<2:48:50,  3.81it/s]

{'loss': 1.0059, 'grad_norm': 6.7120466232299805, 'learning_rate': 8.729919091514405e-06, 'epoch': 0.26}


 26%|██▌       | 13300/51760 [56:56<2:43:30,  3.92it/s]

{'loss': 1.0431, 'grad_norm': 1.1787264347076416, 'learning_rate': 8.709010589077113e-06, 'epoch': 0.26}


 26%|██▌       | 13400/51760 [57:22<2:46:13,  3.85it/s]

{'loss': 1.0372, 'grad_norm': 1.2047021389007568, 'learning_rate': 8.68795686616321e-06, 'epoch': 0.26}


 26%|██▌       | 13500/51760 [57:48<2:32:35,  4.18it/s]

{'loss': 1.0637, 'grad_norm': 8.513731002807617, 'learning_rate': 8.666758747098108e-06, 'epoch': 0.26}


 26%|██▋       | 13600/51760 [58:14<2:48:40,  3.77it/s]

{'loss': 1.1087, 'grad_norm': 2.9558067321777344, 'learning_rate': 8.645417061860822e-06, 'epoch': 0.26}


 26%|██▋       | 13700/51760 [58:41<3:30:01,  3.02it/s]

{'loss': 1.0133, 'grad_norm': 0.9178600311279297, 'learning_rate': 8.623932646051471e-06, 'epoch': 0.26}


 27%|██▋       | 13800/51760 [59:07<2:37:35,  4.01it/s]

{'loss': 1.1397, 'grad_norm': 2.5727012157440186, 'learning_rate': 8.60230634085857e-06, 'epoch': 0.27}


 27%|██▋       | 13900/51760 [59:33<2:44:41,  3.83it/s]

{'loss': 0.9794, 'grad_norm': 1.0007250308990479, 'learning_rate': 8.58053899302609e-06, 'epoch': 0.27}


 27%|██▋       | 14000/51760 [59:58<2:36:58,  4.01it/s]

{'loss': 1.0481, 'grad_norm': 9.134968757629395, 'learning_rate': 8.5586314548203e-06, 'epoch': 0.27}


 27%|██▋       | 14100/51760 [1:00:24<2:35:25,  4.04it/s]

{'loss': 1.0857, 'grad_norm': 5.399421215057373, 'learning_rate': 8.536584583996408e-06, 'epoch': 0.27}


 27%|██▋       | 14200/51760 [1:00:48<2:34:47,  4.04it/s]

{'loss': 0.9721, 'grad_norm': 1.480074405670166, 'learning_rate': 8.514399243764967e-06, 'epoch': 0.27}


 28%|██▊       | 14300/51760 [1:01:15<2:36:25,  3.99it/s]

{'loss': 0.9786, 'grad_norm': 1.8385711908340454, 'learning_rate': 8.492076302758085e-06, 'epoch': 0.28}


 28%|██▊       | 14400/51760 [1:01:40<2:21:34,  4.40it/s]

{'loss': 1.0293, 'grad_norm': 3.6387698650360107, 'learning_rate': 8.46961663499541e-06, 'epoch': 0.28}


 28%|██▊       | 14500/51760 [1:02:06<2:21:44,  4.38it/s]

{'loss': 1.0001, 'grad_norm': 2.483315944671631, 'learning_rate': 8.447021119849912e-06, 'epoch': 0.28}


 28%|██▊       | 14600/51760 [1:02:32<2:35:38,  3.98it/s]

{'loss': 0.9382, 'grad_norm': 4.836645126342773, 'learning_rate': 8.424290642013451e-06, 'epoch': 0.28}


 28%|██▊       | 14700/51760 [1:02:58<2:36:18,  3.95it/s]

{'loss': 0.9963, 'grad_norm': 2.9723281860351562, 'learning_rate': 8.40142609146214e-06, 'epoch': 0.28}


 29%|██▊       | 14800/51760 [1:03:24<2:49:42,  3.63it/s]

{'loss': 1.0668, 'grad_norm': 1.3963384628295898, 'learning_rate': 8.378428363421498e-06, 'epoch': 0.29}


 29%|██▉       | 14900/51760 [1:03:50<2:36:51,  3.92it/s]

{'loss': 1.0517, 'grad_norm': 4.517414093017578, 'learning_rate': 8.355298358331399e-06, 'epoch': 0.29}


 29%|██▉       | 15000/51760 [1:04:15<2:24:28,  4.24it/s]

{'loss': 1.1074, 'grad_norm': 1.6690970659255981, 'learning_rate': 8.332036981810817e-06, 'epoch': 0.29}


 29%|██▉       | 15100/51760 [1:04:40<2:39:27,  3.83it/s]

{'loss': 0.9737, 'grad_norm': 1.6699795722961426, 'learning_rate': 8.308645144622363e-06, 'epoch': 0.29}


 29%|██▉       | 15200/51760 [1:05:07<2:46:09,  3.67it/s]

{'loss': 1.0201, 'grad_norm': 2.398540496826172, 'learning_rate': 8.285123762636642e-06, 'epoch': 0.29}


 30%|██▉       | 15300/51760 [1:05:33<2:20:12,  4.33it/s]

{'loss': 1.0116, 'grad_norm': 11.259496688842773, 'learning_rate': 8.261473756796369e-06, 'epoch': 0.3}


 30%|██▉       | 15400/51760 [1:05:59<2:19:44,  4.34it/s]

{'loss': 1.1024, 'grad_norm': 5.441864967346191, 'learning_rate': 8.237696053080334e-06, 'epoch': 0.3}


 30%|██▉       | 15500/51760 [1:06:25<2:41:42,  3.74it/s]

{'loss': 0.9735, 'grad_norm': 2.550943374633789, 'learning_rate': 8.213791582467127e-06, 'epoch': 0.3}


 30%|███       | 15600/51760 [1:06:52<2:22:35,  4.23it/s]

{'loss': 1.0543, 'grad_norm': 2.0834009647369385, 'learning_rate': 8.189761280898705e-06, 'epoch': 0.3}


 30%|███       | 15700/51760 [1:07:21<2:49:07,  3.55it/s]

{'loss': 1.056, 'grad_norm': 2.397620439529419, 'learning_rate': 8.165606089243732e-06, 'epoch': 0.3}


 31%|███       | 15800/51760 [1:07:47<2:26:04,  4.10it/s]

{'loss': 1.0616, 'grad_norm': 3.758749008178711, 'learning_rate': 8.14132695326075e-06, 'epoch': 0.31}


 31%|███       | 15900/51760 [1:08:15<2:34:40,  3.86it/s]

{'loss': 1.1377, 'grad_norm': 1.4240598678588867, 'learning_rate': 8.116924823561147e-06, 'epoch': 0.31}


 31%|███       | 16000/51760 [1:08:41<2:25:14,  4.10it/s]

{'loss': 0.9976, 'grad_norm': 1.2983824014663696, 'learning_rate': 8.092400655571933e-06, 'epoch': 0.31}


 31%|███       | 16100/51760 [1:09:07<3:01:55,  3.27it/s]

{'loss': 1.0299, 'grad_norm': 1.4972482919692993, 'learning_rate': 8.06775540949834e-06, 'epoch': 0.31}


 31%|███▏      | 16200/51760 [1:09:33<2:38:38,  3.74it/s]

{'loss': 1.0673, 'grad_norm': 0.9893796443939209, 'learning_rate': 8.042990050286217e-06, 'epoch': 0.31}


 31%|███▏      | 16300/51760 [1:10:00<3:32:53,  2.78it/s]

{'loss': 1.0546, 'grad_norm': 1.3382627964019775, 'learning_rate': 8.018105547584258e-06, 'epoch': 0.31}


 32%|███▏      | 16400/51760 [1:10:28<2:28:34,  3.97it/s]

{'loss': 0.9291, 'grad_norm': 7.0267744064331055, 'learning_rate': 7.993102875706031e-06, 'epoch': 0.32}


 32%|███▏      | 16500/51760 [1:10:55<2:47:30,  3.51it/s]

{'loss': 1.0314, 'grad_norm': 1.6383103132247925, 'learning_rate': 7.967983013591834e-06, 'epoch': 0.32}


 32%|███▏      | 16600/51760 [1:11:21<2:42:25,  3.61it/s]

{'loss': 1.0436, 'grad_norm': 2.026052236557007, 'learning_rate': 7.942746944770361e-06, 'epoch': 0.32}


 32%|███▏      | 16700/51760 [1:11:49<3:41:39,  2.64it/s]

{'loss': 1.015, 'grad_norm': 1.9835214614868164, 'learning_rate': 7.9173956573202e-06, 'epoch': 0.32}


 32%|███▏      | 16800/51760 [1:12:16<2:16:51,  4.26it/s]

{'loss': 0.9994, 'grad_norm': 1.6608599424362183, 'learning_rate': 7.891930143831141e-06, 'epoch': 0.32}


 33%|███▎      | 16900/51760 [1:12:41<2:34:27,  3.76it/s]

{'loss': 0.9368, 'grad_norm': 1.0213977098464966, 'learning_rate': 7.86635140136532e-06, 'epoch': 0.33}


 33%|███▎      | 17000/51760 [1:13:07<2:27:04,  3.94it/s]

{'loss': 1.0766, 'grad_norm': 4.325002670288086, 'learning_rate': 7.840660431418168e-06, 'epoch': 0.33}


 33%|███▎      | 17100/51760 [1:13:33<2:21:00,  4.10it/s]

{'loss': 0.9598, 'grad_norm': 2.0235683917999268, 'learning_rate': 7.814858239879211e-06, 'epoch': 0.33}


 33%|███▎      | 17200/51760 [1:13:59<2:17:01,  4.20it/s]

{'loss': 1.0327, 'grad_norm': 1.2610576152801514, 'learning_rate': 7.788945836992678e-06, 'epoch': 0.33}


 33%|███▎      | 17300/51760 [1:14:25<2:32:55,  3.76it/s]

{'loss': 1.0215, 'grad_norm': 2.053598403930664, 'learning_rate': 7.76292423731796e-06, 'epoch': 0.33}


 34%|███▎      | 17400/51760 [1:14:52<2:25:32,  3.93it/s]

{'loss': 1.0098, 'grad_norm': 6.406527042388916, 'learning_rate': 7.736794459689861e-06, 'epoch': 0.34}


 34%|███▍      | 17500/51760 [1:15:17<2:16:16,  4.19it/s]

{'loss': 1.0391, 'grad_norm': 1.6416798830032349, 'learning_rate': 7.710557527178737e-06, 'epoch': 0.34}


 34%|███▍      | 17600/51760 [1:15:43<2:34:02,  3.70it/s]

{'loss': 0.9901, 'grad_norm': 1.9212000370025635, 'learning_rate': 7.684214467050415e-06, 'epoch': 0.34}


 34%|███▍      | 17700/51760 [1:16:09<2:20:55,  4.03it/s]

{'loss': 1.0337, 'grad_norm': 2.8207316398620605, 'learning_rate': 7.657766310725987e-06, 'epoch': 0.34}


 34%|███▍      | 17800/51760 [1:16:34<2:25:17,  3.90it/s]

{'loss': 1.0748, 'grad_norm': 0.5187485814094543, 'learning_rate': 7.631214093741422e-06, 'epoch': 0.34}


 35%|███▍      | 17900/51760 [1:17:00<2:35:50,  3.62it/s]

{'loss': 1.0289, 'grad_norm': 1.4089248180389404, 'learning_rate': 7.604558855707014e-06, 'epoch': 0.35}


 35%|███▍      | 18000/51760 [1:17:25<2:16:03,  4.14it/s]

{'loss': 1.1563, 'grad_norm': 1.4546622037887573, 'learning_rate': 7.5778016402666906e-06, 'epoch': 0.35}


 35%|███▍      | 18100/51760 [1:17:52<2:22:05,  3.95it/s]

{'loss': 1.0675, 'grad_norm': 8.601799011230469, 'learning_rate': 7.550943495057145e-06, 'epoch': 0.35}


 35%|███▌      | 18200/51760 [1:18:18<2:12:33,  4.22it/s]

{'loss': 0.8809, 'grad_norm': 1.8501768112182617, 'learning_rate': 7.52398547166681e-06, 'epoch': 0.35}


 35%|███▌      | 18300/51760 [1:18:44<2:14:15,  4.15it/s]

{'loss': 1.035, 'grad_norm': 3.202906608581543, 'learning_rate': 7.4969286255946994e-06, 'epoch': 0.35}


 36%|███▌      | 18400/51760 [1:19:09<2:22:37,  3.90it/s]

{'loss': 1.0622, 'grad_norm': 6.586977958679199, 'learning_rate': 7.469774016209067e-06, 'epoch': 0.36}


 36%|███▌      | 18500/51760 [1:19:36<2:16:30,  4.06it/s]

{'loss': 1.0702, 'grad_norm': 13.059073448181152, 'learning_rate': 7.442522706705942e-06, 'epoch': 0.36}


 36%|███▌      | 18600/51760 [1:20:02<2:14:30,  4.11it/s]

{'loss': 1.0342, 'grad_norm': 1.8679215908050537, 'learning_rate': 7.415175764067487e-06, 'epoch': 0.36}


 36%|███▌      | 18700/51760 [1:20:28<2:09:21,  4.26it/s]

{'loss': 1.0018, 'grad_norm': 1.5867127180099487, 'learning_rate': 7.387734259020236e-06, 'epoch': 0.36}


 36%|███▋      | 18800/51760 [1:20:55<2:13:04,  4.13it/s]

{'loss': 1.0696, 'grad_norm': 12.77673625946045, 'learning_rate': 7.360199265993162e-06, 'epoch': 0.36}


 37%|███▋      | 18900/51760 [1:21:19<2:22:22,  3.85it/s]

{'loss': 1.0298, 'grad_norm': 1.0704859495162964, 'learning_rate': 7.332571863075615e-06, 'epoch': 0.37}


 37%|███▋      | 19000/51760 [1:21:45<2:18:12,  3.95it/s]

{'loss': 1.1575, 'grad_norm': 2.4643373489379883, 'learning_rate': 7.304853131975105e-06, 'epoch': 0.37}


 37%|███▋      | 19100/51760 [1:22:10<2:09:07,  4.22it/s]

{'loss': 1.0131, 'grad_norm': 0.09547923505306244, 'learning_rate': 7.2770441579749594e-06, 'epoch': 0.37}


 37%|███▋      | 19200/51760 [1:22:37<2:10:46,  4.15it/s]

{'loss': 0.9312, 'grad_norm': 4.832937717437744, 'learning_rate': 7.249146029891821e-06, 'epoch': 0.37}


 37%|███▋      | 19300/51760 [1:23:02<2:35:59,  3.47it/s]

{'loss': 1.1034, 'grad_norm': 1.6711071729660034, 'learning_rate': 7.2211598400330195e-06, 'epoch': 0.37}


 37%|███▋      | 19400/51760 [1:23:30<2:44:55,  3.27it/s]

{'loss': 1.0292, 'grad_norm': 2.4000818729400635, 'learning_rate': 7.193086684153805e-06, 'epoch': 0.37}


 38%|███▊      | 19500/51760 [1:23:55<2:07:28,  4.22it/s]

{'loss': 1.0361, 'grad_norm': 1.359908103942871, 'learning_rate': 7.164927661414448e-06, 'epoch': 0.38}


 38%|███▊      | 19600/51760 [1:24:20<2:11:24,  4.08it/s]

{'loss': 0.9041, 'grad_norm': 2.32682728767395, 'learning_rate': 7.1366838743372e-06, 'epoch': 0.38}


 38%|███▊      | 19700/51760 [1:24:48<2:45:25,  3.23it/s]

{'loss': 1.0177, 'grad_norm': 1.1860387325286865, 'learning_rate': 7.1083564287631245e-06, 'epoch': 0.38}


 38%|███▊      | 19800/51760 [1:25:14<2:19:06,  3.83it/s]

{'loss': 1.0527, 'grad_norm': 1.1567243337631226, 'learning_rate': 7.079946433808805e-06, 'epoch': 0.38}


 38%|███▊      | 19900/51760 [1:25:40<2:24:08,  3.68it/s]

{'loss': 1.1016, 'grad_norm': 23.279645919799805, 'learning_rate': 7.051455001822912e-06, 'epoch': 0.38}


 39%|███▊      | 20000/51760 [1:26:07<2:46:35,  3.18it/s]

{'loss': 1.0025, 'grad_norm': 0.6153322458267212, 'learning_rate': 7.0228832483426624e-06, 'epoch': 0.39}


 39%|███▉      | 20100/51760 [1:26:34<2:55:03,  3.01it/s]

{'loss': 1.0192, 'grad_norm': 3.4533441066741943, 'learning_rate': 6.994232292050126e-06, 'epoch': 0.39}


 39%|███▉      | 20200/51760 [1:27:00<2:18:42,  3.79it/s]

{'loss': 0.9735, 'grad_norm': 1.6784921884536743, 'learning_rate': 6.9655032547284405e-06, 'epoch': 0.39}


 39%|███▉      | 20300/51760 [1:27:28<2:23:18,  3.66it/s]

{'loss': 1.0007, 'grad_norm': 3.7512941360473633, 'learning_rate': 6.93669726121788e-06, 'epoch': 0.39}


 39%|███▉      | 20400/51760 [1:27:53<2:41:16,  3.24it/s]

{'loss': 1.1297, 'grad_norm': 0.7577080726623535, 'learning_rate': 6.907815439371819e-06, 'epoch': 0.39}


 40%|███▉      | 20500/51760 [1:28:20<2:01:25,  4.29it/s]

{'loss': 0.9184, 'grad_norm': 6.683237075805664, 'learning_rate': 6.878858920012571e-06, 'epoch': 0.4}


 40%|███▉      | 20600/51760 [1:28:47<2:27:53,  3.51it/s]

{'loss': 0.9766, 'grad_norm': 3.116950511932373, 'learning_rate': 6.849828836887111e-06, 'epoch': 0.4}


 40%|███▉      | 20700/51760 [1:29:13<2:47:21,  3.09it/s]

{'loss': 0.9398, 'grad_norm': 3.4626569747924805, 'learning_rate': 6.8207263266226935e-06, 'epoch': 0.4}


 40%|████      | 20800/51760 [1:29:38<2:18:22,  3.73it/s]

{'loss': 1.0831, 'grad_norm': 2.8497958183288574, 'learning_rate': 6.791552528682333e-06, 'epoch': 0.4}


 40%|████      | 20900/51760 [1:30:06<2:02:21,  4.20it/s]

{'loss': 1.0486, 'grad_norm': 3.9715628623962402, 'learning_rate': 6.762308585320214e-06, 'epoch': 0.4}


 41%|████      | 21000/51760 [1:30:33<3:22:17,  2.53it/s]

{'loss': 0.9823, 'grad_norm': 1.0855190753936768, 'learning_rate': 6.7329956415369455e-06, 'epoch': 0.41}


 41%|████      | 21100/51760 [1:31:00<2:54:32,  2.93it/s]

{'loss': 1.0347, 'grad_norm': 2.093467950820923, 'learning_rate': 6.703614845034742e-06, 'epoch': 0.41}


 41%|████      | 21200/51760 [1:31:28<2:22:36,  3.57it/s]

{'loss': 1.0163, 'grad_norm': 1.96498441696167, 'learning_rate': 6.674167346172484e-06, 'epoch': 0.41}


 41%|████      | 21300/51760 [1:31:54<2:13:40,  3.80it/s]

{'loss': 1.0474, 'grad_norm': 3.371697425842285, 'learning_rate': 6.644654297920679e-06, 'epoch': 0.41}


 41%|████▏     | 21400/51760 [1:32:21<2:03:30,  4.10it/s]

{'loss': 0.9683, 'grad_norm': 4.306558132171631, 'learning_rate': 6.615076855816317e-06, 'epoch': 0.41}


 42%|████▏     | 21500/51760 [1:32:47<2:08:02,  3.94it/s]

{'loss': 1.0506, 'grad_norm': 1.8426672220230103, 'learning_rate': 6.585436177917631e-06, 'epoch': 0.42}


 42%|████▏     | 21600/51760 [1:33:13<2:08:22,  3.92it/s]

{'loss': 0.9535, 'grad_norm': 8.368570327758789, 'learning_rate': 6.555733424758746e-06, 'epoch': 0.42}


 42%|████▏     | 21700/51760 [1:33:38<2:01:51,  4.11it/s]

{'loss': 1.0344, 'grad_norm': 1.624140739440918, 'learning_rate': 6.525969759304252e-06, 'epoch': 0.42}


 42%|████▏     | 21800/51760 [1:34:06<2:52:34,  2.89it/s]

{'loss': 1.0149, 'grad_norm': 1.1973870992660522, 'learning_rate': 6.49614634690366e-06, 'epoch': 0.42}


 42%|████▏     | 21900/51760 [1:34:32<2:11:11,  3.79it/s]

{'loss': 1.0455, 'grad_norm': 1.5394892692565918, 'learning_rate': 6.466264355245784e-06, 'epoch': 0.42}


 43%|████▎     | 22000/51760 [1:34:59<2:16:05,  3.64it/s]

{'loss': 1.0719, 'grad_norm': 1.1484415531158447, 'learning_rate': 6.436324954313008e-06, 'epoch': 0.43}


 43%|████▎     | 22100/51760 [1:35:25<2:15:44,  3.64it/s]

{'loss': 0.9956, 'grad_norm': 1.1093446016311646, 'learning_rate': 6.406329316335494e-06, 'epoch': 0.43}


 43%|████▎     | 22200/51760 [1:35:51<2:00:12,  4.10it/s]

{'loss': 1.0316, 'grad_norm': 1.3846062421798706, 'learning_rate': 6.376278615745275e-06, 'epoch': 0.43}


 43%|████▎     | 22300/51760 [1:36:18<2:00:08,  4.09it/s]

{'loss': 1.0077, 'grad_norm': 1.1120661497116089, 'learning_rate': 6.346174029130271e-06, 'epoch': 0.43}


 43%|████▎     | 22400/51760 [1:36:45<2:06:48,  3.86it/s]

{'loss': 0.9732, 'grad_norm': 1.4052293300628662, 'learning_rate': 6.316016735188228e-06, 'epoch': 0.43}


 43%|████▎     | 22500/51760 [1:37:11<2:08:34,  3.79it/s]

{'loss': 1.0306, 'grad_norm': 0.8775006532669067, 'learning_rate': 6.285807914680563e-06, 'epoch': 0.43}


 44%|████▎     | 22600/51760 [1:37:38<2:24:33,  3.36it/s]

{'loss': 0.9904, 'grad_norm': 1.3939704895019531, 'learning_rate': 6.255548750386137e-06, 'epoch': 0.44}


 44%|████▍     | 22700/51760 [1:38:06<1:55:01,  4.21it/s]

{'loss': 0.9644, 'grad_norm': 3.91607666015625, 'learning_rate': 6.225240427054935e-06, 'epoch': 0.44}


 44%|████▍     | 22800/51760 [1:38:32<1:56:33,  4.14it/s]

{'loss': 0.9996, 'grad_norm': 7.027872085571289, 'learning_rate': 6.194884131361697e-06, 'epoch': 0.44}


 44%|████▍     | 22900/51760 [1:38:58<2:01:06,  3.97it/s]

{'loss': 0.9644, 'grad_norm': 3.46524977684021, 'learning_rate': 6.164481051859439e-06, 'epoch': 0.44}


 44%|████▍     | 23001/51760 [1:39:25<1:48:21,  4.42it/s]

{'loss': 0.9981, 'grad_norm': 1.4566996097564697, 'learning_rate': 6.134032378932928e-06, 'epoch': 0.44}


 45%|████▍     | 23100/51760 [1:39:52<2:39:49,  2.99it/s]

{'loss': 1.0224, 'grad_norm': 1.8077151775360107, 'learning_rate': 6.103539304752063e-06, 'epoch': 0.45}


 45%|████▍     | 23200/51760 [1:40:17<2:02:47,  3.88it/s]

{'loss': 1.049, 'grad_norm': 7.864938259124756, 'learning_rate': 6.073003023225211e-06, 'epoch': 0.45}


 45%|████▌     | 23300/51760 [1:40:43<1:48:37,  4.37it/s]

{'loss': 1.0654, 'grad_norm': 1.5363982915878296, 'learning_rate': 6.0424247299524515e-06, 'epoch': 0.45}


 45%|████▌     | 23400/51760 [1:41:09<2:02:08,  3.87it/s]

{'loss': 1.0418, 'grad_norm': 3.838644504547119, 'learning_rate': 6.011805622178769e-06, 'epoch': 0.45}


 45%|████▌     | 23500/51760 [1:41:36<2:04:51,  3.77it/s]

{'loss': 1.008, 'grad_norm': 8.07568073272705, 'learning_rate': 5.981146898747172e-06, 'epoch': 0.45}


 46%|████▌     | 23600/51760 [1:42:03<2:04:30,  3.77it/s]

{'loss': 1.078, 'grad_norm': 1.901963233947754, 'learning_rate': 5.950449760051764e-06, 'epoch': 0.46}


 46%|████▌     | 23700/51760 [1:42:31<2:12:03,  3.54it/s]

{'loss': 1.0514, 'grad_norm': 0.9894389510154724, 'learning_rate': 5.9197154079907346e-06, 'epoch': 0.46}


 46%|████▌     | 23800/51760 [1:43:00<2:01:07,  3.85it/s]

{'loss': 0.997, 'grad_norm': 7.002289772033691, 'learning_rate': 5.888945045919303e-06, 'epoch': 0.46}


 46%|████▌     | 23900/51760 [1:43:26<2:13:29,  3.48it/s]

{'loss': 0.9949, 'grad_norm': 1.087924599647522, 'learning_rate': 5.858139878602604e-06, 'epoch': 0.46}


 46%|████▋     | 24000/51760 [1:43:53<1:52:24,  4.12it/s]

{'loss': 1.0067, 'grad_norm': 1.5887664556503296, 'learning_rate': 5.827301112168519e-06, 'epoch': 0.46}


 47%|████▋     | 24100/51760 [1:44:18<1:45:28,  4.37it/s]

{'loss': 1.0025, 'grad_norm': 1.713820457458496, 'learning_rate': 5.7964299540604495e-06, 'epoch': 0.47}


 47%|████▋     | 24200/51760 [1:44:45<2:16:20,  3.37it/s]

{'loss': 0.9658, 'grad_norm': 1.7419638633728027, 'learning_rate': 5.765527612990041e-06, 'epoch': 0.47}


 47%|████▋     | 24300/51760 [1:45:11<1:59:38,  3.83it/s]

{'loss': 1.1085, 'grad_norm': 1.0395886898040771, 'learning_rate': 5.734595298889858e-06, 'epoch': 0.47}


 47%|████▋     | 24400/51760 [1:45:37<1:54:29,  3.98it/s]

{'loss': 1.0082, 'grad_norm': 1.238474726676941, 'learning_rate': 5.703634222866014e-06, 'epoch': 0.47}


 47%|████▋     | 24500/51760 [1:46:05<2:00:44,  3.76it/s]

{'loss': 1.045, 'grad_norm': 3.069056510925293, 'learning_rate': 5.67264559715075e-06, 'epoch': 0.47}


 48%|████▊     | 24600/51760 [1:46:31<1:55:32,  3.92it/s]

{'loss': 0.9738, 'grad_norm': 3.888277053833008, 'learning_rate': 5.641630635054971e-06, 'epoch': 0.48}


 48%|████▊     | 24700/51760 [1:46:57<1:58:54,  3.79it/s]

{'loss': 1.0522, 'grad_norm': 1.4608359336853027, 'learning_rate': 5.610590550920742e-06, 'epoch': 0.48}


 48%|████▊     | 24800/51760 [1:47:22<2:01:39,  3.69it/s]

{'loss': 0.9998, 'grad_norm': 1.1324764490127563, 'learning_rate': 5.579526560073741e-06, 'epoch': 0.48}


 48%|████▊     | 24900/51760 [1:47:49<1:48:21,  4.13it/s]

{'loss': 1.0425, 'grad_norm': 1.2844445705413818, 'learning_rate': 5.548439878775679e-06, 'epoch': 0.48}


 48%|████▊     | 25000/51760 [1:48:15<1:58:53,  3.75it/s]

{'loss': 0.9926, 'grad_norm': 6.423619270324707, 'learning_rate': 5.517331724176671e-06, 'epoch': 0.48}


 48%|████▊     | 25100/51760 [1:48:41<1:57:05,  3.79it/s]

{'loss': 1.0085, 'grad_norm': 13.288844108581543, 'learning_rate': 5.486203314267589e-06, 'epoch': 0.48}


 49%|████▊     | 25200/51760 [1:49:07<2:05:59,  3.51it/s]

{'loss': 1.0284, 'grad_norm': 1.9426971673965454, 'learning_rate': 5.455055867832368e-06, 'epoch': 0.49}


 49%|████▉     | 25300/51760 [1:49:32<1:44:10,  4.23it/s]

{'loss': 0.975, 'grad_norm': 1.8916409015655518, 'learning_rate': 5.423890604400289e-06, 'epoch': 0.49}


 49%|████▉     | 25400/51760 [1:49:59<1:40:06,  4.39it/s]

{'loss': 0.9064, 'grad_norm': 0.9237309694290161, 'learning_rate': 5.392708744198231e-06, 'epoch': 0.49}


 49%|████▉     | 25500/51760 [1:50:25<1:53:47,  3.85it/s]

{'loss': 0.9984, 'grad_norm': 1.213965892791748, 'learning_rate': 5.361511508102888e-06, 'epoch': 0.49}


 49%|████▉     | 25600/51760 [1:50:49<1:53:43,  3.83it/s]

{'loss': 1.0019, 'grad_norm': 1.0070278644561768, 'learning_rate': 5.3303001175929814e-06, 'epoch': 0.49}


 50%|████▉     | 25700/51760 [1:51:16<1:43:24,  4.20it/s]

{'loss': 0.9753, 'grad_norm': 4.9146504402160645, 'learning_rate': 5.299075794701419e-06, 'epoch': 0.5}


 50%|████▉     | 25800/51760 [1:51:43<2:03:24,  3.51it/s]

{'loss': 1.0865, 'grad_norm': 2.6672005653381348, 'learning_rate': 5.267839761967459e-06, 'epoch': 0.5}


 50%|█████     | 25900/51760 [1:52:09<1:55:06,  3.74it/s]

{'loss': 0.9565, 'grad_norm': 0.9847477078437805, 'learning_rate': 5.23659324238884e-06, 'epoch': 0.5}


 50%|█████     | 26000/51760 [1:52:34<1:52:52,  3.80it/s]

{'loss': 1.0571, 'grad_norm': 0.8034130334854126, 'learning_rate': 5.205337459373895e-06, 'epoch': 0.5}


 50%|█████     | 26100/51760 [1:53:00<1:43:14,  4.14it/s]

{'loss': 0.9917, 'grad_norm': 7.125938415527344, 'learning_rate': 5.174073636693653e-06, 'epoch': 0.5}


 51%|█████     | 26200/51760 [1:53:25<1:35:24,  4.47it/s]

{'loss': 1.0787, 'grad_norm': 4.790232181549072, 'learning_rate': 5.142802998433926e-06, 'epoch': 0.51}


 51%|█████     | 26300/51760 [1:53:52<1:44:47,  4.05it/s]

{'loss': 1.0232, 'grad_norm': 1.4608913660049438, 'learning_rate': 5.111526768947375e-06, 'epoch': 0.51}


 51%|█████     | 26400/51760 [1:54:17<1:48:17,  3.90it/s]

{'loss': 1.1303, 'grad_norm': 1.457549810409546, 'learning_rate': 5.080246172805579e-06, 'epoch': 0.51}


 51%|█████     | 26500/51760 [1:54:43<1:48:35,  3.88it/s]

{'loss': 1.0014, 'grad_norm': 1.8251758813858032, 'learning_rate': 5.0489624347510855e-06, 'epoch': 0.51}


 51%|█████▏    | 26600/51760 [1:55:08<1:41:01,  4.15it/s]

{'loss': 1.0209, 'grad_norm': 8.395846366882324, 'learning_rate': 5.017676779649463e-06, 'epoch': 0.51}


 52%|█████▏    | 26701/51760 [1:55:34<1:40:48,  4.14it/s]

{'loss': 1.0197, 'grad_norm': 2.990328311920166, 'learning_rate': 4.986390432441331e-06, 'epoch': 0.52}


 52%|█████▏    | 26800/51760 [1:55:59<1:37:54,  4.25it/s]

{'loss': 0.9949, 'grad_norm': 1.6082499027252197, 'learning_rate': 4.955104618094414e-06, 'epoch': 0.52}


 52%|█████▏    | 26900/51760 [1:56:26<1:44:43,  3.96it/s]

{'loss': 1.0117, 'grad_norm': 0.1055680438876152, 'learning_rate': 4.92382056155557e-06, 'epoch': 0.52}


 52%|█████▏    | 27000/51760 [1:56:52<1:39:33,  4.14it/s]

{'loss': 0.9259, 'grad_norm': 6.791496276855469, 'learning_rate': 4.8925394877028365e-06, 'epoch': 0.52}


 52%|█████▏    | 27100/51760 [1:57:18<1:49:08,  3.77it/s]

{'loss': 0.9841, 'grad_norm': 2.371953248977661, 'learning_rate': 4.861262621297461e-06, 'epoch': 0.52}


 53%|█████▎    | 27200/51760 [1:57:45<1:41:55,  4.02it/s]

{'loss': 0.9406, 'grad_norm': 7.292830467224121, 'learning_rate': 4.829991186935964e-06, 'epoch': 0.53}


 53%|█████▎    | 27300/51760 [1:58:11<1:45:27,  3.87it/s]

{'loss': 1.0704, 'grad_norm': 3.8854730129241943, 'learning_rate': 4.798726409002174e-06, 'epoch': 0.53}


 53%|█████▎    | 27400/51760 [1:58:37<1:57:47,  3.45it/s]

{'loss': 0.9782, 'grad_norm': 1.1271514892578125, 'learning_rate': 4.767469511619307e-06, 'epoch': 0.53}


 53%|█████▎    | 27500/51760 [1:59:06<1:40:51,  4.01it/s]

{'loss': 0.9921, 'grad_norm': 1.6435314416885376, 'learning_rate': 4.736221718602022e-06, 'epoch': 0.53}


 53%|█████▎    | 27600/51760 [1:59:33<1:36:37,  4.17it/s]

{'loss': 1.0739, 'grad_norm': 1.3990483283996582, 'learning_rate': 4.704984253408511e-06, 'epoch': 0.53}


 54%|█████▎    | 27700/51760 [1:59:58<1:37:48,  4.10it/s]

{'loss': 1.0571, 'grad_norm': 1.3448387384414673, 'learning_rate': 4.673758339092598e-06, 'epoch': 0.54}


 54%|█████▎    | 27800/51760 [2:00:24<1:32:37,  4.31it/s]

{'loss': 1.0441, 'grad_norm': 2.099968671798706, 'learning_rate': 4.642545198255854e-06, 'epoch': 0.54}


 54%|█████▍    | 27900/51760 [2:00:49<1:36:21,  4.13it/s]

{'loss': 1.0814, 'grad_norm': 12.917693138122559, 'learning_rate': 4.611346052999716e-06, 'epoch': 0.54}


 54%|█████▍    | 28000/51760 [2:01:16<1:40:13,  3.95it/s]

{'loss': 1.0307, 'grad_norm': 1.353088140487671, 'learning_rate': 4.580162124877653e-06, 'epoch': 0.54}


 54%|█████▍    | 28100/51760 [2:01:42<1:35:16,  4.14it/s]

{'loss': 1.0495, 'grad_norm': 2.5070159435272217, 'learning_rate': 4.548994634847329e-06, 'epoch': 0.54}


 54%|█████▍    | 28200/51760 [2:02:08<1:32:00,  4.27it/s]

{'loss': 1.0504, 'grad_norm': 2.1404833793640137, 'learning_rate': 4.517844803222802e-06, 'epoch': 0.54}


 55%|█████▍    | 28300/51760 [2:02:34<1:45:01,  3.72it/s]

{'loss': 1.0093, 'grad_norm': 0.9050385355949402, 'learning_rate': 4.486713849626738e-06, 'epoch': 0.55}


 55%|█████▍    | 28400/51760 [2:02:59<1:47:05,  3.64it/s]

{'loss': 0.9647, 'grad_norm': 1.6428682804107666, 'learning_rate': 4.455602992942669e-06, 'epoch': 0.55}


 55%|█████▌    | 28500/51760 [2:03:25<1:34:24,  4.11it/s]

{'loss': 1.0019, 'grad_norm': 1.822180986404419, 'learning_rate': 4.424513451267261e-06, 'epoch': 0.55}


 55%|█████▌    | 28600/51760 [2:03:52<1:39:18,  3.89it/s]

{'loss': 0.9768, 'grad_norm': 3.8061630725860596, 'learning_rate': 4.3934464418626255e-06, 'epoch': 0.55}


 55%|█████▌    | 28700/51760 [2:04:18<1:36:25,  3.99it/s]

{'loss': 1.1151, 'grad_norm': 1.3298888206481934, 'learning_rate': 4.362403181108659e-06, 'epoch': 0.55}


 56%|█████▌    | 28800/51760 [2:04:45<1:55:19,  3.32it/s]

{'loss': 1.0441, 'grad_norm': 4.089468479156494, 'learning_rate': 4.331384884455412e-06, 'epoch': 0.56}


 56%|█████▌    | 28900/51760 [2:05:13<1:34:42,  4.02it/s]

{'loss': 0.9336, 'grad_norm': 1.6008388996124268, 'learning_rate': 4.3003927663755115e-06, 'epoch': 0.56}


 56%|█████▌    | 29000/51760 [2:05:38<1:42:52,  3.69it/s]

{'loss': 1.0714, 'grad_norm': 1.7520503997802734, 'learning_rate': 4.269428040316602e-06, 'epoch': 0.56}


 56%|█████▌    | 29100/51760 [2:06:04<1:51:00,  3.40it/s]

{'loss': 1.0527, 'grad_norm': 0.95535808801651, 'learning_rate': 4.238491918653832e-06, 'epoch': 0.56}


 56%|█████▋    | 29200/51760 [2:06:31<1:39:17,  3.79it/s]

{'loss': 0.9862, 'grad_norm': 1.1760774850845337, 'learning_rate': 4.207585612642397e-06, 'epoch': 0.56}


 57%|█████▋    | 29300/51760 [2:06:57<1:28:46,  4.22it/s]

{'loss': 1.0408, 'grad_norm': 1.7912591695785522, 'learning_rate': 4.176710332370102e-06, 'epoch': 0.57}


 57%|█████▋    | 29400/51760 [2:07:23<1:35:25,  3.91it/s]

{'loss': 1.0888, 'grad_norm': 4.130012512207031, 'learning_rate': 4.1458672867099895e-06, 'epoch': 0.57}


 57%|█████▋    | 29500/51760 [2:07:48<1:28:58,  4.17it/s]

{'loss': 0.9515, 'grad_norm': 1.499596118927002, 'learning_rate': 4.115057683273007e-06, 'epoch': 0.57}


 57%|█████▋    | 29600/51760 [2:08:15<1:39:00,  3.73it/s]

{'loss': 1.0384, 'grad_norm': 1.106152057647705, 'learning_rate': 4.084282728360724e-06, 'epoch': 0.57}


 57%|█████▋    | 29700/51760 [2:08:41<1:33:37,  3.93it/s]

{'loss': 1.0242, 'grad_norm': 1.4173567295074463, 'learning_rate': 4.053543626918103e-06, 'epoch': 0.57}


 58%|█████▊    | 29800/51760 [2:09:07<1:30:02,  4.07it/s]

{'loss': 1.0086, 'grad_norm': 2.0022449493408203, 'learning_rate': 4.022841582486317e-06, 'epoch': 0.58}


 58%|█████▊    | 29900/51760 [2:09:34<1:38:44,  3.69it/s]

{'loss': 1.0173, 'grad_norm': 2.749650001525879, 'learning_rate': 3.992177797155633e-06, 'epoch': 0.58}


 58%|█████▊    | 30000/51760 [2:10:01<1:25:47,  4.23it/s]

{'loss': 1.0846, 'grad_norm': 4.172973155975342, 'learning_rate': 3.961553471518343e-06, 'epoch': 0.58}


 58%|█████▊    | 30100/51760 [2:10:28<1:28:20,  4.09it/s]

{'loss': 0.9943, 'grad_norm': 1.9797608852386475, 'learning_rate': 3.930969804621753e-06, 'epoch': 0.58}


 58%|█████▊    | 30200/51760 [2:10:55<1:38:52,  3.63it/s]

{'loss': 0.9854, 'grad_norm': 0.962766170501709, 'learning_rate': 3.900427993921244e-06, 'epoch': 0.58}


 59%|█████▊    | 30300/51760 [2:11:21<1:26:57,  4.11it/s]

{'loss': 0.9879, 'grad_norm': 1.7972067594528198, 'learning_rate': 3.869929235233385e-06, 'epoch': 0.59}


 59%|█████▊    | 30400/51760 [2:11:47<1:28:00,  4.05it/s]

{'loss': 1.0571, 'grad_norm': 0.1651601642370224, 'learning_rate': 3.8394747226891035e-06, 'epoch': 0.59}


 59%|█████▉    | 30500/51760 [2:12:14<1:38:02,  3.61it/s]

{'loss': 0.9717, 'grad_norm': 1.254401683807373, 'learning_rate': 3.8090656486869467e-06, 'epoch': 0.59}


 59%|█████▉    | 30600/51760 [2:12:39<1:39:45,  3.54it/s]

{'loss': 1.0278, 'grad_norm': 2.2876977920532227, 'learning_rate': 3.778703203846382e-06, 'epoch': 0.59}


 59%|█████▉    | 30700/51760 [2:13:05<1:27:30,  4.01it/s]

{'loss': 1.1277, 'grad_norm': 1.8324519395828247, 'learning_rate': 3.7483885769611885e-06, 'epoch': 0.59}


 60%|█████▉    | 30800/51760 [2:13:32<1:34:10,  3.71it/s]

{'loss': 1.0246, 'grad_norm': 3.4831743240356445, 'learning_rate': 3.7181229549529075e-06, 'epoch': 0.6}


 60%|█████▉    | 30900/51760 [2:13:58<1:27:44,  3.96it/s]

{'loss': 1.0083, 'grad_norm': 5.2533183097839355, 'learning_rate': 3.6879075228243717e-06, 'epoch': 0.6}


 60%|█████▉    | 31000/51760 [2:14:24<1:58:34,  2.92it/s]

{'loss': 1.0176, 'grad_norm': 0.9159656763076782, 'learning_rate': 3.6577434636133064e-06, 'epoch': 0.6}


 60%|██████    | 31100/51760 [2:14:50<1:26:31,  3.98it/s]

{'loss': 1.0353, 'grad_norm': 1.576634407043457, 'learning_rate': 3.627631958346013e-06, 'epoch': 0.6}


 60%|██████    | 31200/51760 [2:15:17<1:31:31,  3.74it/s]

{'loss': 1.0725, 'grad_norm': 17.588821411132812, 'learning_rate': 3.5975741859911255e-06, 'epoch': 0.6}


 60%|██████    | 31300/51760 [2:15:44<1:28:21,  3.86it/s]

{'loss': 0.9847, 'grad_norm': 2.9627249240875244, 'learning_rate': 3.567571323413449e-06, 'epoch': 0.6}


 61%|██████    | 31400/51760 [2:16:11<1:24:50,  4.00it/s]

{'loss': 1.0644, 'grad_norm': 4.007754802703857, 'learning_rate': 3.5376245453278833e-06, 'epoch': 0.61}


 61%|██████    | 31500/51760 [2:16:38<1:35:01,  3.55it/s]

{'loss': 1.0141, 'grad_norm': 3.6311755180358887, 'learning_rate': 3.5077350242534304e-06, 'epoch': 0.61}


 61%|██████    | 31600/51760 [2:17:05<1:27:02,  3.86it/s]

{'loss': 0.9429, 'grad_norm': 16.54464340209961, 'learning_rate': 3.477903930467279e-06, 'epoch': 0.61}


 61%|██████    | 31700/51760 [2:17:31<1:27:55,  3.80it/s]

{'loss': 0.9511, 'grad_norm': 5.134998321533203, 'learning_rate': 3.448132431958994e-06, 'epoch': 0.61}


 61%|██████▏   | 31800/51760 [2:17:58<1:22:15,  4.04it/s]

{'loss': 0.994, 'grad_norm': 2.7649660110473633, 'learning_rate': 3.418421694384778e-06, 'epoch': 0.61}


 62%|██████▏   | 31900/51760 [2:18:24<1:35:36,  3.46it/s]

{'loss': 0.9843, 'grad_norm': 4.957995891571045, 'learning_rate': 3.388772881021839e-06, 'epoch': 0.62}


 62%|██████▏   | 32000/51760 [2:18:51<1:31:59,  3.58it/s]

{'loss': 1.0928, 'grad_norm': 1.7972781658172607, 'learning_rate': 3.3591871527228337e-06, 'epoch': 0.62}


 62%|██████▏   | 32100/51760 [2:19:16<1:12:48,  4.50it/s]

{'loss': 1.0081, 'grad_norm': 3.209333658218384, 'learning_rate': 3.3296656678704248e-06, 'epoch': 0.62}


 62%|██████▏   | 32200/51760 [2:19:41<1:14:05,  4.40it/s]

{'loss': 0.9987, 'grad_norm': 7.2339677810668945, 'learning_rate': 3.300209582331926e-06, 'epoch': 0.62}


 62%|██████▏   | 32300/51760 [2:20:08<1:29:13,  3.63it/s]

{'loss': 1.0673, 'grad_norm': 1.5338337421417236, 'learning_rate': 3.270820049414042e-06, 'epoch': 0.62}


 63%|██████▎   | 32400/51760 [2:20:33<1:13:39,  4.38it/s]

{'loss': 0.9785, 'grad_norm': 4.366494655609131, 'learning_rate': 3.2414982198177152e-06, 'epoch': 0.63}


 63%|██████▎   | 32500/51760 [2:20:59<1:18:10,  4.11it/s]

{'loss': 0.968, 'grad_norm': 1.4684863090515137, 'learning_rate': 3.212245241593069e-06, 'epoch': 0.63}


 63%|██████▎   | 32600/51760 [2:21:24<1:14:41,  4.28it/s]

{'loss': 1.0161, 'grad_norm': 1.468862771987915, 'learning_rate': 3.1830622600944615e-06, 'epoch': 0.63}


 63%|██████▎   | 32700/51760 [2:21:53<1:24:42,  3.75it/s]

{'loss': 0.9573, 'grad_norm': 1.1187461614608765, 'learning_rate': 3.1539504179356388e-06, 'epoch': 0.63}


 63%|██████▎   | 32800/51760 [2:22:21<1:25:29,  3.70it/s]

{'loss': 1.1299, 'grad_norm': 6.84433126449585, 'learning_rate': 3.124910854944997e-06, 'epoch': 0.63}


 64%|██████▎   | 32900/51760 [2:22:48<1:14:07,  4.24it/s]

{'loss': 1.0003, 'grad_norm': 4.408555507659912, 'learning_rate': 3.095944708120955e-06, 'epoch': 0.64}


 64%|██████▍   | 33000/51760 [2:23:13<1:24:46,  3.69it/s]

{'loss': 0.9845, 'grad_norm': 7.08991003036499, 'learning_rate': 3.0670531115874403e-06, 'epoch': 0.64}


 64%|██████▍   | 33100/51760 [2:23:39<1:19:44,  3.90it/s]

{'loss': 1.0044, 'grad_norm': 8.197491645812988, 'learning_rate': 3.038237196549476e-06, 'epoch': 0.64}


 64%|██████▍   | 33200/51760 [2:24:05<1:18:03,  3.96it/s]

{'loss': 1.0562, 'grad_norm': 1.0349690914154053, 'learning_rate': 3.009498091248899e-06, 'epoch': 0.64}


 64%|██████▍   | 33300/51760 [2:24:31<1:14:31,  4.13it/s]

{'loss': 0.9354, 'grad_norm': 2.461928129196167, 'learning_rate': 2.9808369209201806e-06, 'epoch': 0.64}


 65%|██████▍   | 33400/51760 [2:24:57<1:25:02,  3.60it/s]

{'loss': 1.0534, 'grad_norm': 1.6092643737792969, 'learning_rate': 2.952254807746371e-06, 'epoch': 0.65}


 65%|██████▍   | 33500/51760 [2:25:23<1:21:00,  3.76it/s]

{'loss': 1.1075, 'grad_norm': 1.1288139820098877, 'learning_rate': 2.923752870815163e-06, 'epoch': 0.65}


 65%|██████▍   | 33600/51760 [2:25:49<1:12:24,  4.18it/s]

{'loss': 1.0487, 'grad_norm': 3.016512632369995, 'learning_rate': 2.8953322260750677e-06, 'epoch': 0.65}


 65%|██████▌   | 33700/51760 [2:26:16<1:15:26,  3.99it/s]

{'loss': 1.1221, 'grad_norm': 1.183940052986145, 'learning_rate': 2.866993986291741e-06, 'epoch': 0.65}


 65%|██████▌   | 33800/51760 [2:26:42<1:17:35,  3.86it/s]

{'loss': 1.05, 'grad_norm': 2.456423282623291, 'learning_rate': 2.8387392610043906e-06, 'epoch': 0.65}


 65%|██████▌   | 33900/51760 [2:27:09<1:27:33,  3.40it/s]

{'loss': 1.0802, 'grad_norm': 1.7280018329620361, 'learning_rate': 2.810569156482351e-06, 'epoch': 0.65}


 66%|██████▌   | 34000/51760 [2:27:35<1:26:50,  3.41it/s]

{'loss': 1.1019, 'grad_norm': 1.3057862520217896, 'learning_rate': 2.7824847756817645e-06, 'epoch': 0.66}


 66%|██████▌   | 34100/51760 [2:28:01<1:11:01,  4.14it/s]

{'loss': 0.9942, 'grad_norm': 4.1633100509643555, 'learning_rate': 2.754487218202394e-06, 'epoch': 0.66}


 66%|██████▌   | 34200/51760 [2:28:27<1:20:44,  3.62it/s]

{'loss': 0.9881, 'grad_norm': 3.9131269454956055, 'learning_rate': 2.7265775802445688e-06, 'epoch': 0.66}


 66%|██████▋   | 34300/51760 [2:28:53<1:20:42,  3.61it/s]

{'loss': 1.007, 'grad_norm': 2.2485945224761963, 'learning_rate': 2.6987569545662752e-06, 'epoch': 0.66}


 66%|██████▋   | 34400/51760 [2:29:19<1:12:22,  4.00it/s]

{'loss': 0.9716, 'grad_norm': 0.6955282092094421, 'learning_rate': 2.671026430440357e-06, 'epoch': 0.66}


 67%|██████▋   | 34500/51760 [2:29:45<1:21:44,  3.52it/s]

{'loss': 1.053, 'grad_norm': 0.9863582253456116, 'learning_rate': 2.643387093611874e-06, 'epoch': 0.67}


 67%|██████▋   | 34600/51760 [2:30:12<1:16:45,  3.73it/s]

{'loss': 1.0627, 'grad_norm': 3.5582499504089355, 'learning_rate': 2.615840026255597e-06, 'epoch': 0.67}


 67%|██████▋   | 34700/51760 [2:30:38<1:11:37,  3.97it/s]

{'loss': 1.0736, 'grad_norm': 1.4011884927749634, 'learning_rate': 2.588386306933624e-06, 'epoch': 0.67}


 67%|██████▋   | 34800/51760 [2:31:06<1:08:24,  4.13it/s]

{'loss': 1.0269, 'grad_norm': 1.7797966003417969, 'learning_rate': 2.561027010553157e-06, 'epoch': 0.67}


 67%|██████▋   | 34900/51760 [2:31:33<1:19:28,  3.54it/s]

{'loss': 1.0203, 'grad_norm': 2.401397228240967, 'learning_rate': 2.5337632083244235e-06, 'epoch': 0.67}


 68%|██████▊   | 35000/51760 [2:31:59<1:07:05,  4.16it/s]

{'loss': 1.0208, 'grad_norm': 1.3825109004974365, 'learning_rate': 2.506595967718719e-06, 'epoch': 0.68}


 68%|██████▊   | 35100/51760 [2:32:26<1:07:54,  4.09it/s]

{'loss': 1.0011, 'grad_norm': 3.244091272354126, 'learning_rate': 2.479526352426631e-06, 'epoch': 0.68}


 68%|██████▊   | 35200/51760 [2:32:51<1:02:10,  4.44it/s]

{'loss': 1.0353, 'grad_norm': 3.5823116302490234, 'learning_rate': 2.4525554223163684e-06, 'epoch': 0.68}


 68%|██████▊   | 35300/51760 [2:33:18<1:21:18,  3.37it/s]

{'loss': 1.0067, 'grad_norm': 1.946372389793396, 'learning_rate': 2.425684233392284e-06, 'epoch': 0.68}


 68%|██████▊   | 35400/51760 [2:33:44<1:09:17,  3.93it/s]

{'loss': 1.0785, 'grad_norm': 1.5576248168945312, 'learning_rate': 2.3989138377535253e-06, 'epoch': 0.68}


 69%|██████▊   | 35500/51760 [2:34:09<1:05:42,  4.12it/s]

{'loss': 1.1083, 'grad_norm': 4.356563091278076, 'learning_rate': 2.3722452835528304e-06, 'epoch': 0.69}


 69%|██████▉   | 35600/51760 [2:34:36<1:06:51,  4.03it/s]

{'loss': 0.9912, 'grad_norm': 4.038865566253662, 'learning_rate': 2.345679614955496e-06, 'epoch': 0.69}


 69%|██████▉   | 35700/51760 [2:35:01<1:03:49,  4.19it/s]

{'loss': 1.0144, 'grad_norm': 0.011623364873230457, 'learning_rate': 2.319217872098502e-06, 'epoch': 0.69}


 69%|██████▉   | 35800/51760 [2:35:27<1:06:03,  4.03it/s]

{'loss': 0.9816, 'grad_norm': 2.524163246154785, 'learning_rate': 2.2928610910497713e-06, 'epoch': 0.69}


 69%|██████▉   | 35900/51760 [2:35:55<1:10:30,  3.75it/s]

{'loss': 0.97, 'grad_norm': 0.9091723561286926, 'learning_rate': 2.266610303767618e-06, 'epoch': 0.69}


 70%|██████▉   | 36000/51760 [2:36:21<1:06:00,  3.98it/s]

{'loss': 0.9964, 'grad_norm': 1.0854133367538452, 'learning_rate': 2.240466538060334e-06, 'epoch': 0.7}


 70%|██████▉   | 36100/51760 [2:36:47<1:04:05,  4.07it/s]

{'loss': 1.1076, 'grad_norm': 6.071164608001709, 'learning_rate': 2.2144308175459477e-06, 'epoch': 0.7}


 70%|██████▉   | 36200/51760 [2:37:13<1:35:34,  2.71it/s]

{'loss': 1.0107, 'grad_norm': 0.9266192317008972, 'learning_rate': 2.188504161612155e-06, 'epoch': 0.7}


 70%|███████   | 36300/51760 [2:37:39<1:05:48,  3.92it/s]

{'loss': 1.0073, 'grad_norm': 1.8114280700683594, 'learning_rate': 2.162687585376393e-06, 'epoch': 0.7}


 70%|███████   | 36400/51760 [2:38:05<1:01:18,  4.18it/s]

{'loss': 1.0107, 'grad_norm': 8.411517143249512, 'learning_rate': 2.1369820996461015e-06, 'epoch': 0.7}


 71%|███████   | 36500/51760 [2:38:32<1:09:13,  3.67it/s]

{'loss': 1.0597, 'grad_norm': 1.3471770286560059, 'learning_rate': 2.1113887108791526e-06, 'epoch': 0.71}


 71%|███████   | 36600/51760 [2:38:58<1:03:36,  3.97it/s]

{'loss': 1.0368, 'grad_norm': 5.996630668640137, 'learning_rate': 2.0859084211444316e-06, 'epoch': 0.71}


 71%|███████   | 36700/51760 [2:39:25<57:30,  4.36it/s]  

{'loss': 0.9261, 'grad_norm': 1.1311869621276855, 'learning_rate': 2.060542228082609e-06, 'epoch': 0.71}


 71%|███████   | 36800/51760 [2:39:51<59:35,  4.18it/s]  

{'loss': 1.0551, 'grad_norm': 4.049074649810791, 'learning_rate': 2.035291124867088e-06, 'epoch': 0.71}


 71%|███████▏  | 36900/51760 [2:40:17<1:00:51,  4.07it/s]

{'loss': 1.0069, 'grad_norm': 1.388504147529602, 'learning_rate': 2.010156100165096e-06, 'epoch': 0.71}


 71%|███████▏  | 37000/51760 [2:40:44<1:03:40,  3.86it/s]

{'loss': 0.979, 'grad_norm': 2.4140453338623047, 'learning_rate': 1.985138138099002e-06, 'epoch': 0.71}


 72%|███████▏  | 37100/51760 [2:41:10<1:16:15,  3.20it/s]

{'loss': 1.0736, 'grad_norm': 2.379972219467163, 'learning_rate': 1.9602382182077643e-06, 'epoch': 0.72}


 72%|███████▏  | 37200/51760 [2:41:36<1:01:28,  3.95it/s]

{'loss': 1.0149, 'grad_norm': 1.281740665435791, 'learning_rate': 1.935457315408585e-06, 'epoch': 0.72}


 72%|███████▏  | 37300/51760 [2:42:03<1:02:23,  3.86it/s]

{'loss': 1.0054, 'grad_norm': 2.2052364349365234, 'learning_rate': 1.9107963999587453e-06, 'epoch': 0.72}


 72%|███████▏  | 37400/51760 [2:42:28<1:05:51,  3.63it/s]

{'loss': 1.0778, 'grad_norm': 1.4697750806808472, 'learning_rate': 1.8862564374176045e-06, 'epoch': 0.72}


 72%|███████▏  | 37500/51760 [2:42:54<1:05:46,  3.61it/s]

{'loss': 0.9777, 'grad_norm': 2.3314318656921387, 'learning_rate': 1.8618383886088016e-06, 'epoch': 0.72}


 73%|███████▎  | 37600/51760 [2:43:19<54:43,  4.31it/s]  

{'loss': 1.0021, 'grad_norm': 10.278672218322754, 'learning_rate': 1.837543209582639e-06, 'epoch': 0.73}


 73%|███████▎  | 37700/51760 [2:43:47<54:03,  4.34it/s]  

{'loss': 0.9545, 'grad_norm': 1.587227702140808, 'learning_rate': 1.8133718515786424e-06, 'epoch': 0.73}


 73%|███████▎  | 37800/51760 [2:44:13<1:06:13,  3.51it/s]

{'loss': 1.0022, 'grad_norm': 1.116851568222046, 'learning_rate': 1.7893252609883194e-06, 'epoch': 0.73}


 73%|███████▎  | 37900/51760 [2:44:40<1:14:31,  3.10it/s]

{'loss': 0.9966, 'grad_norm': 4.65001106262207, 'learning_rate': 1.7654043793181108e-06, 'epoch': 0.73}


 73%|███████▎  | 38000/51760 [2:45:07<1:10:53,  3.23it/s]

{'loss': 1.0333, 'grad_norm': 1.7069121599197388, 'learning_rate': 1.7416101431525157e-06, 'epoch': 0.73}


 74%|███████▎  | 38100/51760 [2:45:33<1:08:52,  3.31it/s]

{'loss': 0.9037, 'grad_norm': 2.1685941219329834, 'learning_rate': 1.7179434841174346e-06, 'epoch': 0.74}


 74%|███████▍  | 38200/51760 [2:46:00<1:05:27,  3.45it/s]

{'loss': 1.0434, 'grad_norm': 1.6731324195861816, 'learning_rate': 1.69440532884368e-06, 'epoch': 0.74}


 74%|███████▍  | 38300/51760 [2:46:26<59:05,  3.80it/s]  

{'loss': 1.1811, 'grad_norm': 1.981693983078003, 'learning_rate': 1.6709965989307036e-06, 'epoch': 0.74}


 74%|███████▍  | 38400/51760 [2:46:52<1:00:15,  3.70it/s]

{'loss': 1.0571, 'grad_norm': 1.87984299659729, 'learning_rate': 1.647718210910515e-06, 'epoch': 0.74}


 74%|███████▍  | 38500/51760 [2:47:19<1:02:42,  3.52it/s]

{'loss': 0.9792, 'grad_norm': 1.1879099607467651, 'learning_rate': 1.6245710762117828e-06, 'epoch': 0.74}


 75%|███████▍  | 38600/51760 [2:47:45<50:03,  4.38it/s]  

{'loss': 0.9437, 'grad_norm': 1.822140097618103, 'learning_rate': 1.6015561011241676e-06, 'epoch': 0.75}


 75%|███████▍  | 38700/51760 [2:48:12<1:03:36,  3.42it/s]

{'loss': 1.0042, 'grad_norm': 1.175479531288147, 'learning_rate': 1.578674186762823e-06, 'epoch': 0.75}


 75%|███████▍  | 38800/51760 [2:48:38<56:20,  3.83it/s]  

{'loss': 1.0855, 'grad_norm': 1.2139126062393188, 'learning_rate': 1.5559262290331183e-06, 'epoch': 0.75}


 75%|███████▌  | 38900/51760 [2:49:04<59:22,  3.61it/s]  

{'loss': 0.9807, 'grad_norm': 1.161425232887268, 'learning_rate': 1.533313118595568e-06, 'epoch': 0.75}


 75%|███████▌  | 39000/51760 [2:49:29<58:00,  3.67it/s]  

{'loss': 1.0138, 'grad_norm': 1.834883213043213, 'learning_rate': 1.510835740830946e-06, 'epoch': 0.75}


 76%|███████▌  | 39100/51760 [2:49:55<50:06,  4.21it/s]  

{'loss': 1.0646, 'grad_norm': 5.235470294952393, 'learning_rate': 1.4884949758056278e-06, 'epoch': 0.76}


 76%|███████▌  | 39200/51760 [2:50:21<48:35,  4.31it/s]  

{'loss': 0.9947, 'grad_norm': 5.514766216278076, 'learning_rate': 1.4662916982371373e-06, 'epoch': 0.76}


 76%|███████▌  | 39300/51760 [2:50:47<49:56,  4.16it/s]  

{'loss': 1.059, 'grad_norm': 6.190634250640869, 'learning_rate': 1.4442267774598878e-06, 'epoch': 0.76}


 76%|███████▌  | 39400/51760 [2:51:13<1:00:27,  3.41it/s]

{'loss': 0.9013, 'grad_norm': 0.8140642046928406, 'learning_rate': 1.4223010773911506e-06, 'epoch': 0.76}


 76%|███████▋  | 39500/51760 [2:51:41<1:11:21,  2.86it/s]

{'loss': 0.9527, 'grad_norm': 3.23517107963562, 'learning_rate': 1.4005154564972324e-06, 'epoch': 0.76}


 77%|███████▋  | 39600/51760 [2:52:07<47:14,  4.29it/s]  

{'loss': 1.1567, 'grad_norm': 10.010011672973633, 'learning_rate': 1.3788707677598574e-06, 'epoch': 0.77}


 77%|███████▋  | 39700/51760 [2:52:33<54:54,  3.66it/s]  

{'loss': 1.0554, 'grad_norm': 1.0868258476257324, 'learning_rate': 1.3573678586427707e-06, 'epoch': 0.77}


 77%|███████▋  | 39800/51760 [2:53:00<52:14,  3.82it/s]  

{'loss': 1.036, 'grad_norm': 1.6004326343536377, 'learning_rate': 1.3360075710585662e-06, 'epoch': 0.77}


 77%|███████▋  | 39900/51760 [2:53:26<1:15:24,  2.62it/s]

{'loss': 0.9466, 'grad_norm': 1.210710048675537, 'learning_rate': 1.3147907413357104e-06, 'epoch': 0.77}


 77%|███████▋  | 40000/51760 [2:53:51<55:19,  3.54it/s]  

{'loss': 1.057, 'grad_norm': 2.9320666790008545, 'learning_rate': 1.2937182001858034e-06, 'epoch': 0.77}


 77%|███████▋  | 40100/51760 [2:54:17<55:25,  3.51it/s]  

{'loss': 1.0443, 'grad_norm': 1.3337664604187012, 'learning_rate': 1.2727907726710548e-06, 'epoch': 0.77}


 78%|███████▊  | 40200/51760 [2:54:43<45:05,  4.27it/s]  

{'loss': 0.9553, 'grad_norm': 4.3681254386901855, 'learning_rate': 1.252009278171975e-06, 'epoch': 0.78}


 78%|███████▊  | 40300/51760 [2:55:10<50:56,  3.75it/s]  

{'loss': 1.0097, 'grad_norm': 1.5157228708267212, 'learning_rate': 1.2313745303553027e-06, 'epoch': 0.78}


 78%|███████▊  | 40400/51760 [2:55:36<49:25,  3.83it/s]  

{'loss': 1.0773, 'grad_norm': 1.9735466241836548, 'learning_rate': 1.2108873371421347e-06, 'epoch': 0.78}


 78%|███████▊  | 40500/51760 [2:56:03<45:26,  4.13it/s]  

{'loss': 0.9263, 'grad_norm': 6.618743896484375, 'learning_rate': 1.1905485006763019e-06, 'epoch': 0.78}


 78%|███████▊  | 40600/51760 [2:56:28<43:03,  4.32it/s]  

{'loss': 1.0692, 'grad_norm': 3.863175630569458, 'learning_rate': 1.170358817292962e-06, 'epoch': 0.78}


 79%|███████▊  | 40700/51760 [2:56:53<48:41,  3.79it/s]  

{'loss': 1.1078, 'grad_norm': 4.455684661865234, 'learning_rate': 1.150319077487415e-06, 'epoch': 0.79}


 79%|███████▉  | 40800/51760 [2:57:19<45:12,  4.04it/s]

{'loss': 1.0765, 'grad_norm': 3.039785385131836, 'learning_rate': 1.1304300658841566e-06, 'epoch': 0.79}


 79%|███████▉  | 40900/51760 [2:57:46<43:36,  4.15it/s]  

{'loss': 1.0107, 'grad_norm': 2.5463619232177734, 'learning_rate': 1.1106925612061598e-06, 'epoch': 0.79}


 79%|███████▉  | 41000/51760 [2:58:11<41:35,  4.31it/s]

{'loss': 1.0385, 'grad_norm': 2.2208902835845947, 'learning_rate': 1.091107336244377e-06, 'epoch': 0.79}


 79%|███████▉  | 41100/51760 [2:58:38<52:45,  3.37it/s]  

{'loss': 1.0659, 'grad_norm': 1.550183653831482, 'learning_rate': 1.0716751578274936e-06, 'epoch': 0.79}


 80%|███████▉  | 41200/51760 [2:59:04<46:46,  3.76it/s]  

{'loss': 1.0825, 'grad_norm': 1.143117070198059, 'learning_rate': 1.0523967867918943e-06, 'epoch': 0.8}


 80%|███████▉  | 41300/51760 [2:59:30<44:00,  3.96it/s]

{'loss': 1.0111, 'grad_norm': 1.7086005210876465, 'learning_rate': 1.0332729779518768e-06, 'epoch': 0.8}


 80%|███████▉  | 41400/51760 [2:59:55<53:22,  3.24it/s]  

{'loss': 1.0219, 'grad_norm': 1.1477464437484741, 'learning_rate': 1.0143044800701042e-06, 'epoch': 0.8}


 80%|████████  | 41500/51760 [3:00:22<42:40,  4.01it/s]  

{'loss': 1.0209, 'grad_norm': 29.76033592224121, 'learning_rate': 9.954920358282788e-07, 'epoch': 0.8}


 80%|████████  | 41600/51760 [3:00:47<40:59,  4.13it/s]

{'loss': 0.9575, 'grad_norm': 1.2489873170852661, 'learning_rate': 9.768363817980686e-07, 'epoch': 0.8}


 81%|████████  | 41700/51760 [3:01:13<37:40,  4.45it/s]  

{'loss': 1.0011, 'grad_norm': 3.27823543548584, 'learning_rate': 9.583382484122694e-07, 'epoch': 0.81}


 81%|████████  | 41800/51760 [3:01:39<51:05,  3.25it/s]  

{'loss': 1.0107, 'grad_norm': 0.7061505913734436, 'learning_rate': 9.399983599362001e-07, 'epoch': 0.81}


 81%|████████  | 41900/51760 [3:02:06<43:40,  3.76it/s]  

{'loss': 1.0427, 'grad_norm': 1.3357232809066772, 'learning_rate': 9.218174344393549e-07, 'epoch': 0.81}


 81%|████████  | 42000/51760 [3:02:32<46:31,  3.50it/s]

{'loss': 1.0751, 'grad_norm': 1.8556671142578125, 'learning_rate': 9.037961837672782e-07, 'epoch': 0.81}


 81%|████████▏ | 42100/51760 [3:02:59<38:36,  4.17it/s]  

{'loss': 1.0299, 'grad_norm': 3.0922751426696777, 'learning_rate': 8.859353135136966e-07, 'epoch': 0.81}


 82%|████████▏ | 42200/51760 [3:03:27<45:29,  3.50it/s]  

{'loss': 1.0386, 'grad_norm': 0.9708296060562134, 'learning_rate': 8.682355229928995e-07, 'epoch': 0.82}


 82%|████████▏ | 42300/51760 [3:03:53<37:02,  4.26it/s]  

{'loss': 1.0175, 'grad_norm': 3.6432723999023438, 'learning_rate': 8.506975052123473e-07, 'epoch': 0.82}


 82%|████████▏ | 42400/51760 [3:04:19<44:53,  3.48it/s]

{'loss': 1.0549, 'grad_norm': 3.3166933059692383, 'learning_rate': 8.333219468455434e-07, 'epoch': 0.82}


 82%|████████▏ | 42500/51760 [3:04:46<38:40,  3.99it/s]  

{'loss': 1.0306, 'grad_norm': 1.7852756977081299, 'learning_rate': 8.161095282051523e-07, 'epoch': 0.82}


 82%|████████▏ | 42600/51760 [3:05:11<41:16,  3.70it/s]

{'loss': 1.0421, 'grad_norm': 1.6342917680740356, 'learning_rate': 7.990609232163554e-07, 'epoch': 0.82}


 82%|████████▏ | 42700/51760 [3:05:38<39:47,  3.79it/s]  

{'loss': 1.0072, 'grad_norm': 1.148898959159851, 'learning_rate': 7.821767993904689e-07, 'epoch': 0.82}


 83%|████████▎ | 42800/51760 [3:06:04<41:53,  3.57it/s]

{'loss': 1.1895, 'grad_norm': 1.1192970275878906, 'learning_rate': 7.654578177988098e-07, 'epoch': 0.83}


 83%|████████▎ | 42900/51760 [3:06:31<43:44,  3.38it/s]

{'loss': 1.0378, 'grad_norm': 2.1519417762756348, 'learning_rate': 7.489046330468086e-07, 'epoch': 0.83}


 83%|████████▎ | 43000/51760 [3:06:57<36:23,  4.01it/s]  

{'loss': 0.9877, 'grad_norm': 2.0655503273010254, 'learning_rate': 7.325178932483795e-07, 'epoch': 0.83}


 83%|████████▎ | 43100/51760 [3:07:22<42:39,  3.38it/s]

{'loss': 0.9162, 'grad_norm': 1.8859999179840088, 'learning_rate': 7.162982400005525e-07, 'epoch': 0.83}


 83%|████████▎ | 43200/51760 [3:07:49<43:00,  3.32it/s]

{'loss': 1.0284, 'grad_norm': 2.056514024734497, 'learning_rate': 7.0024630835834e-07, 'epoch': 0.83}


 84%|████████▎ | 43300/51760 [3:08:14<35:05,  4.02it/s]

{'loss': 0.9196, 'grad_norm': 30.519590377807617, 'learning_rate': 6.843627268098818e-07, 'epoch': 0.84}


 84%|████████▍ | 43400/51760 [3:08:41<33:27,  4.16it/s]  

{'loss': 1.0115, 'grad_norm': 1.5536514520645142, 'learning_rate': 6.686481172518339e-07, 'epoch': 0.84}


 84%|████████▍ | 43500/51760 [3:09:08<35:16,  3.90it/s]

{'loss': 1.0485, 'grad_norm': 1.1850569248199463, 'learning_rate': 6.531030949650185e-07, 'epoch': 0.84}


 84%|████████▍ | 43600/51760 [3:09:33<34:35,  3.93it/s]

{'loss': 1.0379, 'grad_norm': 3.8468868732452393, 'learning_rate': 6.377282685903391e-07, 'epoch': 0.84}


 84%|████████▍ | 43700/51760 [3:09:59<34:47,  3.86it/s]

{'loss': 0.9797, 'grad_norm': 1.7570520639419556, 'learning_rate': 6.225242401049408e-07, 'epoch': 0.84}


 85%|████████▍ | 43800/51760 [3:10:27<41:36,  3.19it/s]  

{'loss': 0.9363, 'grad_norm': 1.8832037448883057, 'learning_rate': 6.074916047986495e-07, 'epoch': 0.85}


 85%|████████▍ | 43900/51760 [3:10:54<30:32,  4.29it/s]

{'loss': 1.0784, 'grad_norm': 5.744569301605225, 'learning_rate': 5.926309512506595e-07, 'epoch': 0.85}


 85%|████████▌ | 44000/51760 [3:11:22<29:48,  4.34it/s]  

{'loss': 0.9436, 'grad_norm': 0.17352190613746643, 'learning_rate': 5.779428613064875e-07, 'epoch': 0.85}


 85%|████████▌ | 44100/51760 [3:11:47<34:58,  3.65it/s]

{'loss': 1.0148, 'grad_norm': 1.3255946636199951, 'learning_rate': 5.634279100551992e-07, 'epoch': 0.85}


 85%|████████▌ | 44200/51760 [3:12:14<32:00,  3.94it/s]  

{'loss': 0.9989, 'grad_norm': 1.2517842054367065, 'learning_rate': 5.490866658068817e-07, 'epoch': 0.85}


 86%|████████▌ | 44300/51760 [3:12:40<29:42,  4.18it/s]

{'loss': 1.0385, 'grad_norm': 1.393531322479248, 'learning_rate': 5.349196900703996e-07, 'epoch': 0.86}


 86%|████████▌ | 44400/51760 [3:13:06<46:05,  2.66it/s]

{'loss': 1.0602, 'grad_norm': 0.8043624758720398, 'learning_rate': 5.209275375314093e-07, 'epoch': 0.86}


 86%|████████▌ | 44500/51760 [3:13:32<34:30,  3.51it/s]

{'loss': 1.1254, 'grad_norm': 2.0598301887512207, 'learning_rate': 5.071107560306376e-07, 'epoch': 0.86}


 86%|████████▌ | 44600/51760 [3:13:58<27:52,  4.28it/s]

{'loss': 0.9862, 'grad_norm': 6.219912528991699, 'learning_rate': 4.934698865424348e-07, 'epoch': 0.86}


 86%|████████▋ | 44700/51760 [3:14:25<31:58,  3.68it/s]

{'loss': 0.9726, 'grad_norm': 2.2718966007232666, 'learning_rate': 4.80005463153595e-07, 'epoch': 0.86}


 87%|████████▋ | 44800/51760 [3:14:50<32:58,  3.52it/s]

{'loss': 0.9839, 'grad_norm': 1.1688590049743652, 'learning_rate': 4.6671801304243924e-07, 'epoch': 0.87}


 87%|████████▋ | 44900/51760 [3:15:16<27:37,  4.14it/s]

{'loss': 1.1108, 'grad_norm': 3.7154042720794678, 'learning_rate': 4.536080564581813e-07, 'epoch': 0.87}


 87%|████████▋ | 45000/51760 [3:15:43<27:30,  4.10it/s]

{'loss': 0.9371, 'grad_norm': 1.1740071773529053, 'learning_rate': 4.4067610670055327e-07, 'epoch': 0.87}


 87%|████████▋ | 45100/51760 [3:16:08<27:41,  4.01it/s]

{'loss': 0.9881, 'grad_norm': 3.0230062007904053, 'learning_rate': 4.2792267009970857e-07, 'epoch': 0.87}


 87%|████████▋ | 45200/51760 [3:16:34<32:34,  3.36it/s]

{'loss': 1.0304, 'grad_norm': 2.104018449783325, 'learning_rate': 4.1534824599640367e-07, 'epoch': 0.87}


 88%|████████▊ | 45300/51760 [3:17:00<31:10,  3.45it/s]

{'loss': 1.0745, 'grad_norm': 9.839277267456055, 'learning_rate': 4.0295332672243783e-07, 'epoch': 0.88}


 88%|████████▊ | 45400/51760 [3:17:28<31:34,  3.36it/s]

{'loss': 1.0225, 'grad_norm': 0.9382530450820923, 'learning_rate': 3.9073839758138057e-07, 'epoch': 0.88}


 88%|████████▊ | 45500/51760 [3:17:55<26:42,  3.91it/s]

{'loss': 1.0414, 'grad_norm': 1.8963223695755005, 'learning_rate': 3.787039368295753e-07, 'epoch': 0.88}


 88%|████████▊ | 45600/51760 [3:18:21<26:10,  3.92it/s]

{'loss': 0.9797, 'grad_norm': 1.6153417825698853, 'learning_rate': 3.668504156574049e-07, 'epoch': 0.88}


 88%|████████▊ | 45700/51760 [3:18:47<28:01,  3.60it/s]

{'loss': 0.9688, 'grad_norm': 1.1111197471618652, 'learning_rate': 3.5517829817084956e-07, 'epoch': 0.88}


 88%|████████▊ | 45800/51760 [3:19:13<24:40,  4.03it/s]

{'loss': 1.0205, 'grad_norm': 1.1733152866363525, 'learning_rate': 3.436880413733135e-07, 'epoch': 0.88}


 89%|████████▊ | 45900/51760 [3:19:39<25:14,  3.87it/s]

{'loss': 0.9729, 'grad_norm': 6.478654861450195, 'learning_rate': 3.3238009514773185e-07, 'epoch': 0.89}


 89%|████████▉ | 46000/51760 [3:20:06<22:01,  4.36it/s]

{'loss': 1.0838, 'grad_norm': 1.3294709920883179, 'learning_rate': 3.2125490223895386e-07, 'epoch': 0.89}


 89%|████████▉ | 46100/51760 [3:20:32<25:22,  3.72it/s]

{'loss': 1.0221, 'grad_norm': 2.50521183013916, 'learning_rate': 3.10312898236414e-07, 'epoch': 0.89}


 89%|████████▉ | 46200/51760 [3:20:58<24:44,  3.75it/s]

{'loss': 0.9446, 'grad_norm': 4.485610485076904, 'learning_rate': 2.995545115570686e-07, 'epoch': 0.89}


 89%|████████▉ | 46300/51760 [3:21:24<23:39,  3.85it/s]

{'loss': 0.9669, 'grad_norm': 15.080324172973633, 'learning_rate': 2.88980163428631e-07, 'epoch': 0.89}


 90%|████████▉ | 46400/51760 [3:21:50<20:26,  4.37it/s]

{'loss': 1.0671, 'grad_norm': 2.622666120529175, 'learning_rate': 2.7859026787307177e-07, 'epoch': 0.9}


 90%|████████▉ | 46500/51760 [3:22:16<27:00,  3.25it/s]

{'loss': 1.1524, 'grad_norm': 8.720173835754395, 'learning_rate': 2.683852316904106e-07, 'epoch': 0.9}


 90%|█████████ | 46600/51760 [3:22:42<24:58,  3.44it/s]

{'loss': 1.0033, 'grad_norm': 1.1078121662139893, 'learning_rate': 2.5836545444279106e-07, 'epoch': 0.9}


 90%|█████████ | 46700/51760 [3:23:08<21:26,  3.93it/s]

{'loss': 1.0245, 'grad_norm': 1.2744563817977905, 'learning_rate': 2.4853132843883296e-07, 'epoch': 0.9}


 90%|█████████ | 46800/51760 [3:23:35<23:54,  3.46it/s]

{'loss': 0.9879, 'grad_norm': 1.019021987915039, 'learning_rate': 2.3888323871827544e-07, 'epoch': 0.9}


 91%|█████████ | 46900/51760 [3:24:01<19:30,  4.15it/s]

{'loss': 0.9482, 'grad_norm': 2.877040386199951, 'learning_rate': 2.2942156303689734e-07, 'epoch': 0.91}


 91%|█████████ | 47000/51760 [3:24:29<22:09,  3.58it/s]

{'loss': 1.095, 'grad_norm': 1.822325587272644, 'learning_rate': 2.201466718517298e-07, 'epoch': 0.91}


 91%|█████████ | 47100/51760 [3:24:55<23:03,  3.37it/s]

{'loss': 1.0836, 'grad_norm': 2.9669108390808105, 'learning_rate': 2.1105892830655172e-07, 'epoch': 0.91}


 91%|█████████ | 47200/51760 [3:25:21<17:20,  4.38it/s]

{'loss': 1.0348, 'grad_norm': 3.5756402015686035, 'learning_rate': 2.0215868821766958e-07, 'epoch': 0.91}


 91%|█████████▏| 47300/51760 [3:25:48<20:51,  3.56it/s]

{'loss': 1.0116, 'grad_norm': 1.6568074226379395, 'learning_rate': 1.934463000599862e-07, 'epoch': 0.91}


 92%|█████████▏| 47400/51760 [3:26:14<21:25,  3.39it/s]

{'loss': 1.1113, 'grad_norm': 0.8650783896446228, 'learning_rate': 1.8492210495336004e-07, 'epoch': 0.92}


 92%|█████████▏| 47500/51760 [3:26:39<16:10,  4.39it/s]

{'loss': 1.0543, 'grad_norm': 2.8973071575164795, 'learning_rate': 1.7658643664924314e-07, 'epoch': 0.92}


 92%|█████████▏| 47600/51760 [3:27:05<16:04,  4.31it/s]

{'loss': 1.0139, 'grad_norm': 19.217933654785156, 'learning_rate': 1.6843962151761873e-07, 'epoch': 0.92}


 92%|█████████▏| 47700/51760 [3:27:31<16:10,  4.19it/s]

{'loss': 0.9822, 'grad_norm': 2.240084648132324, 'learning_rate': 1.6048197853422264e-07, 'epoch': 0.92}


 92%|█████████▏| 47800/51760 [3:28:00<19:03,  3.46it/s]

{'loss': 1.0547, 'grad_norm': 2.6977920532226562, 'learning_rate': 1.5271381926805052e-07, 'epoch': 0.92}


 93%|█████████▎| 47900/51760 [3:28:26<15:44,  4.08it/s]

{'loss': 1.0231, 'grad_norm': 3.557598352432251, 'learning_rate': 1.4513544786916077e-07, 'epoch': 0.93}


 93%|█████████▎| 48000/51760 [3:28:51<14:25,  4.34it/s]

{'loss': 1.062, 'grad_norm': 3.954167127609253, 'learning_rate': 1.3774716105676934e-07, 'epoch': 0.93}


 93%|█████████▎| 48100/51760 [3:29:17<17:51,  3.42it/s]

{'loss': 1.0409, 'grad_norm': 0.9619799852371216, 'learning_rate': 1.3054924810762537e-07, 'epoch': 0.93}


 93%|█████████▎| 48200/51760 [3:29:43<14:01,  4.23it/s]

{'loss': 1.0245, 'grad_norm': 8.051080703735352, 'learning_rate': 1.2354199084469164e-07, 'epoch': 0.93}


 93%|█████████▎| 48300/51760 [3:30:08<13:31,  4.26it/s]

{'loss': 0.9987, 'grad_norm': 1.2253694534301758, 'learning_rate': 1.1672566362610549e-07, 'epoch': 0.93}


 94%|█████████▎| 48400/51760 [3:30:35<16:52,  3.32it/s]

{'loss': 1.0994, 'grad_norm': 6.823174476623535, 'learning_rate': 1.1010053333443904e-07, 'epoch': 0.94}


 94%|█████████▎| 48500/51760 [3:31:00<13:09,  4.13it/s]

{'loss': 1.0363, 'grad_norm': 6.340661525726318, 'learning_rate': 1.0366685936625099e-07, 'epoch': 0.94}


 94%|█████████▍| 48600/51760 [3:31:26<15:14,  3.46it/s]

{'loss': 0.9759, 'grad_norm': 0.8999930024147034, 'learning_rate': 9.742489362192741e-08, 'epoch': 0.94}


 94%|█████████▍| 48700/51760 [3:31:55<13:35,  3.75it/s]

{'loss': 0.9723, 'grad_norm': 2.276137113571167, 'learning_rate': 9.137488049581922e-08, 'epoch': 0.94}


 94%|█████████▍| 48800/51760 [3:32:21<18:07,  2.72it/s]

{'loss': 1.0051, 'grad_norm': 0.729568600654602, 'learning_rate': 8.551705686667744e-08, 'epoch': 0.94}


 94%|█████████▍| 48900/51760 [3:32:48<11:20,  4.21it/s]

{'loss': 1.0165, 'grad_norm': 22.59227752685547, 'learning_rate': 7.985165208837297e-08, 'epoch': 0.94}


 95%|█████████▍| 49000/51760 [3:33:14<13:40,  3.36it/s]

{'loss': 1.0741, 'grad_norm': 2.1729414463043213, 'learning_rate': 7.437888798092041e-08, 'epoch': 0.95}


 95%|█████████▍| 49100/51760 [3:33:41<11:59,  3.70it/s]

{'loss': 1.0241, 'grad_norm': 1.172561526298523, 'learning_rate': 6.909897882179162e-08, 'epoch': 0.95}


 95%|█████████▌| 49200/51760 [3:34:07<10:21,  4.12it/s]

{'loss': 1.0293, 'grad_norm': 4.41267204284668, 'learning_rate': 6.401213133752637e-08, 'epoch': 0.95}


 95%|█████████▌| 49300/51760 [3:34:34<10:52,  3.77it/s]

{'loss': 0.9625, 'grad_norm': 4.060686111450195, 'learning_rate': 5.9118544695637714e-08, 'epoch': 0.95}


 95%|█████████▌| 49400/51760 [3:35:01<09:41,  4.06it/s]

{'loss': 1.1445, 'grad_norm': 12.253299713134766, 'learning_rate': 5.441841049681484e-08, 'epoch': 0.95}


 96%|█████████▌| 49500/51760 [3:35:27<08:50,  4.26it/s]

{'loss': 0.9751, 'grad_norm': 1.183219075202942, 'learning_rate': 4.99119127674208e-08, 'epoch': 0.96}


 96%|█████████▌| 49600/51760 [3:35:53<09:16,  3.88it/s]

{'loss': 1.0303, 'grad_norm': 1.0607444047927856, 'learning_rate': 4.559922795228655e-08, 'epoch': 0.96}


 96%|█████████▌| 49700/51760 [3:36:18<07:56,  4.32it/s]

{'loss': 0.9651, 'grad_norm': 5.761286735534668, 'learning_rate': 4.14805249078043e-08, 'epoch': 0.96}


 96%|█████████▌| 49800/51760 [3:36:43<07:31,  4.34it/s]

{'loss': 0.9583, 'grad_norm': 2.6757726669311523, 'learning_rate': 3.755596489531388e-08, 'epoch': 0.96}


 96%|█████████▋| 49900/51760 [3:37:09<08:31,  3.63it/s]

{'loss': 1.0111, 'grad_norm': 2.3563976287841797, 'learning_rate': 3.382570157479059e-08, 'epoch': 0.96}


 97%|█████████▋| 50000/51760 [3:37:36<07:22,  3.98it/s]

{'loss': 1.0491, 'grad_norm': 36.88207244873047, 'learning_rate': 3.028988099882835e-08, 'epoch': 0.97}


 97%|█████████▋| 50100/51760 [3:38:04<06:41,  4.14it/s]

{'loss': 0.9919, 'grad_norm': 3.51263165473938, 'learning_rate': 2.6948641606921454e-08, 'epoch': 0.97}


 97%|█████████▋| 50200/51760 [3:38:30<06:29,  4.00it/s]

{'loss': 0.9615, 'grad_norm': 1.3130837678909302, 'learning_rate': 2.380211422004397e-08, 'epoch': 0.97}


 97%|█████████▋| 50300/51760 [3:38:56<06:53,  3.53it/s]

{'loss': 1.0494, 'grad_norm': 1.4696288108825684, 'learning_rate': 2.0850422035526008e-08, 'epoch': 0.97}


 97%|█████████▋| 50400/51760 [3:39:22<06:23,  3.54it/s]

{'loss': 1.1022, 'grad_norm': 2.7734434604644775, 'learning_rate': 1.809368062223482e-08, 'epoch': 0.97}


 98%|█████████▊| 50500/51760 [3:39:48<04:51,  4.33it/s]

{'loss': 0.9305, 'grad_norm': 3.80133318901062, 'learning_rate': 1.553199791604343e-08, 'epoch': 0.98}


 98%|█████████▊| 50600/51760 [3:40:15<04:33,  4.24it/s]

{'loss': 1.0166, 'grad_norm': 3.3763935565948486, 'learning_rate': 1.3165474215610651e-08, 'epoch': 0.98}


 98%|█████████▊| 50700/51760 [3:40:41<04:19,  4.09it/s]

{'loss': 0.9368, 'grad_norm': 1.3210352659225464, 'learning_rate': 1.099420217844982e-08, 'epoch': 0.98}


 98%|█████████▊| 50800/51760 [3:41:07<03:48,  4.20it/s]

{'loss': 1.0635, 'grad_norm': 1.6391295194625854, 'learning_rate': 9.01826681730278e-09, 'epoch': 0.98}


 98%|█████████▊| 50900/51760 [3:41:33<03:34,  4.01it/s]

{'loss': 1.0498, 'grad_norm': 1.469282865524292, 'learning_rate': 7.237745496810889e-09, 'epoch': 0.98}


 99%|█████████▊| 51000/51760 [3:42:01<04:50,  2.62it/s]

{'loss': 1.1724, 'grad_norm': 1.8440059423446655, 'learning_rate': 5.652707930486889e-09, 'epoch': 0.99}


 99%|█████████▊| 51100/51760 [3:42:27<02:56,  3.74it/s]

{'loss': 0.9248, 'grad_norm': 8.19423770904541, 'learning_rate': 4.2632161779843085e-09, 'epoch': 0.99}


 99%|█████████▉| 51200/51760 [3:42:52<02:08,  4.34it/s]

{'loss': 1.0553, 'grad_norm': 3.0527663230895996, 'learning_rate': 3.0693246426660717e-09, 'epoch': 0.99}


 99%|█████████▉| 51300/51760 [3:43:18<01:49,  4.20it/s]

{'loss': 0.9243, 'grad_norm': 1.761596918106079, 'learning_rate': 2.0710800694789812e-09, 'epoch': 0.99}


 99%|█████████▉| 51400/51760 [3:43:44<01:27,  4.11it/s]

{'loss': 1.0468, 'grad_norm': 4.3272705078125, 'learning_rate': 1.2685215431190722e-09, 'epoch': 0.99}


 99%|█████████▉| 51500/51760 [3:44:09<01:04,  4.01it/s]

{'loss': 1.0356, 'grad_norm': 1.1022262573242188, 'learning_rate': 6.616804865028359e-10, 'epoch': 0.99}


100%|█████████▉| 51600/51760 [3:44:36<00:40,  3.96it/s]

{'loss': 0.9367, 'grad_norm': 2.332582473754883, 'learning_rate': 2.505806595382021e-10, 'epoch': 1.0}


100%|█████████▉| 51700/51760 [3:45:02<00:14,  4.08it/s]

{'loss': 1.0188, 'grad_norm': 3.8935835361480713, 'learning_rate': 3.523815819195253e-11, 'epoch': 1.0}


100%|██████████| 51760/51760 [3:45:18<00:00,  3.83it/s]

{'train_runtime': 13518.0159, 'train_samples_per_second': 3.829, 'train_steps_per_second': 3.829, 'train_loss': 1.0298691117229197, 'epoch': 1.0}



