## Set Up the Environment

In [1]:
import os

In [1]:
%pip install transformers datasets

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install tf-keras

Collecting tf-keras
  Downloading tf_keras-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting tensorflow<2.19,>=2.18 (from tf-keras)
  Using cached tensorflow-2.18.0-cp311-cp311-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow<2.19,>=2.18->tf-keras)
  Using cached tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl.metadata (4.9 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-intel==2.18.0->tensorflow<2.19,>=2.18->tf-keras)
  Using cached tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting keras>=3.5.0 (from tensorflow-intel==2.18.0->tensorflow<2.19,>=2.18->tf-keras)
  Using cached keras-3.6.0-py3-none-any.whl.metadata (5.8 kB)
Downloading tf_keras-2.18.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ---------------------------------------- 1.7/1.7 MB 13.3 MB/s eta 0:00:00
Using cached tensorflow-2.18.0-cp311-cp311-win_amd64.whl (7.5 kB)
Using cached tensorflow_intel-2.18.0-cp31


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
%pip install sentencepiece sacremoses

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl.metadata (8.3 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp311-cp311-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   --------------------------------------- 991.5/991.5 kB 15.5 MB/s eta 0:00:00
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 0.0/897.5 kB ? eta -:--:--
   --------------------------------------- 897.5/897.5 kB 19.9 MB/s eta 0:00:00
Installing collected packages: sentencepiece, sacremoses
Successfully installed sacremoses-0.1.1 sentencepiece-0.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
%pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-1.1.1-py3-none-any.whl.metadata (19 kB)
Collecting torch (from transformers[torch])
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch->transformers[torch])
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch->transformers[torch])
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading accelerate-1.1.1-py3-none-any.whl (333 kB)
Using cached torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
Using cached sympy-1.13.1-py3-none-any.whl (6.2 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, torch, accelerate
Successfully installed accelerate-1.1.1 mpmath-1.3.0 sympy-1.13.1 torch-2.5.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## Load the Dataset

In [5]:
from datasets import load_dataset

ds = load_dataset("wmt/wmt14", "de-en")
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 4508785
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 3003
    })
})


## Loading the Tokenizer

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset

# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)




## Preprocess the Data

In [7]:
# Preprocess data
def preprocess_function(examples):
    inputs = [ex["de"] for ex in examples["translation"]]
    targets = [ex["en"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = ds.map(preprocess_function, batched=True)

## Model and Training Arguments

In [8]:
batch_size = 16

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./opus-mt-de-en-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
)



## Initialize the Trainer and Fine-Tune

In [9]:
# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()

  trainer = Seq2SeqTrainer(


{'loss': 1.3394, 'grad_norm': 1.3751311302185059, 'learning_rate': 1.9964939673527327e-05, 'epoch': 0.0}


  0%|          | 1000/281800 [01:25<6:57:36, 11.21it/s]

{'loss': 0.883, 'grad_norm': 1.5851682424545288, 'learning_rate': 1.992945351312988e-05, 'epoch': 0.0}


  1%|          | 1500/281800 [02:11<6:57:35, 11.19it/s] 

{'loss': 0.8071, 'grad_norm': 1.452210545539856, 'learning_rate': 1.9893967352732433e-05, 'epoch': 0.01}


  1%|          | 2000/281800 [02:56<6:59:54, 11.11it/s] 

{'loss': 0.7606, 'grad_norm': 1.2534818649291992, 'learning_rate': 1.985848119233499e-05, 'epoch': 0.01}


  1%|          | 2500/281800 [03:42<7:13:56, 10.73it/s] 

{'loss': 0.7378, 'grad_norm': 1.5736395120620728, 'learning_rate': 1.9822995031937546e-05, 'epoch': 0.01}


  1%|          | 3000/281800 [04:28<6:48:56, 11.36it/s] 

{'loss': 0.7201, 'grad_norm': 1.341543436050415, 'learning_rate': 1.9787508871540102e-05, 'epoch': 0.01}


  1%|          | 3500/281800 [05:14<7:07:16, 10.86it/s] 

{'loss': 0.6889, 'grad_norm': 1.5074005126953125, 'learning_rate': 1.9752022711142655e-05, 'epoch': 0.01}


  1%|▏         | 4000/281800 [06:00<6:58:18, 11.07it/s] 

{'loss': 0.6922, 'grad_norm': 1.2115817070007324, 'learning_rate': 1.9716536550745212e-05, 'epoch': 0.01}


  2%|▏         | 4500/281800 [06:47<6:50:20, 11.26it/s] 

{'loss': 0.6638, 'grad_norm': 1.2172752618789673, 'learning_rate': 1.9681050390347768e-05, 'epoch': 0.02}


  2%|▏         | 5000/281800 [07:33<6:50:33, 11.24it/s] 

{'loss': 0.6635, 'grad_norm': 1.2077304124832153, 'learning_rate': 1.964556422995032e-05, 'epoch': 0.02}


  2%|▏         | 5500/281800 [08:19<6:47:16, 11.31it/s] 

{'loss': 0.6533, 'grad_norm': 1.3467170000076294, 'learning_rate': 1.9610078069552878e-05, 'epoch': 0.02}


  2%|▏         | 6000/281800 [09:05<6:59:46, 10.95it/s] 

{'loss': 0.6512, 'grad_norm': 1.155332326889038, 'learning_rate': 1.957459190915543e-05, 'epoch': 0.02}


  2%|▏         | 6500/281800 [09:50<6:47:40, 11.25it/s] 

{'loss': 0.6363, 'grad_norm': 1.520063877105713, 'learning_rate': 1.9539105748757983e-05, 'epoch': 0.02}


  2%|▏         | 7000/281800 [10:37<6:43:14, 11.36it/s] 

{'loss': 0.6293, 'grad_norm': 1.400255799293518, 'learning_rate': 1.950361958836054e-05, 'epoch': 0.02}


  3%|▎         | 7500/281800 [11:22<6:31:08, 11.69it/s] 

{'loss': 0.6362, 'grad_norm': 1.6213574409484863, 'learning_rate': 1.9468133427963096e-05, 'epoch': 0.03}


  3%|▎         | 8000/281800 [12:08<6:52:09, 11.07it/s] 

{'loss': 0.6291, 'grad_norm': 1.2142205238342285, 'learning_rate': 1.9432647267565653e-05, 'epoch': 0.03}


  3%|▎         | 8500/281800 [12:54<6:20:49, 11.96it/s] 

{'loss': 0.6301, 'grad_norm': 1.27955961227417, 'learning_rate': 1.9397161107168206e-05, 'epoch': 0.03}


  3%|▎         | 9000/281800 [13:40<6:37:30, 11.44it/s] 

{'loss': 0.6204, 'grad_norm': 1.4589385986328125, 'learning_rate': 1.9361674946770762e-05, 'epoch': 0.03}


  3%|▎         | 9500/281800 [14:26<6:22:14, 11.87it/s] 

{'loss': 0.6151, 'grad_norm': 1.1426520347595215, 'learning_rate': 1.9326188786373318e-05, 'epoch': 0.03}


  4%|▎         | 10000/281800 [15:12<6:45:44, 11.16it/s]

{'loss': 0.6189, 'grad_norm': 1.5338670015335083, 'learning_rate': 1.929070262597587e-05, 'epoch': 0.04}


  4%|▎         | 10500/281800 [15:58<6:49:44, 11.04it/s] 

{'loss': 0.6159, 'grad_norm': 1.100847601890564, 'learning_rate': 1.9255216465578424e-05, 'epoch': 0.04}


  4%|▍         | 11000/281800 [16:44<6:37:01, 11.37it/s] 

{'loss': 0.6007, 'grad_norm': 1.4344757795333862, 'learning_rate': 1.921973030518098e-05, 'epoch': 0.04}


  4%|▍         | 11500/281800 [17:29<7:01:13, 10.69it/s] 

{'loss': 0.6017, 'grad_norm': 1.2453420162200928, 'learning_rate': 1.9184244144783537e-05, 'epoch': 0.04}


  4%|▍         | 12000/281800 [18:16<6:29:32, 11.54it/s] 

{'loss': 0.6093, 'grad_norm': 1.1844966411590576, 'learning_rate': 1.914875798438609e-05, 'epoch': 0.04}


  4%|▍         | 12500/281800 [19:02<6:17:26, 11.89it/s] 

{'loss': 0.5991, 'grad_norm': 1.2222621440887451, 'learning_rate': 1.9113271823988646e-05, 'epoch': 0.04}


  5%|▍         | 13000/281800 [19:47<6:38:49, 11.23it/s] 

{'loss': 0.5965, 'grad_norm': 1.550661325454712, 'learning_rate': 1.9077785663591203e-05, 'epoch': 0.05}


  5%|▍         | 13500/281800 [20:34<6:56:21, 10.74it/s] 

{'loss': 0.5914, 'grad_norm': 1.1469810009002686, 'learning_rate': 1.9042299503193756e-05, 'epoch': 0.05}


  5%|▍         | 14000/281800 [21:19<6:31:55, 11.39it/s] 

{'loss': 0.6054, 'grad_norm': 1.2493983507156372, 'learning_rate': 1.9006813342796312e-05, 'epoch': 0.05}


  5%|▌         | 14500/281800 [22:06<6:41:18, 11.10it/s] 

{'loss': 0.5921, 'grad_norm': 1.1859700679779053, 'learning_rate': 1.8971327182398865e-05, 'epoch': 0.05}


  5%|▌         | 15000/281800 [22:51<6:40:38, 11.10it/s] 

{'loss': 0.5935, 'grad_norm': 1.0783170461654663, 'learning_rate': 1.893584102200142e-05, 'epoch': 0.05}


  6%|▌         | 15500/281800 [23:37<6:34:41, 11.25it/s] 

{'loss': 0.5784, 'grad_norm': 1.3253296613693237, 'learning_rate': 1.8900354861603974e-05, 'epoch': 0.06}


  6%|▌         | 16000/281800 [24:23<6:32:16, 11.29it/s] 

{'loss': 0.5741, 'grad_norm': 1.2235878705978394, 'learning_rate': 1.886486870120653e-05, 'epoch': 0.06}


  6%|▌         | 16500/281800 [25:09<6:29:04, 11.36it/s] 

{'loss': 0.5768, 'grad_norm': 1.1081430912017822, 'learning_rate': 1.882945351312988e-05, 'epoch': 0.06}


  6%|▌         | 17000/281800 [25:54<6:30:10, 11.31it/s] 

{'loss': 0.5695, 'grad_norm': 1.3864309787750244, 'learning_rate': 1.879403832505323e-05, 'epoch': 0.06}


  6%|▌         | 17500/281800 [26:40<6:35:12, 11.15it/s] 

{'loss': 0.5866, 'grad_norm': 1.2692115306854248, 'learning_rate': 1.8758552164655784e-05, 'epoch': 0.06}


  6%|▋         | 18000/281800 [27:25<6:29:01, 11.30it/s] 

{'loss': 0.5736, 'grad_norm': 1.1779206991195679, 'learning_rate': 1.872306600425834e-05, 'epoch': 0.06}


  7%|▋         | 18500/281800 [28:11<6:27:10, 11.33it/s] 

{'loss': 0.5844, 'grad_norm': 1.2236034870147705, 'learning_rate': 1.8687579843860897e-05, 'epoch': 0.07}


  7%|▋         | 19000/281800 [28:56<6:35:00, 11.09it/s] 

{'loss': 0.5788, 'grad_norm': 1.0285475254058838, 'learning_rate': 1.865209368346345e-05, 'epoch': 0.07}


  7%|▋         | 19500/281800 [29:42<6:30:29, 11.20it/s] 

{'loss': 0.5682, 'grad_norm': 1.1870031356811523, 'learning_rate': 1.8616607523066006e-05, 'epoch': 0.07}


  7%|▋         | 20000/281800 [30:28<6:36:14, 11.01it/s] 

{'loss': 0.5729, 'grad_norm': 1.160744547843933, 'learning_rate': 1.8581121362668562e-05, 'epoch': 0.07}


  7%|▋         | 20500/281800 [31:13<6:30:07, 11.16it/s] 

{'loss': 0.569, 'grad_norm': 1.4235965013504028, 'learning_rate': 1.8545635202271115e-05, 'epoch': 0.07}


  7%|▋         | 21000/281800 [31:59<6:27:01, 11.23it/s] 

{'loss': 0.5723, 'grad_norm': 1.0654003620147705, 'learning_rate': 1.8510220014194465e-05, 'epoch': 0.07}


  8%|▊         | 21500/281800 [32:44<6:28:23, 11.17it/s] 

{'loss': 0.5659, 'grad_norm': 1.4658315181732178, 'learning_rate': 1.847473385379702e-05, 'epoch': 0.08}


  8%|▊         | 22000/281800 [33:30<6:38:39, 10.86it/s] 

{'loss': 0.5696, 'grad_norm': 1.1599538326263428, 'learning_rate': 1.8439247693399575e-05, 'epoch': 0.08}


  8%|▊         | 22500/281800 [34:16<6:23:15, 11.28it/s] 

{'loss': 0.5601, 'grad_norm': 1.5800378322601318, 'learning_rate': 1.840376153300213e-05, 'epoch': 0.08}


  8%|▊         | 23000/281800 [35:02<6:24:42, 11.21it/s] 

{'loss': 0.5703, 'grad_norm': 1.0589845180511475, 'learning_rate': 1.8368275372604684e-05, 'epoch': 0.08}


  8%|▊         | 23500/281800 [35:48<6:26:14, 11.15it/s] 

{'loss': 0.5654, 'grad_norm': 1.1836539506912231, 'learning_rate': 1.8332860184528038e-05, 'epoch': 0.08}


  9%|▊         | 24000/281800 [36:33<6:26:19, 11.12it/s] 

{'loss': 0.5626, 'grad_norm': 1.214566707611084, 'learning_rate': 1.829737402413059e-05, 'epoch': 0.09}


  9%|▊         | 24500/281800 [37:19<6:23:43, 11.18it/s] 

{'loss': 0.5708, 'grad_norm': 1.3187588453292847, 'learning_rate': 1.8261887863733147e-05, 'epoch': 0.09}


  9%|▉         | 25000/281800 [38:04<6:32:44, 10.90it/s] 

{'loss': 0.5633, 'grad_norm': 1.319372534751892, 'learning_rate': 1.82264017033357e-05, 'epoch': 0.09}


  9%|▉         | 25500/281800 [38:50<5:49:20, 12.23it/s] 

{'loss': 0.5582, 'grad_norm': 1.1034433841705322, 'learning_rate': 1.819098651525905e-05, 'epoch': 0.09}


  9%|▉         | 26000/281800 [39:36<6:31:31, 10.89it/s] 

{'loss': 0.5677, 'grad_norm': 1.3395615816116333, 'learning_rate': 1.8155500354861606e-05, 'epoch': 0.09}


  9%|▉         | 26500/281800 [40:22<6:16:26, 11.30it/s] 

{'loss': 0.5642, 'grad_norm': 1.1569175720214844, 'learning_rate': 1.812001419446416e-05, 'epoch': 0.09}


 10%|▉         | 27000/281800 [41:08<6:23:09, 11.08it/s] 

{'loss': 0.5591, 'grad_norm': 1.2829513549804688, 'learning_rate': 1.8084528034066716e-05, 'epoch': 0.1}


 10%|▉         | 27500/281800 [41:54<6:10:24, 11.44it/s] 

{'loss': 0.5613, 'grad_norm': 1.3330316543579102, 'learning_rate': 1.8049112845990066e-05, 'epoch': 0.1}


 10%|▉         | 28000/281800 [42:39<6:13:45, 11.32it/s] 

{'loss': 0.55, 'grad_norm': 1.5127109289169312, 'learning_rate': 1.801362668559262e-05, 'epoch': 0.1}


 10%|█         | 28500/281800 [43:25<6:21:13, 11.07it/s] 

{'loss': 0.5563, 'grad_norm': 1.0048608779907227, 'learning_rate': 1.7978140525195175e-05, 'epoch': 0.1}


 10%|█         | 29000/281800 [44:11<6:19:28, 11.10it/s] 

{'loss': 0.5614, 'grad_norm': 1.0492982864379883, 'learning_rate': 1.794265436479773e-05, 'epoch': 0.1}


 10%|█         | 29500/281800 [44:56<5:59:27, 11.70it/s] 

{'loss': 0.5595, 'grad_norm': 1.074137568473816, 'learning_rate': 1.7907239176721082e-05, 'epoch': 0.1}


 11%|█         | 30000/281800 [45:42<6:16:01, 11.16it/s] 

{'loss': 0.5681, 'grad_norm': 1.0880930423736572, 'learning_rate': 1.787182398864443e-05, 'epoch': 0.11}


 11%|█         | 30500/281800 [46:27<6:02:50, 11.54it/s] 

{'loss': 0.5448, 'grad_norm': 1.0425904989242554, 'learning_rate': 1.7836337828246985e-05, 'epoch': 0.11}


 11%|█         | 31000/281800 [47:13<6:13:35, 11.19it/s] 

{'loss': 0.5447, 'grad_norm': 1.0200183391571045, 'learning_rate': 1.7800851667849538e-05, 'epoch': 0.11}


 11%|█         | 31500/281800 [47:59<6:07:32, 11.35it/s] 

{'loss': 0.5646, 'grad_norm': 1.2593196630477905, 'learning_rate': 1.7765365507452094e-05, 'epoch': 0.11}


 11%|█▏        | 32000/281800 [48:44<5:44:06, 12.10it/s] 

{'loss': 0.552, 'grad_norm': 1.082169532775879, 'learning_rate': 1.772987934705465e-05, 'epoch': 0.11}


 12%|█▏        | 32500/281800 [49:29<6:18:39, 10.97it/s] 

{'loss': 0.5616, 'grad_norm': 1.2383627891540527, 'learning_rate': 1.7694393186657207e-05, 'epoch': 0.12}


 12%|█▏        | 33000/281800 [50:15<6:20:57, 10.88it/s] 

{'loss': 0.5607, 'grad_norm': 1.0247368812561035, 'learning_rate': 1.765890702625976e-05, 'epoch': 0.12}


 12%|█▏        | 33500/281800 [51:01<5:59:57, 11.50it/s] 

{'loss': 0.5467, 'grad_norm': 1.0528240203857422, 'learning_rate': 1.762349183818311e-05, 'epoch': 0.12}


 12%|█▏        | 34000/281800 [51:46<6:13:30, 11.06it/s] 

{'loss': 0.5498, 'grad_norm': 1.109207272529602, 'learning_rate': 1.7588005677785663e-05, 'epoch': 0.12}


 12%|█▏        | 34500/281800 [52:32<6:10:36, 11.12it/s] 

{'loss': 0.545, 'grad_norm': 0.9619848728179932, 'learning_rate': 1.755251951738822e-05, 'epoch': 0.12}


 12%|█▏        | 35000/281800 [53:18<6:01:54, 11.37it/s] 

{'loss': 0.5309, 'grad_norm': 1.22623610496521, 'learning_rate': 1.7517033356990776e-05, 'epoch': 0.12}


 13%|█▎        | 35500/281800 [54:03<6:06:38, 11.20it/s] 

{'loss': 0.5475, 'grad_norm': 1.0519026517868042, 'learning_rate': 1.748154719659333e-05, 'epoch': 0.13}


 13%|█▎        | 36000/281800 [54:49<5:56:22, 11.50it/s] 

{'loss': 0.5427, 'grad_norm': 1.1530616283416748, 'learning_rate': 1.7446061036195885e-05, 'epoch': 0.13}


 13%|█▎        | 36500/281800 [55:34<6:04:21, 11.22it/s] 

{'loss': 0.5483, 'grad_norm': 1.3780359029769897, 'learning_rate': 1.741057487579844e-05, 'epoch': 0.13}


 13%|█▎        | 37000/281800 [56:20<6:02:07, 11.27it/s] 

{'loss': 0.5547, 'grad_norm': 1.0022187232971191, 'learning_rate': 1.7375088715400994e-05, 'epoch': 0.13}


 13%|█▎        | 37500/281800 [57:05<6:04:23, 11.17it/s] 

{'loss': 0.5509, 'grad_norm': 1.2848938703536987, 'learning_rate': 1.7339673527324345e-05, 'epoch': 0.13}


 13%|█▎        | 38000/281800 [57:51<6:01:18, 11.25it/s] 

{'loss': 0.5407, 'grad_norm': 1.2499895095825195, 'learning_rate': 1.7304187366926898e-05, 'epoch': 0.13}


 14%|█▎        | 38500/281800 [58:37<5:36:06, 12.06it/s] 

{'loss': 0.545, 'grad_norm': 1.1048612594604492, 'learning_rate': 1.7268701206529454e-05, 'epoch': 0.14}


 14%|█▍        | 39000/281800 [59:23<6:07:21, 11.02it/s] 

{'loss': 0.5465, 'grad_norm': 0.988811194896698, 'learning_rate': 1.723321504613201e-05, 'epoch': 0.14}


 14%|█▍        | 39500/281800 [1:00:09<5:57:10, 11.31it/s]

{'loss': 0.5477, 'grad_norm': 1.2736809253692627, 'learning_rate': 1.719779985805536e-05, 'epoch': 0.14}


 14%|█▍        | 40000/281800 [1:00:55<5:46:24, 11.63it/s] 

{'loss': 0.5456, 'grad_norm': 1.3694251775741577, 'learning_rate': 1.7162313697657917e-05, 'epoch': 0.14}


 14%|█▍        | 40500/281800 [1:01:40<5:58:08, 11.23it/s] 

{'loss': 0.55, 'grad_norm': 1.251103401184082, 'learning_rate': 1.712682753726047e-05, 'epoch': 0.14}


 15%|█▍        | 41000/281800 [1:02:25<5:54:07, 11.33it/s] 

{'loss': 0.5356, 'grad_norm': 1.1059536933898926, 'learning_rate': 1.7091341376863023e-05, 'epoch': 0.15}


 15%|█▍        | 41500/281800 [1:03:11<5:52:38, 11.36it/s] 

{'loss': 0.5404, 'grad_norm': 1.2490112781524658, 'learning_rate': 1.7055926188786373e-05, 'epoch': 0.15}


 15%|█▍        | 42000/281800 [1:03:57<6:03:30, 10.99it/s] 

{'loss': 0.5365, 'grad_norm': 1.0613757371902466, 'learning_rate': 1.7020511000709726e-05, 'epoch': 0.15}


 15%|█▌        | 42500/281800 [1:04:43<5:58:46, 11.12it/s] 

{'loss': 0.5361, 'grad_norm': 1.237758994102478, 'learning_rate': 1.698502484031228e-05, 'epoch': 0.15}


 15%|█▌        | 43000/281800 [1:05:29<6:04:27, 10.92it/s] 

{'loss': 0.5369, 'grad_norm': 1.471661925315857, 'learning_rate': 1.6949538679914836e-05, 'epoch': 0.15}


 15%|█▌        | 43500/281800 [1:06:14<5:53:40, 11.23it/s] 

{'loss': 0.5386, 'grad_norm': 1.0337486267089844, 'learning_rate': 1.691405251951739e-05, 'epoch': 0.15}


 16%|█▌        | 44000/281800 [1:07:00<5:52:19, 11.25it/s] 

{'loss': 0.5358, 'grad_norm': 1.360400676727295, 'learning_rate': 1.6878566359119945e-05, 'epoch': 0.16}


 16%|█▌        | 44500/281800 [1:07:45<5:56:04, 11.11it/s] 

{'loss': 0.5397, 'grad_norm': 1.2226628065109253, 'learning_rate': 1.6843080198722498e-05, 'epoch': 0.16}


 16%|█▌        | 45000/281800 [1:08:32<6:00:36, 10.94it/s] 

{'loss': 0.5442, 'grad_norm': 1.2453649044036865, 'learning_rate': 1.6807594038325054e-05, 'epoch': 0.16}


 16%|█▌        | 45500/281800 [1:09:18<5:38:08, 11.65it/s] 

{'loss': 0.5391, 'grad_norm': 1.0153733491897583, 'learning_rate': 1.677210787792761e-05, 'epoch': 0.16}


 16%|█▋        | 46000/281800 [1:10:03<5:49:46, 11.24it/s] 

{'loss': 0.5346, 'grad_norm': 1.1595566272735596, 'learning_rate': 1.6736763662171755e-05, 'epoch': 0.16}


 17%|█▋        | 46500/281800 [1:10:49<5:50:13, 11.20it/s] 

{'loss': 0.5398, 'grad_norm': 0.8785991668701172, 'learning_rate': 1.6701277501774308e-05, 'epoch': 0.17}


 17%|█▋        | 47000/281800 [1:11:34<5:47:12, 11.27it/s] 

{'loss': 0.5283, 'grad_norm': 1.268650770187378, 'learning_rate': 1.6665791341376864e-05, 'epoch': 0.17}


 17%|█▋        | 47500/281800 [1:12:20<5:45:21, 11.31it/s] 

{'loss': 0.5282, 'grad_norm': 1.0471786260604858, 'learning_rate': 1.663030518097942e-05, 'epoch': 0.17}


 17%|█▋        | 48000/281800 [1:13:05<5:30:09, 11.80it/s] 

{'loss': 0.5384, 'grad_norm': 1.1059118509292603, 'learning_rate': 1.6594819020581973e-05, 'epoch': 0.17}


 17%|█▋        | 48500/281800 [1:13:50<5:47:25, 11.19it/s] 

{'loss': 0.545, 'grad_norm': 1.0890403985977173, 'learning_rate': 1.6559403832505327e-05, 'epoch': 0.17}


 17%|█▋        | 49000/281800 [1:14:36<5:51:18, 11.04it/s] 

{'loss': 0.5334, 'grad_norm': 1.0048788785934448, 'learning_rate': 1.652391767210788e-05, 'epoch': 0.17}


 18%|█▊        | 49500/281800 [1:15:22<5:46:10, 11.18it/s] 

{'loss': 0.5436, 'grad_norm': 0.96970134973526, 'learning_rate': 1.6488431511710433e-05, 'epoch': 0.18}


 18%|█▊        | 50000/281800 [1:16:07<5:46:26, 11.15it/s] 

{'loss': 0.5413, 'grad_norm': 1.24338698387146, 'learning_rate': 1.645294535131299e-05, 'epoch': 0.18}


 18%|█▊        | 50500/281800 [1:16:53<5:42:16, 11.26it/s] 

{'loss': 0.5349, 'grad_norm': 1.2311851978302002, 'learning_rate': 1.6417459190915542e-05, 'epoch': 0.18}


 18%|█▊        | 51000/281800 [1:17:38<5:38:47, 11.35it/s] 

{'loss': 0.5247, 'grad_norm': 1.0754059553146362, 'learning_rate': 1.63819730305181e-05, 'epoch': 0.18}


 18%|█▊        | 51500/281800 [1:18:24<5:28:52, 11.67it/s] 

{'loss': 0.5381, 'grad_norm': 1.090555191040039, 'learning_rate': 1.6346486870120655e-05, 'epoch': 0.18}


 18%|█▊        | 52000/281800 [1:19:09<5:38:12, 11.32it/s] 

{'loss': 0.5321, 'grad_norm': 1.3277870416641235, 'learning_rate': 1.631100070972321e-05, 'epoch': 0.18}


 19%|█▊        | 52500/281800 [1:19:55<5:39:55, 11.24it/s] 

{'loss': 0.532, 'grad_norm': 0.9401282668113708, 'learning_rate': 1.6275585521646558e-05, 'epoch': 0.19}


 19%|█▉        | 53000/281800 [1:20:40<5:42:46, 11.13it/s] 

{'loss': 0.5339, 'grad_norm': 1.0545721054077148, 'learning_rate': 1.6240099361249114e-05, 'epoch': 0.19}


 19%|█▉        | 53500/281800 [1:21:24<6:21:45,  9.97it/s] 

{'loss': 0.5276, 'grad_norm': 0.9685181975364685, 'learning_rate': 1.6204613200851667e-05, 'epoch': 0.19}


 19%|█▉        | 54000/281800 [1:22:03<4:32:10, 13.95it/s] 

{'loss': 0.5157, 'grad_norm': 1.0624477863311768, 'learning_rate': 1.6169127040454224e-05, 'epoch': 0.19}


 19%|█▉        | 54500/281800 [1:22:39<4:16:35, 14.76it/s] 

{'loss': 0.542, 'grad_norm': 1.2168822288513184, 'learning_rate': 1.6133711852377574e-05, 'epoch': 0.19}


 20%|█▉        | 55000/281800 [1:23:14<4:13:43, 14.90it/s] 

{'loss': 0.5173, 'grad_norm': 1.193863868713379, 'learning_rate': 1.609822569198013e-05, 'epoch': 0.2}


 20%|█▉        | 55500/281800 [1:23:49<4:17:04, 14.67it/s] 

{'loss': 0.5335, 'grad_norm': 1.028759241104126, 'learning_rate': 1.6062739531582687e-05, 'epoch': 0.2}


 20%|█▉        | 56000/281800 [1:24:24<4:18:03, 14.58it/s] 

{'loss': 0.5302, 'grad_norm': 1.042725920677185, 'learning_rate': 1.602725337118524e-05, 'epoch': 0.2}


 20%|██        | 56500/281800 [1:24:59<4:15:36, 14.69it/s] 

{'loss': 0.5354, 'grad_norm': 1.1392710208892822, 'learning_rate': 1.5991767210787793e-05, 'epoch': 0.2}


 20%|██        | 57000/281800 [1:25:35<4:17:23, 14.56it/s] 

{'loss': 0.5145, 'grad_norm': 0.9796858429908752, 'learning_rate': 1.595628105039035e-05, 'epoch': 0.2}


 20%|██        | 57500/281800 [1:26:24<5:53:34, 10.57it/s] 

{'loss': 0.5261, 'grad_norm': 0.9599082469940186, 'learning_rate': 1.5920794889992902e-05, 'epoch': 0.2}


 21%|██        | 58000/281800 [1:27:13<5:54:22, 10.53it/s] 

{'loss': 0.5221, 'grad_norm': 1.2830137014389038, 'learning_rate': 1.5885308729595458e-05, 'epoch': 0.21}


 21%|██        | 58500/281800 [1:28:02<5:53:38, 10.52it/s] 

{'loss': 0.5264, 'grad_norm': 0.8477219939231873, 'learning_rate': 1.584989354151881e-05, 'epoch': 0.21}


 21%|██        | 59000/281800 [1:28:52<5:53:25, 10.51it/s] 

{'loss': 0.5334, 'grad_norm': 1.2427664995193481, 'learning_rate': 1.5814407381121365e-05, 'epoch': 0.21}


 21%|██        | 59500/281800 [1:29:41<5:47:20, 10.67it/s] 

{'loss': 0.5285, 'grad_norm': 1.1137595176696777, 'learning_rate': 1.577892122072392e-05, 'epoch': 0.21}


 21%|██▏       | 60000/281800 [1:30:30<5:41:55, 10.81it/s] 

{'loss': 0.5327, 'grad_norm': 1.0191081762313843, 'learning_rate': 1.5743435060326474e-05, 'epoch': 0.21}


 21%|██▏       | 60500/281800 [1:31:19<5:47:26, 10.62it/s] 

{'loss': 0.5222, 'grad_norm': 1.1378185749053955, 'learning_rate': 1.5708019872249824e-05, 'epoch': 0.21}


 22%|██▏       | 61000/281800 [1:32:08<5:52:45, 10.43it/s] 

{'loss': 0.5216, 'grad_norm': 1.059006690979004, 'learning_rate': 1.5672533711852377e-05, 'epoch': 0.22}


 22%|██▏       | 61500/281800 [1:32:58<5:48:19, 10.54it/s] 

{'loss': 0.5292, 'grad_norm': 1.0664358139038086, 'learning_rate': 1.5637047551454934e-05, 'epoch': 0.22}


 22%|██▏       | 62000/281800 [1:33:46<5:45:51, 10.59it/s] 

{'loss': 0.5228, 'grad_norm': 1.283945083618164, 'learning_rate': 1.560156139105749e-05, 'epoch': 0.22}


 22%|██▏       | 62500/281800 [1:34:35<5:50:18, 10.43it/s] 

{'loss': 0.526, 'grad_norm': 1.0460909605026245, 'learning_rate': 1.556614620298084e-05, 'epoch': 0.22}


 22%|██▏       | 63000/281800 [1:35:24<5:40:59, 10.69it/s] 

{'loss': 0.5467, 'grad_norm': 1.1117905378341675, 'learning_rate': 1.5530660042583393e-05, 'epoch': 0.22}


 23%|██▎       | 63500/281800 [1:36:14<5:46:07, 10.51it/s] 

{'loss': 0.522, 'grad_norm': 0.8157203793525696, 'learning_rate': 1.5495173882185946e-05, 'epoch': 0.23}


 23%|██▎       | 64000/281800 [1:37:04<5:44:09, 10.55it/s] 

{'loss': 0.5264, 'grad_norm': 1.0291228294372559, 'learning_rate': 1.5459687721788502e-05, 'epoch': 0.23}


 23%|██▎       | 64500/281800 [1:37:53<5:40:52, 10.62it/s] 

{'loss': 0.5253, 'grad_norm': 0.8175716400146484, 'learning_rate': 1.5424272533711852e-05, 'epoch': 0.23}


 23%|██▎       | 65000/281800 [1:38:43<5:48:19, 10.37it/s] 

{'loss': 0.5224, 'grad_norm': 1.0669829845428467, 'learning_rate': 1.538878637331441e-05, 'epoch': 0.23}


 23%|██▎       | 65500/281800 [1:39:32<5:42:27, 10.53it/s] 

{'loss': 0.5163, 'grad_norm': 1.2452670335769653, 'learning_rate': 1.5353300212916965e-05, 'epoch': 0.23}


 23%|██▎       | 66000/281800 [1:40:21<5:39:57, 10.58it/s] 

{'loss': 0.5247, 'grad_norm': 1.0932191610336304, 'learning_rate': 1.5317814052519518e-05, 'epoch': 0.23}


 24%|██▎       | 66500/281800 [1:41:10<5:36:47, 10.65it/s] 

{'loss': 0.5159, 'grad_norm': 1.0713059902191162, 'learning_rate': 1.528239886444287e-05, 'epoch': 0.24}


 24%|██▍       | 67000/281800 [1:41:59<5:38:12, 10.59it/s] 

{'loss': 0.5312, 'grad_norm': 1.0364112854003906, 'learning_rate': 1.5246912704045425e-05, 'epoch': 0.24}


 24%|██▍       | 67500/281800 [1:42:48<5:33:02, 10.72it/s] 

{'loss': 0.5301, 'grad_norm': 1.1045020818710327, 'learning_rate': 1.5211426543647978e-05, 'epoch': 0.24}


 24%|██▍       | 68000/281800 [1:43:37<5:39:49, 10.49it/s] 

{'loss': 0.5255, 'grad_norm': 1.122795820236206, 'learning_rate': 1.5175940383250534e-05, 'epoch': 0.24}


 24%|██▍       | 68500/281800 [1:44:26<5:37:35, 10.53it/s] 

{'loss': 0.518, 'grad_norm': 0.9170549511909485, 'learning_rate': 1.5140525195173882e-05, 'epoch': 0.24}


 24%|██▍       | 69000/281800 [1:45:15<5:43:37, 10.32it/s] 

{'loss': 0.5235, 'grad_norm': 1.098803997039795, 'learning_rate': 1.5105039034776439e-05, 'epoch': 0.24}


 25%|██▍       | 69500/281800 [1:46:05<5:36:56, 10.50it/s] 

{'loss': 0.5312, 'grad_norm': 1.0564814805984497, 'learning_rate': 1.5069552874378994e-05, 'epoch': 0.25}


 25%|██▍       | 70000/281800 [1:46:54<5:43:48, 10.27it/s] 

{'loss': 0.5245, 'grad_norm': 1.2651070356369019, 'learning_rate': 1.5034066713981548e-05, 'epoch': 0.25}


 25%|██▌       | 70500/281800 [1:47:43<5:30:05, 10.67it/s] 

{'loss': 0.5263, 'grad_norm': 1.2766330242156982, 'learning_rate': 1.4998580553584103e-05, 'epoch': 0.25}


 25%|██▌       | 71000/281800 [1:48:32<5:32:23, 10.57it/s] 

{'loss': 0.5311, 'grad_norm': 0.9460147023200989, 'learning_rate': 1.4963165365507453e-05, 'epoch': 0.25}


 25%|██▌       | 71500/281800 [1:49:21<5:31:06, 10.59it/s] 

{'loss': 0.5117, 'grad_norm': 1.0260452032089233, 'learning_rate': 1.4927679205110008e-05, 'epoch': 0.25}


 26%|██▌       | 72000/281800 [1:50:10<5:30:36, 10.58it/s] 

{'loss': 0.5226, 'grad_norm': 1.049854040145874, 'learning_rate': 1.4892193044712564e-05, 'epoch': 0.26}


 26%|██▌       | 72500/281800 [1:50:59<5:34:13, 10.44it/s] 

{'loss': 0.5161, 'grad_norm': 1.1086502075195312, 'learning_rate': 1.4856706884315117e-05, 'epoch': 0.26}


 26%|██▌       | 73000/281800 [1:51:48<5:30:54, 10.52it/s] 

{'loss': 0.5137, 'grad_norm': 1.1284183263778687, 'learning_rate': 1.4821291696238469e-05, 'epoch': 0.26}


 26%|██▌       | 73500/281800 [1:52:38<5:33:02, 10.42it/s] 

{'loss': 0.5142, 'grad_norm': 1.0431259870529175, 'learning_rate': 1.4785805535841022e-05, 'epoch': 0.26}


 26%|██▋       | 74000/281800 [1:53:27<5:32:02, 10.43it/s] 

{'loss': 0.5153, 'grad_norm': 0.8173641562461853, 'learning_rate': 1.4750319375443578e-05, 'epoch': 0.26}


 26%|██▋       | 74500/281800 [1:54:16<5:30:31, 10.45it/s] 

{'loss': 0.5213, 'grad_norm': 1.1194462776184082, 'learning_rate': 1.4714833215046135e-05, 'epoch': 0.26}


 27%|██▋       | 75000/281800 [1:55:05<5:25:57, 10.57it/s] 

{'loss': 0.5269, 'grad_norm': 1.1464323997497559, 'learning_rate': 1.4679418026969483e-05, 'epoch': 0.27}


 27%|██▋       | 75500/281800 [1:55:54<5:26:43, 10.52it/s] 

{'loss': 0.5197, 'grad_norm': 1.4506088495254517, 'learning_rate': 1.464393186657204e-05, 'epoch': 0.27}


 27%|██▋       | 76000/281800 [1:56:43<5:24:43, 10.56it/s] 

{'loss': 0.508, 'grad_norm': 1.055250883102417, 'learning_rate': 1.4608445706174592e-05, 'epoch': 0.27}


 27%|██▋       | 76500/281800 [1:57:32<5:18:07, 10.76it/s] 

{'loss': 0.5173, 'grad_norm': 0.9783798456192017, 'learning_rate': 1.4572959545777147e-05, 'epoch': 0.27}


 27%|██▋       | 77000/281800 [1:58:21<5:26:38, 10.45it/s] 

{'loss': 0.5087, 'grad_norm': 1.24388587474823, 'learning_rate': 1.4537544357700497e-05, 'epoch': 0.27}


 28%|██▊       | 77500/281800 [1:59:10<5:20:58, 10.61it/s] 

{'loss': 0.5236, 'grad_norm': 0.9542014598846436, 'learning_rate': 1.4502058197303053e-05, 'epoch': 0.28}


 28%|██▊       | 78000/281800 [1:59:59<5:24:05, 10.48it/s] 

{'loss': 0.5252, 'grad_norm': 1.0257898569107056, 'learning_rate': 1.4466572036905608e-05, 'epoch': 0.28}


 28%|██▊       | 78500/281800 [2:00:48<5:19:52, 10.59it/s] 

{'loss': 0.5127, 'grad_norm': 1.0041415691375732, 'learning_rate': 1.4431156848828958e-05, 'epoch': 0.28}


 28%|██▊       | 79000/281800 [2:01:37<5:19:12, 10.59it/s] 

{'loss': 0.5106, 'grad_norm': 1.3106333017349243, 'learning_rate': 1.4395670688431513e-05, 'epoch': 0.28}


 28%|██▊       | 79500/281800 [2:02:27<5:20:20, 10.53it/s] 

{'loss': 0.5148, 'grad_norm': 1.0662994384765625, 'learning_rate': 1.4360184528034068e-05, 'epoch': 0.28}


 28%|██▊       | 80000/281800 [2:03:16<5:19:26, 10.53it/s] 

{'loss': 0.5314, 'grad_norm': 0.8897107243537903, 'learning_rate': 1.4324698367636622e-05, 'epoch': 0.28}


 29%|██▊       | 80500/281800 [2:04:05<5:23:08, 10.38it/s] 

{'loss': 0.5021, 'grad_norm': 0.9441683888435364, 'learning_rate': 1.4289212207239179e-05, 'epoch': 0.29}


 29%|██▊       | 81000/281800 [2:04:54<5:14:39, 10.64it/s] 

{'loss': 0.5162, 'grad_norm': 0.9047272205352783, 'learning_rate': 1.4253726046841733e-05, 'epoch': 0.29}


 29%|██▉       | 81500/281800 [2:05:43<5:11:33, 10.71it/s] 

{'loss': 0.5099, 'grad_norm': 1.0889866352081299, 'learning_rate': 1.4218239886444288e-05, 'epoch': 0.29}


 29%|██▉       | 82000/281800 [2:06:32<5:14:57, 10.57it/s] 

{'loss': 0.5154, 'grad_norm': 1.038815975189209, 'learning_rate': 1.4182753726046843e-05, 'epoch': 0.29}


 29%|██▉       | 82500/281800 [2:07:21<5:12:15, 10.64it/s] 

{'loss': 0.5055, 'grad_norm': 1.1007376909255981, 'learning_rate': 1.4147338537970193e-05, 'epoch': 0.29}


 29%|██▉       | 83000/281800 [2:08:10<5:19:35, 10.37it/s] 

{'loss': 0.5121, 'grad_norm': 0.9549280405044556, 'learning_rate': 1.4111852377572747e-05, 'epoch': 0.29}


 30%|██▉       | 83500/281800 [2:09:03<5:14:23, 10.51it/s] 

{'loss': 0.5131, 'grad_norm': 0.9322795867919922, 'learning_rate': 1.4076366217175304e-05, 'epoch': 0.3}


 30%|██▉       | 84000/281800 [2:09:54<5:29:59,  9.99it/s] 

{'loss': 0.5146, 'grad_norm': 1.0410672426223755, 'learning_rate': 1.4040880056777857e-05, 'epoch': 0.3}


 30%|██▉       | 84500/281800 [2:10:46<5:17:37, 10.35it/s] 

{'loss': 0.5079, 'grad_norm': 1.1396421194076538, 'learning_rate': 1.4005464868701209e-05, 'epoch': 0.3}


 30%|███       | 85000/281800 [2:11:38<5:26:26, 10.05it/s] 

{'loss': 0.5093, 'grad_norm': 1.3636913299560547, 'learning_rate': 1.3969978708303762e-05, 'epoch': 0.3}


 30%|███       | 85500/281800 [2:12:32<5:21:54, 10.16it/s] 

{'loss': 0.5149, 'grad_norm': 1.1941474676132202, 'learning_rate': 1.3934492547906318e-05, 'epoch': 0.3}


 31%|███       | 86000/281800 [2:13:24<5:36:07,  9.71it/s] 

{'loss': 0.5162, 'grad_norm': 0.9566338658332825, 'learning_rate': 1.3899006387508873e-05, 'epoch': 0.31}


 31%|███       | 86500/281800 [2:14:16<5:14:54, 10.34it/s] 

{'loss': 0.5149, 'grad_norm': 1.0251845121383667, 'learning_rate': 1.3863520227111427e-05, 'epoch': 0.31}


 31%|███       | 87000/281800 [2:15:08<5:29:44,  9.85it/s] 

{'loss': 0.5046, 'grad_norm': 0.8732764720916748, 'learning_rate': 1.3828105039034777e-05, 'epoch': 0.31}


 31%|███       | 87500/281800 [2:16:00<5:11:27, 10.40it/s] 

{'loss': 0.5124, 'grad_norm': 1.1576164960861206, 'learning_rate': 1.3792618878637332e-05, 'epoch': 0.31}


 31%|███       | 88000/281800 [2:16:51<5:08:11, 10.48it/s] 

{'loss': 0.502, 'grad_norm': 1.2324497699737549, 'learning_rate': 1.3757132718239887e-05, 'epoch': 0.31}


 31%|███▏      | 88500/281800 [2:17:43<5:06:50, 10.50it/s] 

{'loss': 0.512, 'grad_norm': 1.0403692722320557, 'learning_rate': 1.3721646557842443e-05, 'epoch': 0.31}


 32%|███▏      | 89000/281800 [2:18:34<5:13:48, 10.24it/s] 

{'loss': 0.5123, 'grad_norm': 1.1961222887039185, 'learning_rate': 1.3686231369765792e-05, 'epoch': 0.32}


 32%|███▏      | 89500/281800 [2:19:26<5:24:24,  9.88it/s] 

{'loss': 0.5113, 'grad_norm': 1.0855181217193604, 'learning_rate': 1.3650745209368348e-05, 'epoch': 0.32}


 32%|███▏      | 90000/281800 [2:20:20<5:08:37, 10.36it/s] 

{'loss': 0.5042, 'grad_norm': 1.1526591777801514, 'learning_rate': 1.3615259048970904e-05, 'epoch': 0.32}


 32%|███▏      | 90500/281800 [2:21:12<5:07:02, 10.38it/s] 

{'loss': 0.5047, 'grad_norm': 1.1978740692138672, 'learning_rate': 1.3579772888573457e-05, 'epoch': 0.32}


 32%|███▏      | 91000/281800 [2:22:04<5:27:45,  9.70it/s] 

{'loss': 0.5095, 'grad_norm': 1.0991114377975464, 'learning_rate': 1.3544357700496806e-05, 'epoch': 0.32}


 32%|███▏      | 91500/281800 [2:22:55<5:10:04, 10.23it/s] 

{'loss': 0.5206, 'grad_norm': 0.9006065130233765, 'learning_rate': 1.3508871540099362e-05, 'epoch': 0.32}


 33%|███▎      | 92000/281800 [2:23:46<4:59:52, 10.55it/s] 

{'loss': 0.5116, 'grad_norm': 1.1281867027282715, 'learning_rate': 1.3473385379701918e-05, 'epoch': 0.33}


 33%|███▎      | 92500/281800 [2:24:37<5:12:48, 10.09it/s] 

{'loss': 0.5061, 'grad_norm': 1.115171194076538, 'learning_rate': 1.3437899219304473e-05, 'epoch': 0.33}


 33%|███▎      | 93000/281800 [2:25:30<5:16:52,  9.93it/s] 

{'loss': 0.5188, 'grad_norm': 0.9870453476905823, 'learning_rate': 1.3402484031227823e-05, 'epoch': 0.33}


 33%|███▎      | 93500/281800 [2:26:23<5:35:09,  9.36it/s] 

{'loss': 0.5163, 'grad_norm': 1.0105611085891724, 'learning_rate': 1.3366997870830378e-05, 'epoch': 0.33}


 33%|███▎      | 94000/281800 [2:27:13<5:25:09,  9.63it/s] 

{'loss': 0.5072, 'grad_norm': 0.891745924949646, 'learning_rate': 1.3331511710432931e-05, 'epoch': 0.33}


 34%|███▎      | 94500/281800 [2:28:08<5:01:27, 10.36it/s] 

{'loss': 0.5125, 'grad_norm': 1.0577201843261719, 'learning_rate': 1.3296025550035487e-05, 'epoch': 0.34}


 34%|███▎      | 95000/281800 [2:29:02<4:56:40, 10.49it/s] 

{'loss': 0.5231, 'grad_norm': 1.013600468635559, 'learning_rate': 1.3260681334279633e-05, 'epoch': 0.34}


 34%|███▍      | 95500/281800 [2:29:56<5:10:43,  9.99it/s] 

{'loss': 0.52, 'grad_norm': 0.8754515051841736, 'learning_rate': 1.3225195173882186e-05, 'epoch': 0.34}


 34%|███▍      | 96000/281800 [2:30:50<5:21:06,  9.64it/s] 

{'loss': 0.5123, 'grad_norm': 1.1849242448806763, 'learning_rate': 1.3189709013484742e-05, 'epoch': 0.34}


 34%|███▍      | 96500/281800 [2:31:42<5:04:44, 10.13it/s] 

{'loss': 0.5098, 'grad_norm': 1.1027567386627197, 'learning_rate': 1.3154222853087297e-05, 'epoch': 0.34}


 34%|███▍      | 97000/281800 [2:32:35<4:53:50, 10.48it/s] 

{'loss': 0.5256, 'grad_norm': 0.945651650428772, 'learning_rate': 1.3118736692689853e-05, 'epoch': 0.34}


 35%|███▍      | 97500/281800 [2:33:28<5:02:35, 10.15it/s] 

{'loss': 0.5148, 'grad_norm': 1.165950059890747, 'learning_rate': 1.3083250532292406e-05, 'epoch': 0.35}


 35%|███▍      | 98000/281800 [2:34:21<5:31:31,  9.24it/s] 

{'loss': 0.4973, 'grad_norm': 1.258467674255371, 'learning_rate': 1.3047764371894963e-05, 'epoch': 0.35}


 35%|███▍      | 98500/281800 [2:35:13<5:21:28,  9.50it/s] 

{'loss': 0.5058, 'grad_norm': 1.0372627973556519, 'learning_rate': 1.3012278211497517e-05, 'epoch': 0.35}


 35%|███▌      | 99000/281800 [2:36:07<5:23:59,  9.40it/s] 

{'loss': 0.5064, 'grad_norm': 1.1874940395355225, 'learning_rate': 1.2976863023420867e-05, 'epoch': 0.35}


 35%|███▌      | 99500/281800 [2:36:59<5:28:39,  9.24it/s] 

{'loss': 0.5055, 'grad_norm': 1.4676824808120728, 'learning_rate': 1.2941376863023422e-05, 'epoch': 0.35}


 35%|███▌      | 100000/281800 [2:37:52<5:40:02,  8.91it/s]

{'loss': 0.5095, 'grad_norm': 1.040894865989685, 'learning_rate': 1.2905890702625977e-05, 'epoch': 0.35}


 36%|███▌      | 100500/281800 [2:38:43<5:13:19,  9.64it/s] 

{'loss': 0.5093, 'grad_norm': 1.153449535369873, 'learning_rate': 1.2870404542228531e-05, 'epoch': 0.36}


 36%|███▌      | 101000/281800 [2:39:35<4:35:41, 10.93it/s] 

{'loss': 0.5052, 'grad_norm': 1.3157284259796143, 'learning_rate': 1.2834918381831088e-05, 'epoch': 0.36}


 36%|███▌      | 101500/281800 [2:40:27<4:47:23, 10.46it/s] 

{'loss': 0.5023, 'grad_norm': 0.9462428689002991, 'learning_rate': 1.2799503193754436e-05, 'epoch': 0.36}


 36%|███▌      | 102000/281800 [2:41:17<4:56:51, 10.09it/s] 

{'loss': 0.5175, 'grad_norm': 0.9476959705352783, 'learning_rate': 1.2764017033356993e-05, 'epoch': 0.36}


 36%|███▋      | 102500/281800 [2:42:09<5:27:39,  9.12it/s] 

{'loss': 0.5106, 'grad_norm': 1.0692501068115234, 'learning_rate': 1.2728530872959546e-05, 'epoch': 0.36}


 37%|███▋      | 103000/281800 [2:43:02<4:44:01, 10.49it/s] 

{'loss': 0.5214, 'grad_norm': 0.9643185138702393, 'learning_rate': 1.2693044712562102e-05, 'epoch': 0.37}


 37%|███▋      | 103500/281800 [2:43:53<5:20:20,  9.28it/s] 

{'loss': 0.5095, 'grad_norm': 1.0661299228668213, 'learning_rate': 1.265762952448545e-05, 'epoch': 0.37}


 37%|███▋      | 104000/281800 [2:44:47<4:56:42,  9.99it/s] 

{'loss': 0.4998, 'grad_norm': 1.0312525033950806, 'learning_rate': 1.2622143364088007e-05, 'epoch': 0.37}


 37%|███▋      | 104500/281800 [2:45:41<4:45:56, 10.33it/s] 

{'loss': 0.5036, 'grad_norm': 0.9031345844268799, 'learning_rate': 1.2586657203690561e-05, 'epoch': 0.37}


 37%|███▋      | 105000/281800 [2:46:33<4:55:38,  9.97it/s] 

{'loss': 0.5127, 'grad_norm': 1.2287862300872803, 'learning_rate': 1.2551171043293118e-05, 'epoch': 0.37}


 37%|███▋      | 105500/281800 [2:47:25<5:13:26,  9.37it/s] 

{'loss': 0.5105, 'grad_norm': 0.8069232702255249, 'learning_rate': 1.2515755855216468e-05, 'epoch': 0.37}


 38%|███▊      | 106000/281800 [2:48:18<4:45:27, 10.26it/s] 

{'loss': 0.5023, 'grad_norm': 0.9573516249656677, 'learning_rate': 1.2480269694819021e-05, 'epoch': 0.38}


 38%|███▊      | 106500/281800 [2:49:08<4:40:54, 10.40it/s] 

{'loss': 0.5047, 'grad_norm': 1.1563235521316528, 'learning_rate': 1.2444783534421576e-05, 'epoch': 0.38}


 38%|███▊      | 107000/281800 [2:49:59<4:22:00, 11.12it/s] 

{'loss': 0.5064, 'grad_norm': 0.919673502445221, 'learning_rate': 1.2409297374024132e-05, 'epoch': 0.38}


 38%|███▊      | 107500/281800 [2:50:52<5:14:52,  9.23it/s] 

{'loss': 0.4932, 'grad_norm': 1.1708383560180664, 'learning_rate': 1.237388218594748e-05, 'epoch': 0.38}


 38%|███▊      | 108000/281800 [2:51:44<4:36:32, 10.47it/s] 

{'loss': 0.5139, 'grad_norm': 1.2670916318893433, 'learning_rate': 1.2338396025550037e-05, 'epoch': 0.38}


 39%|███▊      | 108500/281800 [2:52:35<4:53:30,  9.84it/s] 

{'loss': 0.4976, 'grad_norm': 1.0001097917556763, 'learning_rate': 1.2302909865152593e-05, 'epoch': 0.39}


 39%|███▊      | 109000/281800 [2:53:27<4:45:06, 10.10it/s] 

{'loss': 0.5112, 'grad_norm': 1.1077604293823242, 'learning_rate': 1.2267494677075942e-05, 'epoch': 0.39}


 39%|███▉      | 109500/281800 [2:54:20<4:43:28, 10.13it/s] 

{'loss': 0.4953, 'grad_norm': 1.002794623374939, 'learning_rate': 1.2232008516678498e-05, 'epoch': 0.39}


 39%|███▉      | 110000/281800 [2:55:12<4:31:36, 10.54it/s] 

{'loss': 0.5125, 'grad_norm': 1.0230333805084229, 'learning_rate': 1.2196522356281051e-05, 'epoch': 0.39}


 39%|███▉      | 110500/281800 [2:56:03<3:14:54, 14.65it/s] 

{'loss': 0.5066, 'grad_norm': 0.9650119543075562, 'learning_rate': 1.2161036195883607e-05, 'epoch': 0.39}


 39%|███▉      | 111000/281800 [2:56:38<3:11:47, 14.84it/s] 

{'loss': 0.509, 'grad_norm': 1.051639199256897, 'learning_rate': 1.2125550035486162e-05, 'epoch': 0.39}


 40%|███▉      | 111500/281800 [2:57:12<3:11:14, 14.84it/s] 

{'loss': 0.5042, 'grad_norm': 1.0305429697036743, 'learning_rate': 1.2090063875088715e-05, 'epoch': 0.4}


 40%|███▉      | 112000/281800 [2:57:47<3:11:50, 14.75it/s]

{'loss': 0.5023, 'grad_norm': 1.2849200963974, 'learning_rate': 1.2054577714691271e-05, 'epoch': 0.4}


 40%|███▉      | 112500/281800 [2:58:21<3:10:02, 14.85it/s]

{'loss': 0.5021, 'grad_norm': 1.1817188262939453, 'learning_rate': 1.2019091554293828e-05, 'epoch': 0.4}


 40%|████      | 113000/281800 [2:58:56<3:08:44, 14.91it/s]

{'loss': 0.502, 'grad_norm': 1.0918716192245483, 'learning_rate': 1.1983676366217176e-05, 'epoch': 0.4}


 40%|████      | 113500/281800 [2:59:31<3:07:27, 14.96it/s] 

{'loss': 0.5106, 'grad_norm': 0.9322749376296997, 'learning_rate': 1.1948190205819732e-05, 'epoch': 0.4}


 40%|████      | 114000/281800 [3:00:05<3:06:37, 14.99it/s]

{'loss': 0.5101, 'grad_norm': 1.0018908977508545, 'learning_rate': 1.1912704045422285e-05, 'epoch': 0.4}


 41%|████      | 114500/281800 [3:00:40<3:15:13, 14.28it/s]

{'loss': 0.5121, 'grad_norm': 1.0592427253723145, 'learning_rate': 1.1877217885024842e-05, 'epoch': 0.41}


 41%|████      | 115000/281800 [3:01:15<3:09:26, 14.67it/s]

{'loss': 0.5042, 'grad_norm': 0.9425754547119141, 'learning_rate': 1.184180269694819e-05, 'epoch': 0.41}


 41%|████      | 115500/281800 [3:01:50<3:08:36, 14.70it/s]

{'loss': 0.5161, 'grad_norm': 0.8947781920433044, 'learning_rate': 1.1806316536550747e-05, 'epoch': 0.41}


 41%|████      | 116000/281800 [3:02:24<3:12:16, 14.37it/s]

{'loss': 0.5044, 'grad_norm': 1.2578670978546143, 'learning_rate': 1.1770830376153301e-05, 'epoch': 0.41}


 41%|████▏     | 116500/281800 [3:02:59<3:04:41, 14.92it/s]

{'loss': 0.5064, 'grad_norm': 1.0448155403137207, 'learning_rate': 1.1735344215755858e-05, 'epoch': 0.41}


 42%|████▏     | 117000/281800 [3:03:33<3:04:52, 14.86it/s]

{'loss': 0.502, 'grad_norm': 1.040855884552002, 'learning_rate': 1.169985805535841e-05, 'epoch': 0.42}


 42%|████▏     | 117500/281800 [3:04:08<3:06:18, 14.70it/s]

{'loss': 0.5013, 'grad_norm': 1.113305926322937, 'learning_rate': 1.166444286728176e-05, 'epoch': 0.42}


 42%|████▏     | 118000/281800 [3:04:42<3:09:51, 14.38it/s]

{'loss': 0.5048, 'grad_norm': 0.933859646320343, 'learning_rate': 1.1628956706884315e-05, 'epoch': 0.42}


 42%|████▏     | 118500/281800 [3:05:17<3:02:46, 14.89it/s]

{'loss': 0.5114, 'grad_norm': 1.4366730451583862, 'learning_rate': 1.1593470546486872e-05, 'epoch': 0.42}


 42%|████▏     | 119000/281800 [3:05:52<3:04:25, 14.71it/s]

{'loss': 0.4975, 'grad_norm': 0.9674678444862366, 'learning_rate': 1.1557984386089426e-05, 'epoch': 0.42}


 42%|████▏     | 119500/281800 [3:06:26<3:04:28, 14.66it/s]

{'loss': 0.4964, 'grad_norm': 0.9187862277030945, 'learning_rate': 1.1522569198012777e-05, 'epoch': 0.42}


 43%|████▎     | 120000/281800 [3:07:01<3:01:47, 14.83it/s]

{'loss': 0.5006, 'grad_norm': 0.9821234941482544, 'learning_rate': 1.1487083037615331e-05, 'epoch': 0.43}


 43%|████▎     | 120500/281800 [3:07:36<3:00:59, 14.85it/s]

{'loss': 0.5212, 'grad_norm': 0.8266440033912659, 'learning_rate': 1.1451596877217886e-05, 'epoch': 0.43}


 43%|████▎     | 121000/281800 [3:08:11<3:00:33, 14.84it/s]

{'loss': 0.5059, 'grad_norm': 0.9221459627151489, 'learning_rate': 1.141611071682044e-05, 'epoch': 0.43}


 43%|████▎     | 121500/281800 [3:08:45<3:00:26, 14.81it/s]

{'loss': 0.5018, 'grad_norm': 1.1598212718963623, 'learning_rate': 1.138069552874379e-05, 'epoch': 0.43}


 43%|████▎     | 122000/281800 [3:09:20<2:59:13, 14.86it/s]

{'loss': 0.4984, 'grad_norm': 1.0774723291397095, 'learning_rate': 1.1345209368346345e-05, 'epoch': 0.43}


 43%|████▎     | 122500/281800 [3:09:55<2:58:51, 14.84it/s]

{'loss': 0.4982, 'grad_norm': 0.9476217031478882, 'learning_rate': 1.1309723207948902e-05, 'epoch': 0.43}


 44%|████▎     | 123000/281800 [3:10:30<2:59:08, 14.77it/s]

{'loss': 0.5166, 'grad_norm': 1.1092382669448853, 'learning_rate': 1.1274237047551455e-05, 'epoch': 0.44}


 44%|████▍     | 123500/281800 [3:11:04<2:56:54, 14.91it/s]

{'loss': 0.4951, 'grad_norm': 0.985008955001831, 'learning_rate': 1.1238821859474807e-05, 'epoch': 0.44}


 44%|████▍     | 124000/281800 [3:11:39<2:55:18, 15.00it/s]

{'loss': 0.5008, 'grad_norm': 1.1033992767333984, 'learning_rate': 1.120333569907736e-05, 'epoch': 0.44}


 44%|████▍     | 124500/281800 [3:12:13<2:56:46, 14.83it/s]

{'loss': 0.4924, 'grad_norm': 0.890769898891449, 'learning_rate': 1.1167849538679916e-05, 'epoch': 0.44}


 44%|████▍     | 125000/281800 [3:12:48<2:53:30, 15.06it/s]

{'loss': 0.4994, 'grad_norm': 1.1326361894607544, 'learning_rate': 1.1132363378282472e-05, 'epoch': 0.44}


 45%|████▍     | 125500/281800 [3:13:24<2:54:53, 14.89it/s] 

{'loss': 0.4919, 'grad_norm': 1.0778276920318604, 'learning_rate': 1.109694819020582e-05, 'epoch': 0.45}


 45%|████▍     | 126000/281800 [3:13:59<2:56:29, 14.71it/s]

{'loss': 0.4937, 'grad_norm': 1.0835943222045898, 'learning_rate': 1.1061462029808377e-05, 'epoch': 0.45}


 45%|████▍     | 126500/281800 [3:14:34<2:58:41, 14.48it/s]

{'loss': 0.5023, 'grad_norm': 1.2512617111206055, 'learning_rate': 1.102597586941093e-05, 'epoch': 0.45}


 45%|████▌     | 127000/281800 [3:15:09<2:52:53, 14.92it/s]

{'loss': 0.5026, 'grad_norm': 0.892920970916748, 'learning_rate': 1.0990489709013485e-05, 'epoch': 0.45}


 45%|████▌     | 127500/281800 [3:15:43<2:53:46, 14.80it/s] 

{'loss': 0.4926, 'grad_norm': 0.9518246650695801, 'learning_rate': 1.0955074520936835e-05, 'epoch': 0.45}


 45%|████▌     | 128000/281800 [3:16:20<2:51:52, 14.91it/s] 

{'loss': 0.4965, 'grad_norm': 0.8992277979850769, 'learning_rate': 1.0919588360539391e-05, 'epoch': 0.45}


 46%|████▌     | 128500/281800 [3:16:59<3:16:33, 13.00it/s] 

{'loss': 0.505, 'grad_norm': 1.4547984600067139, 'learning_rate': 1.088417317246274e-05, 'epoch': 0.46}


 46%|████▌     | 129000/281800 [3:17:40<2:51:02, 14.89it/s] 

{'loss': 0.4974, 'grad_norm': 1.1829758882522583, 'learning_rate': 1.0848687012065296e-05, 'epoch': 0.46}


 46%|████▌     | 129500/281800 [3:18:14<2:48:36, 15.05it/s]

{'loss': 0.5053, 'grad_norm': 0.8815774321556091, 'learning_rate': 1.081320085166785e-05, 'epoch': 0.46}


 46%|████▌     | 130000/281800 [3:18:49<2:50:06, 14.87it/s]

{'loss': 0.5061, 'grad_norm': 0.9293337464332581, 'learning_rate': 1.0777714691270405e-05, 'epoch': 0.46}


 46%|████▋     | 130500/281800 [3:19:24<2:58:25, 14.13it/s]

{'loss': 0.501, 'grad_norm': 1.0181087255477905, 'learning_rate': 1.074222853087296e-05, 'epoch': 0.46}


 46%|████▋     | 131000/281800 [3:19:58<2:50:00, 14.78it/s]

{'loss': 0.4954, 'grad_norm': 0.9295673370361328, 'learning_rate': 1.0706742370475516e-05, 'epoch': 0.46}


 47%|████▋     | 131500/281800 [3:20:33<2:51:26, 14.61it/s]

{'loss': 0.4991, 'grad_norm': 0.8740679025650024, 'learning_rate': 1.0671256210078071e-05, 'epoch': 0.47}


 47%|████▋     | 132000/281800 [3:21:07<2:50:12, 14.67it/s]

{'loss': 0.5036, 'grad_norm': 1.2018013000488281, 'learning_rate': 1.0635770049680626e-05, 'epoch': 0.47}


 47%|████▋     | 132500/281800 [3:21:42<2:47:49, 14.83it/s]

{'loss': 0.5051, 'grad_norm': 1.1948022842407227, 'learning_rate': 1.0600354861603976e-05, 'epoch': 0.47}


 47%|████▋     | 133000/281800 [3:22:16<2:46:01, 14.94it/s]

{'loss': 0.4975, 'grad_norm': 1.1928563117980957, 'learning_rate': 1.056486870120653e-05, 'epoch': 0.47}


 47%|████▋     | 133500/281800 [3:22:51<2:46:40, 14.83it/s]

{'loss': 0.497, 'grad_norm': 1.1199935674667358, 'learning_rate': 1.0529382540809085e-05, 'epoch': 0.47}


 48%|████▊     | 134000/281800 [3:23:25<2:44:11, 15.00it/s]

{'loss': 0.4949, 'grad_norm': 1.0276587009429932, 'learning_rate': 1.0493896380411642e-05, 'epoch': 0.48}


 48%|████▊     | 134500/281800 [3:24:00<2:44:49, 14.89it/s]

{'loss': 0.5079, 'grad_norm': 0.9690319299697876, 'learning_rate': 1.045848119233499e-05, 'epoch': 0.48}


 48%|████▊     | 135000/281800 [3:24:35<2:43:37, 14.95it/s]

{'loss': 0.501, 'grad_norm': 1.3716593980789185, 'learning_rate': 1.0422995031937546e-05, 'epoch': 0.48}


 48%|████▊     | 135500/281800 [3:25:09<2:41:28, 15.10it/s]

{'loss': 0.509, 'grad_norm': 1.1750847101211548, 'learning_rate': 1.03875088715401e-05, 'epoch': 0.48}


 48%|████▊     | 136000/281800 [3:25:44<2:46:08, 14.63it/s]

{'loss': 0.4904, 'grad_norm': 1.0876094102859497, 'learning_rate': 1.0352022711142656e-05, 'epoch': 0.48}


 48%|████▊     | 136500/281800 [3:26:18<2:46:47, 14.52it/s]

{'loss': 0.498, 'grad_norm': 1.3120349645614624, 'learning_rate': 1.0316607523066004e-05, 'epoch': 0.48}


 49%|████▊     | 137000/281800 [3:26:53<2:41:31, 14.94it/s]

{'loss': 0.5069, 'grad_norm': 0.9174158573150635, 'learning_rate': 1.028112136266856e-05, 'epoch': 0.49}


 49%|████▉     | 137500/281800 [3:27:27<2:47:27, 14.36it/s]

{'loss': 0.4951, 'grad_norm': 1.250253677368164, 'learning_rate': 1.0245635202271115e-05, 'epoch': 0.49}


 49%|████▉     | 138000/281800 [3:28:02<2:41:20, 14.85it/s]

{'loss': 0.5019, 'grad_norm': 0.9870491027832031, 'learning_rate': 1.021014904187367e-05, 'epoch': 0.49}


 49%|████▉     | 138500/281800 [3:28:36<2:42:32, 14.69it/s]

{'loss': 0.5036, 'grad_norm': 0.915999710559845, 'learning_rate': 1.0174733853797022e-05, 'epoch': 0.49}


 49%|████▉     | 139000/281800 [3:29:11<2:41:19, 14.75it/s]

{'loss': 0.4958, 'grad_norm': 1.0194116830825806, 'learning_rate': 1.0139247693399575e-05, 'epoch': 0.49}


 50%|████▉     | 139500/281800 [3:29:45<2:37:53, 15.02it/s]

{'loss': 0.5014, 'grad_norm': 1.0132899284362793, 'learning_rate': 1.010376153300213e-05, 'epoch': 0.5}


 50%|████▉     | 140000/281800 [3:30:20<2:42:26, 14.55it/s]

{'loss': 0.4975, 'grad_norm': 0.9268269538879395, 'learning_rate': 1.0068275372604686e-05, 'epoch': 0.5}


 50%|████▉     | 140500/281800 [3:30:55<2:36:42, 15.03it/s]

{'loss': 0.4955, 'grad_norm': 0.8846341967582703, 'learning_rate': 1.0032860184528034e-05, 'epoch': 0.5}


 50%|█████     | 141000/281800 [3:31:29<2:35:45, 15.07it/s]

{'loss': 0.4995, 'grad_norm': 1.0100823640823364, 'learning_rate': 9.997374024130589e-06, 'epoch': 0.5}


 50%|█████     | 141500/281800 [3:32:03<2:38:03, 14.79it/s]

{'loss': 0.5058, 'grad_norm': 1.1958889961242676, 'learning_rate': 9.961887863733145e-06, 'epoch': 0.5}


 50%|█████     | 142000/281800 [3:32:38<2:39:57, 14.57it/s]

{'loss': 0.4962, 'grad_norm': 0.9340690970420837, 'learning_rate': 9.9264017033357e-06, 'epoch': 0.5}


 51%|█████     | 142500/281800 [3:33:12<2:34:20, 15.04it/s]

{'loss': 0.4966, 'grad_norm': 1.21627938747406, 'learning_rate': 9.89098651525905e-06, 'epoch': 0.51}


 51%|█████     | 143000/281800 [3:33:47<2:34:37, 14.96it/s]

{'loss': 0.5005, 'grad_norm': 0.9595435857772827, 'learning_rate': 9.855500354861605e-06, 'epoch': 0.51}


 51%|█████     | 143500/281800 [3:34:21<2:35:58, 14.78it/s]

{'loss': 0.4913, 'grad_norm': 1.0705207586288452, 'learning_rate': 9.820014194464161e-06, 'epoch': 0.51}


 51%|█████     | 144000/281800 [3:34:56<2:33:16, 14.98it/s]

{'loss': 0.4997, 'grad_norm': 0.9133467078208923, 'learning_rate': 9.784528034066716e-06, 'epoch': 0.51}


 51%|█████▏    | 144500/281800 [3:35:30<2:31:54, 15.06it/s]

{'loss': 0.4987, 'grad_norm': 1.0928128957748413, 'learning_rate': 9.749112845990064e-06, 'epoch': 0.51}


 51%|█████▏    | 145000/281800 [3:36:05<2:32:10, 14.98it/s]

{'loss': 0.4988, 'grad_norm': 1.0845280885696411, 'learning_rate': 9.71362668559262e-06, 'epoch': 0.51}


 52%|█████▏    | 145500/281800 [3:36:40<2:35:18, 14.63it/s]

{'loss': 0.4942, 'grad_norm': 1.1662702560424805, 'learning_rate': 9.678140525195175e-06, 'epoch': 0.52}


 52%|█████▏    | 146000/281800 [3:37:14<2:31:00, 14.99it/s]

{'loss': 0.4939, 'grad_norm': 0.8356581330299377, 'learning_rate': 9.64265436479773e-06, 'epoch': 0.52}


 52%|█████▏    | 146500/281800 [3:37:48<2:27:32, 15.28it/s]

{'loss': 0.4921, 'grad_norm': 1.2203539609909058, 'learning_rate': 9.607168204400284e-06, 'epoch': 0.52}


 52%|█████▏    | 147000/281800 [3:38:23<2:32:15, 14.75it/s]

{'loss': 0.4977, 'grad_norm': 0.9817522168159485, 'learning_rate': 9.571753016323635e-06, 'epoch': 0.52}


 52%|█████▏    | 147500/281800 [3:38:57<2:31:05, 14.81it/s]

{'loss': 0.5067, 'grad_norm': 1.0379531383514404, 'learning_rate': 9.53626685592619e-06, 'epoch': 0.52}


 53%|█████▎    | 148000/281800 [3:39:32<2:28:55, 14.97it/s]

{'loss': 0.4975, 'grad_norm': 0.9128932356834412, 'learning_rate': 9.500780695528744e-06, 'epoch': 0.53}


 53%|█████▎    | 148500/281800 [3:40:07<2:30:40, 14.74it/s]

{'loss': 0.4952, 'grad_norm': 0.8741190433502197, 'learning_rate': 9.4652945351313e-06, 'epoch': 0.53}


 53%|█████▎    | 149000/281800 [3:40:41<2:28:31, 14.90it/s]

{'loss': 0.4913, 'grad_norm': 0.8895214200019836, 'learning_rate': 9.429879347054649e-06, 'epoch': 0.53}


 53%|█████▎    | 149500/281800 [3:41:16<2:25:24, 15.17it/s]

{'loss': 0.5002, 'grad_norm': 1.3293147087097168, 'learning_rate': 9.394393186657205e-06, 'epoch': 0.53}


 53%|█████▎    | 150000/281800 [3:41:50<2:28:23, 14.80it/s]

{'loss': 0.495, 'grad_norm': 0.9944522380828857, 'learning_rate': 9.35890702625976e-06, 'epoch': 0.53}


 53%|█████▎    | 150500/281800 [3:42:25<2:29:35, 14.63it/s]

{'loss': 0.4974, 'grad_norm': 1.1353681087493896, 'learning_rate': 9.323420865862314e-06, 'epoch': 0.53}


 54%|█████▎    | 151000/281800 [3:43:00<2:25:07, 15.02it/s]

{'loss': 0.4887, 'grad_norm': 0.9515295028686523, 'learning_rate': 9.288005677785665e-06, 'epoch': 0.54}


 54%|█████▍    | 151500/281800 [3:43:34<2:23:43, 15.11it/s]

{'loss': 0.4993, 'grad_norm': 1.1523629426956177, 'learning_rate': 9.25251951738822e-06, 'epoch': 0.54}


 54%|█████▍    | 152000/281800 [3:44:08<2:25:24, 14.88it/s]

{'loss': 0.4988, 'grad_norm': 1.278090000152588, 'learning_rate': 9.217033356990774e-06, 'epoch': 0.54}


 54%|█████▍    | 152500/281800 [3:44:43<2:24:10, 14.95it/s]

{'loss': 0.4995, 'grad_norm': 1.0840407609939575, 'learning_rate': 9.181547196593329e-06, 'epoch': 0.54}


 54%|█████▍    | 153000/281800 [3:45:18<2:23:07, 15.00it/s]

{'loss': 0.4974, 'grad_norm': 1.1595999002456665, 'learning_rate': 9.146132008516679e-06, 'epoch': 0.54}


 54%|█████▍    | 153500/281800 [3:45:52<2:20:25, 15.23it/s]

{'loss': 0.4857, 'grad_norm': 0.855664849281311, 'learning_rate': 9.110645848119233e-06, 'epoch': 0.54}


 55%|█████▍    | 154000/281800 [3:46:27<2:27:36, 14.43it/s]

{'loss': 0.4872, 'grad_norm': 1.0449118614196777, 'learning_rate': 9.07515968772179e-06, 'epoch': 0.55}


 55%|█████▍    | 154500/281800 [3:47:01<2:22:50, 14.85it/s]

{'loss': 0.4965, 'grad_norm': 1.0819979906082153, 'learning_rate': 9.039673527324344e-06, 'epoch': 0.55}


 55%|█████▌    | 155000/281800 [3:47:36<2:24:46, 14.60it/s]

{'loss': 0.492, 'grad_norm': 0.9851181507110596, 'learning_rate': 9.004258339247695e-06, 'epoch': 0.55}


 55%|█████▌    | 155500/281800 [3:48:10<2:20:44, 14.96it/s]

{'loss': 0.4867, 'grad_norm': 1.1423693895339966, 'learning_rate': 8.96877217885025e-06, 'epoch': 0.55}


 55%|█████▌    | 156000/281800 [3:48:45<2:21:56, 14.77it/s]

{'loss': 0.4906, 'grad_norm': 0.8947198390960693, 'learning_rate': 8.933286018452804e-06, 'epoch': 0.55}


 56%|█████▌    | 156500/281800 [3:49:19<2:18:31, 15.08it/s]

{'loss': 0.4827, 'grad_norm': 1.111653208732605, 'learning_rate': 8.897799858055359e-06, 'epoch': 0.56}


 56%|█████▌    | 157000/281800 [3:49:54<2:25:07, 14.33it/s]

{'loss': 0.4886, 'grad_norm': 0.9578570127487183, 'learning_rate': 8.862384669978709e-06, 'epoch': 0.56}


 56%|█████▌    | 157500/281800 [3:50:29<2:21:54, 14.60it/s]

{'loss': 0.4968, 'grad_norm': 1.1704623699188232, 'learning_rate': 8.826898509581265e-06, 'epoch': 0.56}


 56%|█████▌    | 158000/281800 [3:51:04<2:17:28, 15.01it/s]

{'loss': 0.4923, 'grad_norm': 0.8322632312774658, 'learning_rate': 8.791412349183818e-06, 'epoch': 0.56}


 56%|█████▌    | 158500/281800 [3:51:39<2:22:47, 14.39it/s]

{'loss': 0.4961, 'grad_norm': 1.0021755695343018, 'learning_rate': 8.755926188786374e-06, 'epoch': 0.56}


 56%|█████▋    | 159000/281800 [3:52:13<2:16:59, 14.94it/s]

{'loss': 0.4913, 'grad_norm': 0.9712691903114319, 'learning_rate': 8.720511000709725e-06, 'epoch': 0.56}


 57%|█████▋    | 159500/281800 [3:52:48<2:18:03, 14.76it/s]

{'loss': 0.4888, 'grad_norm': 1.0695490837097168, 'learning_rate': 8.68502484031228e-06, 'epoch': 0.57}


 57%|█████▋    | 160000/281800 [3:53:22<2:18:11, 14.69it/s]

{'loss': 0.4916, 'grad_norm': 0.8867224454879761, 'learning_rate': 8.649538679914834e-06, 'epoch': 0.57}


 57%|█████▋    | 160500/281800 [3:53:57<2:15:04, 14.97it/s]

{'loss': 0.4912, 'grad_norm': 0.8945800065994263, 'learning_rate': 8.614052519517389e-06, 'epoch': 0.57}


 57%|█████▋    | 161000/281800 [3:54:31<2:13:42, 15.06it/s]

{'loss': 0.4877, 'grad_norm': 1.1559585332870483, 'learning_rate': 8.578566359119945e-06, 'epoch': 0.57}


 57%|█████▋    | 161500/281800 [3:55:06<2:12:54, 15.09it/s]

{'loss': 0.5079, 'grad_norm': 1.030098557472229, 'learning_rate': 8.543151171043293e-06, 'epoch': 0.57}


 57%|█████▋    | 162000/281800 [3:55:40<2:13:06, 15.00it/s]

{'loss': 0.5027, 'grad_norm': 1.2126127481460571, 'learning_rate': 8.50766501064585e-06, 'epoch': 0.57}


 58%|█████▊    | 162500/281800 [3:56:14<2:14:27, 14.79it/s]

{'loss': 0.4963, 'grad_norm': 0.8962947130203247, 'learning_rate': 8.472178850248404e-06, 'epoch': 0.58}


 58%|█████▊    | 163000/281800 [3:56:49<2:12:52, 14.90it/s]

{'loss': 0.4953, 'grad_norm': 0.937481701374054, 'learning_rate': 8.436692689850959e-06, 'epoch': 0.58}


 58%|█████▊    | 163500/281800 [3:57:24<2:12:52, 14.84it/s]

{'loss': 0.4877, 'grad_norm': 1.1852682828903198, 'learning_rate': 8.40127750177431e-06, 'epoch': 0.58}


 58%|█████▊    | 164000/281800 [3:57:59<2:11:13, 14.96it/s]

{'loss': 0.4946, 'grad_norm': 0.8454204797744751, 'learning_rate': 8.365791341376864e-06, 'epoch': 0.58}


 58%|█████▊    | 164500/281800 [3:58:33<2:16:25, 14.33it/s]

{'loss': 0.4875, 'grad_norm': 0.9222447872161865, 'learning_rate': 8.330305180979419e-06, 'epoch': 0.58}


 59%|█████▊    | 165000/281800 [3:59:08<2:12:45, 14.66it/s]

{'loss': 0.4994, 'grad_norm': 1.071335792541504, 'learning_rate': 8.294819020581973e-06, 'epoch': 0.59}


 59%|█████▊    | 165500/281800 [3:59:42<2:08:05, 15.13it/s]

{'loss': 0.4867, 'grad_norm': 1.0077837705612183, 'learning_rate': 8.259403832505323e-06, 'epoch': 0.59}


 59%|█████▉    | 166000/281800 [4:00:17<2:09:39, 14.89it/s]

{'loss': 0.4946, 'grad_norm': 1.098090648651123, 'learning_rate': 8.223917672107878e-06, 'epoch': 0.59}


 59%|█████▉    | 166500/281800 [4:00:51<2:07:43, 15.05it/s]

{'loss': 0.4986, 'grad_norm': 0.809903621673584, 'learning_rate': 8.188431511710434e-06, 'epoch': 0.59}


 59%|█████▉    | 167000/281800 [4:01:26<2:08:24, 14.90it/s]

{'loss': 0.4876, 'grad_norm': 1.0180526971817017, 'learning_rate': 8.152945351312989e-06, 'epoch': 0.59}


 59%|█████▉    | 167500/281800 [4:02:00<2:06:47, 15.02it/s]

{'loss': 0.4952, 'grad_norm': 0.9376288056373596, 'learning_rate': 8.11753016323634e-06, 'epoch': 0.59}


 60%|█████▉    | 168000/281800 [4:02:35<2:07:48, 14.84it/s]

{'loss': 0.4999, 'grad_norm': 1.1468162536621094, 'learning_rate': 8.082044002838894e-06, 'epoch': 0.6}


 60%|█████▉    | 168500/281800 [4:03:09<2:07:48, 14.78it/s]

{'loss': 0.482, 'grad_norm': 1.0732921361923218, 'learning_rate': 8.046557842441449e-06, 'epoch': 0.6}


 60%|█████▉    | 169000/281800 [4:03:44<2:07:03, 14.80it/s]

{'loss': 0.4837, 'grad_norm': 1.099340796470642, 'learning_rate': 8.011071682044003e-06, 'epoch': 0.6}


 60%|██████    | 169500/281800 [4:04:19<2:09:15, 14.48it/s]

{'loss': 0.4986, 'grad_norm': 1.0023845434188843, 'learning_rate': 7.975656493967353e-06, 'epoch': 0.6}


 60%|██████    | 170000/281800 [4:04:53<2:04:56, 14.91it/s]

{'loss': 0.4913, 'grad_norm': 0.9799073338508606, 'learning_rate': 7.940170333569908e-06, 'epoch': 0.6}


 61%|██████    | 170500/281800 [4:05:28<2:03:41, 15.00it/s]

{'loss': 0.4928, 'grad_norm': 1.2228964567184448, 'learning_rate': 7.904684173172463e-06, 'epoch': 0.61}


 61%|██████    | 171000/281800 [4:06:02<2:03:29, 14.95it/s]

{'loss': 0.4861, 'grad_norm': 0.861712634563446, 'learning_rate': 7.869198012775019e-06, 'epoch': 0.61}


 61%|██████    | 171500/281800 [4:06:37<2:04:59, 14.71it/s]

{'loss': 0.4903, 'grad_norm': 1.0154271125793457, 'learning_rate': 7.833782824698367e-06, 'epoch': 0.61}


 61%|██████    | 172000/281800 [4:07:11<2:02:23, 14.95it/s]

{'loss': 0.4952, 'grad_norm': 1.0246634483337402, 'learning_rate': 7.79836763662172e-06, 'epoch': 0.61}


 61%|██████    | 172500/281800 [4:07:46<2:02:02, 14.93it/s]

{'loss': 0.4877, 'grad_norm': 0.8419238328933716, 'learning_rate': 7.762881476224274e-06, 'epoch': 0.61}


 61%|██████▏   | 173000/281800 [4:08:21<2:02:43, 14.78it/s]

{'loss': 0.4996, 'grad_norm': 1.036338448524475, 'learning_rate': 7.727395315826827e-06, 'epoch': 0.61}


 62%|██████▏   | 173500/281800 [4:08:56<2:02:14, 14.77it/s]

{'loss': 0.4942, 'grad_norm': 1.0315300226211548, 'learning_rate': 7.691909155429383e-06, 'epoch': 0.62}


 62%|██████▏   | 174000/281800 [4:09:30<2:01:39, 14.77it/s]

{'loss': 0.4869, 'grad_norm': 1.2021452188491821, 'learning_rate': 7.656493967352733e-06, 'epoch': 0.62}


 62%|██████▏   | 174500/281800 [4:10:05<2:01:10, 14.76it/s]

{'loss': 0.4887, 'grad_norm': 0.8872093558311462, 'learning_rate': 7.621078779276084e-06, 'epoch': 0.62}


 62%|██████▏   | 175000/281800 [4:10:40<2:01:59, 14.59it/s]

{'loss': 0.491, 'grad_norm': 0.9283515810966492, 'learning_rate': 7.585592618878637e-06, 'epoch': 0.62}


 62%|██████▏   | 175500/281800 [4:11:14<2:00:24, 14.71it/s]

{'loss': 0.4984, 'grad_norm': 1.1555081605911255, 'learning_rate': 7.550106458481194e-06, 'epoch': 0.62}


 62%|██████▏   | 176000/281800 [4:11:49<1:59:35, 14.74it/s]

{'loss': 0.4986, 'grad_norm': 1.4134832620620728, 'learning_rate': 7.514620298083748e-06, 'epoch': 0.62}


 63%|██████▎   | 176500/281800 [4:12:23<2:00:39, 14.54it/s]

{'loss': 0.4901, 'grad_norm': 1.002585530281067, 'learning_rate': 7.479134137686302e-06, 'epoch': 0.63}


 63%|██████▎   | 177000/281800 [4:12:58<2:00:32, 14.49it/s]

{'loss': 0.4975, 'grad_norm': 1.0050771236419678, 'learning_rate': 7.443647977288858e-06, 'epoch': 0.63}


 63%|██████▎   | 177500/281800 [4:13:33<1:56:52, 14.87it/s]

{'loss': 0.4993, 'grad_norm': 1.085963249206543, 'learning_rate': 7.4081618168914124e-06, 'epoch': 0.63}


 63%|██████▎   | 178000/281800 [4:14:08<1:58:33, 14.59it/s]

{'loss': 0.4921, 'grad_norm': 1.041619896888733, 'learning_rate': 7.372675656493968e-06, 'epoch': 0.63}


 63%|██████▎   | 178500/281800 [4:14:43<1:57:11, 14.69it/s]

{'loss': 0.4942, 'grad_norm': 1.0738052129745483, 'learning_rate': 7.337189496096523e-06, 'epoch': 0.63}


 64%|██████▎   | 179000/281800 [4:15:18<1:55:53, 14.78it/s]

{'loss': 0.4966, 'grad_norm': 1.108578085899353, 'learning_rate': 7.301703335699077e-06, 'epoch': 0.64}


 64%|██████▎   | 179500/281800 [4:15:52<1:54:14, 14.92it/s]

{'loss': 0.502, 'grad_norm': 1.0161075592041016, 'learning_rate': 7.266217175301633e-06, 'epoch': 0.64}


 64%|██████▍   | 180000/281800 [4:16:27<1:56:32, 14.56it/s]

{'loss': 0.5031, 'grad_norm': 0.897552490234375, 'learning_rate': 7.2307310149041875e-06, 'epoch': 0.64}


 64%|██████▍   | 180500/281800 [4:17:02<1:54:15, 14.78it/s]

{'loss': 0.4958, 'grad_norm': 0.7955353856086731, 'learning_rate': 7.195244854506743e-06, 'epoch': 0.64}


 64%|██████▍   | 181000/281800 [4:17:36<1:54:14, 14.70it/s]

{'loss': 0.4906, 'grad_norm': 1.2268191576004028, 'learning_rate': 7.159829666430092e-06, 'epoch': 0.64}


 64%|██████▍   | 181500/281800 [4:18:11<1:53:09, 14.77it/s]

{'loss': 0.5044, 'grad_norm': 1.2014262676239014, 'learning_rate': 7.124343506032649e-06, 'epoch': 0.64}


 65%|██████▍   | 182000/281800 [4:18:46<1:52:26, 14.79it/s]

{'loss': 0.4938, 'grad_norm': 0.9233160614967346, 'learning_rate': 7.0888573456352025e-06, 'epoch': 0.65}


 65%|██████▍   | 182500/281800 [4:19:21<1:50:25, 14.99it/s]

{'loss': 0.4899, 'grad_norm': 1.068897008895874, 'learning_rate': 7.0534421575585535e-06, 'epoch': 0.65}


 65%|██████▍   | 183000/281800 [4:19:56<1:50:40, 14.88it/s]

{'loss': 0.4915, 'grad_norm': 0.9914959073066711, 'learning_rate': 7.017955997161108e-06, 'epoch': 0.65}


 65%|██████▌   | 183500/281800 [4:20:30<1:51:54, 14.64it/s]

{'loss': 0.4865, 'grad_norm': 0.8656782507896423, 'learning_rate': 6.982469836763662e-06, 'epoch': 0.65}


 65%|██████▌   | 184000/281800 [4:21:13<2:32:30, 10.69it/s]

{'loss': 0.4845, 'grad_norm': 1.1370301246643066, 'learning_rate': 6.946983676366218e-06, 'epoch': 0.65}


 65%|██████▌   | 184500/281800 [4:22:06<2:55:02,  9.26it/s]

{'loss': 0.4911, 'grad_norm': 1.2191513776779175, 'learning_rate': 6.911497515968772e-06, 'epoch': 0.65}


 66%|██████▌   | 185000/281800 [4:22:58<2:45:54,  9.72it/s] 

{'loss': 0.4897, 'grad_norm': 1.0747218132019043, 'learning_rate': 6.8760113555713285e-06, 'epoch': 0.66}


 66%|██████▌   | 185500/281800 [4:23:50<2:41:04,  9.96it/s]

{'loss': 0.4878, 'grad_norm': 1.0860369205474854, 'learning_rate': 6.840525195173882e-06, 'epoch': 0.66}


 66%|██████▌   | 186000/281800 [4:24:41<2:38:58, 10.04it/s]

{'loss': 0.4928, 'grad_norm': 1.0256175994873047, 'learning_rate': 6.805110007097233e-06, 'epoch': 0.66}


 66%|██████▌   | 186500/281800 [4:25:32<2:35:22, 10.22it/s]

{'loss': 0.4904, 'grad_norm': 0.7541542053222656, 'learning_rate': 6.769623846699788e-06, 'epoch': 0.66}


 66%|██████▋   | 187000/281800 [4:26:23<2:56:11,  8.97it/s]

{'loss': 0.4921, 'grad_norm': 0.9717622995376587, 'learning_rate': 6.734137686302342e-06, 'epoch': 0.66}


 67%|██████▋   | 187500/281800 [4:27:15<2:30:21, 10.45it/s] 

{'loss': 0.4899, 'grad_norm': 0.9810792207717896, 'learning_rate': 6.698651525904898e-06, 'epoch': 0.67}


 67%|██████▋   | 188000/281800 [4:28:06<2:38:12,  9.88it/s]

{'loss': 0.4869, 'grad_norm': 0.8495543599128723, 'learning_rate': 6.663165365507453e-06, 'epoch': 0.67}


 67%|██████▋   | 188500/281800 [4:28:58<2:32:59, 10.16it/s]

{'loss': 0.4965, 'grad_norm': 1.0324543714523315, 'learning_rate': 6.627679205110008e-06, 'epoch': 0.67}


 67%|██████▋   | 189000/281800 [4:29:52<2:38:22,  9.77it/s]

{'loss': 0.4909, 'grad_norm': 0.7673303484916687, 'learning_rate': 6.592193044712563e-06, 'epoch': 0.67}


 67%|██████▋   | 189500/281800 [4:30:46<2:37:40,  9.76it/s]

{'loss': 0.4895, 'grad_norm': 1.0636019706726074, 'learning_rate': 6.556706884315117e-06, 'epoch': 0.67}


 67%|██████▋   | 190000/281800 [4:31:37<2:36:56,  9.75it/s] 

{'loss': 0.4745, 'grad_norm': 1.0385829210281372, 'learning_rate': 6.521291696238468e-06, 'epoch': 0.67}


 68%|██████▊   | 190500/281800 [4:32:29<2:31:44, 10.03it/s] 

{'loss': 0.4835, 'grad_norm': 1.24660062789917, 'learning_rate': 6.4858055358410225e-06, 'epoch': 0.68}


 68%|██████▊   | 191000/281800 [4:33:21<2:30:32, 10.05it/s] 

{'loss': 0.4852, 'grad_norm': 1.133161187171936, 'learning_rate': 6.450319375443578e-06, 'epoch': 0.68}


 68%|██████▊   | 191500/281800 [4:34:14<2:44:25,  9.15it/s]

{'loss': 0.504, 'grad_norm': 1.0319327116012573, 'learning_rate': 6.414833215046133e-06, 'epoch': 0.68}


 68%|██████▊   | 192000/281800 [4:35:05<2:25:50, 10.26it/s] 

{'loss': 0.4849, 'grad_norm': 1.0228122472763062, 'learning_rate': 6.3794889992902774e-06, 'epoch': 0.68}


 68%|██████▊   | 192500/281800 [4:35:57<2:32:43,  9.75it/s]

{'loss': 0.4822, 'grad_norm': 1.100091576576233, 'learning_rate': 6.344002838892832e-06, 'epoch': 0.68}


 68%|██████▊   | 193000/281800 [4:36:51<2:57:20,  8.35it/s]

{'loss': 0.4881, 'grad_norm': 0.9501937627792358, 'learning_rate': 6.308516678495388e-06, 'epoch': 0.68}


 69%|██████▊   | 193500/281800 [4:37:43<2:21:12, 10.42it/s] 

{'loss': 0.4874, 'grad_norm': 1.1338934898376465, 'learning_rate': 6.273030518097942e-06, 'epoch': 0.69}


 69%|██████▉   | 194000/281800 [4:38:35<2:23:34, 10.19it/s]

{'loss': 0.4911, 'grad_norm': 0.8342655897140503, 'learning_rate': 6.237544357700497e-06, 'epoch': 0.69}


 69%|██████▉   | 194500/281800 [4:39:27<2:23:36, 10.13it/s]

{'loss': 0.4853, 'grad_norm': 0.9191175699234009, 'learning_rate': 6.2020581973030525e-06, 'epoch': 0.69}


 69%|██████▉   | 195000/281800 [4:40:20<2:41:03,  8.98it/s]

{'loss': 0.489, 'grad_norm': 1.0810602903366089, 'learning_rate': 6.166572036905607e-06, 'epoch': 0.69}


 69%|██████▉   | 195500/281800 [4:41:13<2:17:08, 10.49it/s] 

{'loss': 0.4922, 'grad_norm': 0.96163010597229, 'learning_rate': 6.131085876508163e-06, 'epoch': 0.69}


 70%|██████▉   | 196000/281800 [4:42:04<2:22:56, 10.00it/s]

{'loss': 0.4912, 'grad_norm': 1.2520089149475098, 'learning_rate': 6.095599716110717e-06, 'epoch': 0.7}


 70%|██████▉   | 196500/281800 [4:42:54<2:19:58, 10.16it/s]

{'loss': 0.4862, 'grad_norm': 0.7703334093093872, 'learning_rate': 6.0601845280340675e-06, 'epoch': 0.7}


 70%|██████▉   | 197000/281800 [4:43:48<2:17:02, 10.31it/s]

{'loss': 0.4977, 'grad_norm': 1.1233532428741455, 'learning_rate': 6.024698367636622e-06, 'epoch': 0.7}


 70%|███████   | 197500/281800 [4:44:41<2:14:34, 10.44it/s]

{'loss': 0.5061, 'grad_norm': 1.3457019329071045, 'learning_rate': 5.989212207239177e-06, 'epoch': 0.7}


 70%|███████   | 198000/281800 [4:45:32<2:18:23, 10.09it/s]

{'loss': 0.4882, 'grad_norm': 1.0982604026794434, 'learning_rate': 5.953726046841732e-06, 'epoch': 0.7}


 70%|███████   | 198500/281800 [4:46:26<2:17:15, 10.11it/s]

{'loss': 0.489, 'grad_norm': 1.1183363199234009, 'learning_rate': 5.918310858765082e-06, 'epoch': 0.7}


 71%|███████   | 199000/281800 [4:47:18<2:14:14, 10.28it/s]

{'loss': 0.4876, 'grad_norm': 0.9888907670974731, 'learning_rate': 5.882824698367637e-06, 'epoch': 0.71}


 71%|███████   | 199500/281800 [4:48:09<2:16:09, 10.07it/s]

{'loss': 0.4896, 'grad_norm': 0.9997544288635254, 'learning_rate': 5.847338537970192e-06, 'epoch': 0.71}


 71%|███████   | 200000/281800 [4:49:00<2:14:58, 10.10it/s]

{'loss': 0.4882, 'grad_norm': 1.1249659061431885, 'learning_rate': 5.811852377572747e-06, 'epoch': 0.71}


 71%|███████   | 200500/281800 [4:49:52<2:11:00, 10.34it/s]

{'loss': 0.482, 'grad_norm': 1.2612625360488892, 'learning_rate': 5.776437189496097e-06, 'epoch': 0.71}


 71%|███████▏  | 201000/281800 [4:50:43<2:11:35, 10.23it/s]

{'loss': 0.487, 'grad_norm': 1.1702684164047241, 'learning_rate': 5.740951029098653e-06, 'epoch': 0.71}


 72%|███████▏  | 201500/281800 [4:51:34<2:18:35,  9.66it/s]

{'loss': 0.4877, 'grad_norm': 1.0410966873168945, 'learning_rate': 5.705464868701207e-06, 'epoch': 0.72}


 72%|███████▏  | 202000/281800 [4:52:25<2:07:39, 10.42it/s]

{'loss': 0.4884, 'grad_norm': 1.1388792991638184, 'learning_rate': 5.6699787083037615e-06, 'epoch': 0.72}


 72%|███████▏  | 202500/281800 [4:53:15<2:08:10, 10.31it/s]

{'loss': 0.4804, 'grad_norm': 0.8896975517272949, 'learning_rate': 5.634492547906317e-06, 'epoch': 0.72}


 72%|███████▏  | 203000/281800 [4:54:06<2:07:11, 10.33it/s]

{'loss': 0.4906, 'grad_norm': 1.1253098249435425, 'learning_rate': 5.599077359829666e-06, 'epoch': 0.72}


 72%|███████▏  | 203500/281800 [4:54:58<2:08:56, 10.12it/s]

{'loss': 0.4986, 'grad_norm': 0.8513559699058533, 'learning_rate': 5.563591199432223e-06, 'epoch': 0.72}


 72%|███████▏  | 204000/281800 [4:55:49<2:07:37, 10.16it/s]

{'loss': 0.4842, 'grad_norm': 1.0441374778747559, 'learning_rate': 5.5281050390347764e-06, 'epoch': 0.72}


 73%|███████▎  | 204500/281800 [4:56:39<2:04:07, 10.38it/s]

{'loss': 0.491, 'grad_norm': 0.9586490988731384, 'learning_rate': 5.492618878637331e-06, 'epoch': 0.73}


 73%|███████▎  | 205000/281800 [4:57:33<2:06:39, 10.11it/s]

{'loss': 0.4805, 'grad_norm': 1.0391017198562622, 'learning_rate': 5.457203690560682e-06, 'epoch': 0.73}


 73%|███████▎  | 205500/281800 [4:58:25<2:16:51,  9.29it/s]

{'loss': 0.4925, 'grad_norm': 0.8829219937324524, 'learning_rate': 5.421717530163236e-06, 'epoch': 0.73}


 73%|███████▎  | 206000/281800 [4:59:17<2:02:10, 10.34it/s]

{'loss': 0.4947, 'grad_norm': 1.1503103971481323, 'learning_rate': 5.386231369765792e-06, 'epoch': 0.73}


 73%|███████▎  | 206500/281800 [5:00:09<2:18:12,  9.08it/s]

{'loss': 0.4957, 'grad_norm': 1.1005038022994995, 'learning_rate': 5.350745209368346e-06, 'epoch': 0.73}


 73%|███████▎  | 207000/281800 [5:01:00<2:10:38,  9.54it/s]

{'loss': 0.4755, 'grad_norm': 1.1478980779647827, 'learning_rate': 5.315330021291697e-06, 'epoch': 0.73}


 74%|███████▎  | 207500/281800 [5:01:52<2:04:31,  9.94it/s]

{'loss': 0.4881, 'grad_norm': 1.0314269065856934, 'learning_rate': 5.279843860894252e-06, 'epoch': 0.74}


 74%|███████▍  | 208000/281800 [5:02:43<2:01:16, 10.14it/s]

{'loss': 0.4867, 'grad_norm': 0.9091407060623169, 'learning_rate': 5.244357700496807e-06, 'epoch': 0.74}


 74%|███████▍  | 208500/281800 [5:03:34<2:00:05, 10.17it/s]

{'loss': 0.4878, 'grad_norm': 1.2221392393112183, 'learning_rate': 5.208871540099362e-06, 'epoch': 0.74}


 74%|███████▍  | 209000/281800 [5:04:26<1:52:40, 10.77it/s]

{'loss': 0.4906, 'grad_norm': 1.231668472290039, 'learning_rate': 5.173456352022711e-06, 'epoch': 0.74}


 74%|███████▍  | 209500/281800 [5:05:16<1:58:29, 10.17it/s]

{'loss': 0.482, 'grad_norm': 1.0292792320251465, 'learning_rate': 5.137970191625267e-06, 'epoch': 0.74}


 75%|███████▍  | 210000/281800 [5:06:07<1:51:56, 10.69it/s]

{'loss': 0.4716, 'grad_norm': 1.2424081563949585, 'learning_rate': 5.102484031227821e-06, 'epoch': 0.75}


 75%|███████▍  | 210500/281800 [5:06:56<1:55:00, 10.33it/s]

{'loss': 0.4878, 'grad_norm': 0.9564805030822754, 'learning_rate': 5.066997870830377e-06, 'epoch': 0.75}


 75%|███████▍  | 211000/281800 [5:07:48<1:55:10, 10.24it/s]

{'loss': 0.4933, 'grad_norm': 1.0554171800613403, 'learning_rate': 5.031582682753726e-06, 'epoch': 0.75}


 75%|███████▌  | 211500/281800 [5:08:40<1:56:28, 10.06it/s]

{'loss': 0.4899, 'grad_norm': 0.9345956444740295, 'learning_rate': 4.996096522356281e-06, 'epoch': 0.75}


 75%|███████▌  | 212000/281800 [5:09:31<1:55:23, 10.08it/s]

{'loss': 0.4883, 'grad_norm': 1.0345593690872192, 'learning_rate': 4.960610361958836e-06, 'epoch': 0.75}


 75%|███████▌  | 212500/281800 [5:10:23<1:55:15, 10.02it/s]

{'loss': 0.4931, 'grad_norm': 1.1477770805358887, 'learning_rate': 4.925124201561391e-06, 'epoch': 0.75}


 76%|███████▌  | 213000/281800 [5:11:14<1:52:11, 10.22it/s]

{'loss': 0.4875, 'grad_norm': 0.9389874935150146, 'learning_rate': 4.889709013484741e-06, 'epoch': 0.76}


 76%|███████▌  | 213500/281800 [5:12:05<1:51:16, 10.23it/s]

{'loss': 0.5032, 'grad_norm': 1.0270284414291382, 'learning_rate': 4.854222853087297e-06, 'epoch': 0.76}


 76%|███████▌  | 214000/281800 [5:12:55<1:49:11, 10.35it/s]

{'loss': 0.4857, 'grad_norm': 1.0756429433822632, 'learning_rate': 4.818736692689851e-06, 'epoch': 0.76}


 76%|███████▌  | 214500/281800 [5:13:46<1:49:24, 10.25it/s]

{'loss': 0.4815, 'grad_norm': 1.2060970067977905, 'learning_rate': 4.783250532292406e-06, 'epoch': 0.76}


 76%|███████▋  | 215000/281800 [5:14:36<1:51:51,  9.95it/s]

{'loss': 0.4829, 'grad_norm': 0.8463547825813293, 'learning_rate': 4.747835344215756e-06, 'epoch': 0.76}


 76%|███████▋  | 215500/281800 [5:15:27<1:54:09,  9.68it/s]

{'loss': 0.4851, 'grad_norm': 0.9084623456001282, 'learning_rate': 4.712349183818311e-06, 'epoch': 0.76}


 77%|███████▋  | 216000/281800 [5:16:18<1:54:54,  9.54it/s]

{'loss': 0.4956, 'grad_norm': 0.8222469687461853, 'learning_rate': 4.676863023420866e-06, 'epoch': 0.77}


 77%|███████▋  | 216500/281800 [5:17:09<1:53:38,  9.58it/s]

{'loss': 0.4868, 'grad_norm': 1.0165208578109741, 'learning_rate': 4.641376863023421e-06, 'epoch': 0.77}


 77%|███████▋  | 217000/281800 [5:18:01<1:45:19, 10.25it/s]

{'loss': 0.4853, 'grad_norm': 0.8309493064880371, 'learning_rate': 4.605890702625977e-06, 'epoch': 0.77}


 77%|███████▋  | 217500/281800 [5:18:54<1:50:19,  9.71it/s]

{'loss': 0.4935, 'grad_norm': 1.1362640857696533, 'learning_rate': 4.570475514549326e-06, 'epoch': 0.77}


 77%|███████▋  | 218000/281800 [5:19:46<1:51:41,  9.52it/s]

{'loss': 0.4845, 'grad_norm': 1.125994086265564, 'learning_rate': 4.534989354151881e-06, 'epoch': 0.77}


 78%|███████▊  | 218500/281800 [5:20:37<1:38:04, 10.76it/s]

{'loss': 0.4895, 'grad_norm': 0.939311683177948, 'learning_rate': 4.499503193754436e-06, 'epoch': 0.78}


 78%|███████▊  | 219000/281800 [5:21:27<1:35:27, 10.96it/s]

{'loss': 0.4926, 'grad_norm': 1.3960539102554321, 'learning_rate': 4.464017033356991e-06, 'epoch': 0.78}


 78%|███████▊  | 219500/281800 [5:22:18<1:39:28, 10.44it/s]

{'loss': 0.4859, 'grad_norm': 1.0905381441116333, 'learning_rate': 4.428601845280341e-06, 'epoch': 0.78}


 78%|███████▊  | 220000/281800 [5:23:08<1:37:25, 10.57it/s]

{'loss': 0.4914, 'grad_norm': 0.9583627581596375, 'learning_rate': 4.393115684882896e-06, 'epoch': 0.78}


 78%|███████▊  | 220500/281800 [5:23:58<1:37:34, 10.47it/s]

{'loss': 0.4839, 'grad_norm': 1.261742353439331, 'learning_rate': 4.357629524485451e-06, 'epoch': 0.78}


 78%|███████▊  | 221000/281800 [5:24:48<1:35:42, 10.59it/s]

{'loss': 0.4903, 'grad_norm': 0.9983300566673279, 'learning_rate': 4.3221433640880066e-06, 'epoch': 0.78}


 79%|███████▊  | 221500/281800 [5:25:39<1:39:45, 10.07it/s]

{'loss': 0.4854, 'grad_norm': 0.9078107476234436, 'learning_rate': 4.286657203690561e-06, 'epoch': 0.79}


 79%|███████▉  | 222000/281800 [5:26:29<1:34:43, 10.52it/s]

{'loss': 0.4825, 'grad_norm': 1.1972312927246094, 'learning_rate': 4.251242015613911e-06, 'epoch': 0.79}


 79%|███████▉  | 222500/281800 [5:27:19<1:36:16, 10.27it/s]

{'loss': 0.5008, 'grad_norm': 0.8908204436302185, 'learning_rate': 4.215755855216466e-06, 'epoch': 0.79}


 79%|███████▉  | 223000/281800 [5:28:12<1:40:50,  9.72it/s]

{'loss': 0.4833, 'grad_norm': 0.9810956716537476, 'learning_rate': 4.180269694819021e-06, 'epoch': 0.79}


 79%|███████▉  | 223500/281800 [5:29:03<1:34:20, 10.30it/s]

{'loss': 0.4932, 'grad_norm': 1.1664276123046875, 'learning_rate': 4.144783534421576e-06, 'epoch': 0.79}


 79%|███████▉  | 224000/281800 [5:29:55<1:30:53, 10.60it/s]

{'loss': 0.4909, 'grad_norm': 1.1883950233459473, 'learning_rate': 4.1093683463449255e-06, 'epoch': 0.79}


 80%|███████▉  | 224500/281800 [5:30:47<1:49:27,  8.72it/s]

{'loss': 0.4858, 'grad_norm': 1.208227515220642, 'learning_rate': 4.073882185947481e-06, 'epoch': 0.8}


 80%|███████▉  | 225000/281800 [5:31:39<1:32:50, 10.20it/s]

{'loss': 0.4872, 'grad_norm': 0.9184560179710388, 'learning_rate': 4.038396025550036e-06, 'epoch': 0.8}


 80%|████████  | 225500/281800 [5:32:31<1:35:12,  9.86it/s]

{'loss': 0.497, 'grad_norm': 1.106429934501648, 'learning_rate': 4.002909865152591e-06, 'epoch': 0.8}


 80%|████████  | 226000/281800 [5:33:22<1:38:57,  9.40it/s]

{'loss': 0.4926, 'grad_norm': 0.9947997331619263, 'learning_rate': 3.967494677075941e-06, 'epoch': 0.8}


 80%|████████  | 226500/281800 [5:34:14<1:28:31, 10.41it/s]

{'loss': 0.4891, 'grad_norm': 0.9665486216545105, 'learning_rate': 3.932008516678495e-06, 'epoch': 0.8}


 81%|████████  | 227000/281800 [5:35:05<1:29:23, 10.22it/s]

{'loss': 0.4832, 'grad_norm': 0.9978374242782593, 'learning_rate': 3.896522356281051e-06, 'epoch': 0.81}


 81%|████████  | 227500/281800 [5:35:55<1:34:18,  9.60it/s]

{'loss': 0.4845, 'grad_norm': 0.8439867496490479, 'learning_rate': 3.861036195883605e-06, 'epoch': 0.81}


 81%|████████  | 228000/281800 [5:36:48<1:31:54,  9.76it/s]

{'loss': 0.4945, 'grad_norm': 1.094775676727295, 'learning_rate': 3.825550035486161e-06, 'epoch': 0.81}


 81%|████████  | 228500/281800 [5:37:40<1:30:55,  9.77it/s]

{'loss': 0.5067, 'grad_norm': 0.9724471569061279, 'learning_rate': 3.7901348474095106e-06, 'epoch': 0.81}


 81%|████████▏ | 229000/281800 [5:38:31<1:26:43, 10.15it/s]

{'loss': 0.4842, 'grad_norm': 1.2060532569885254, 'learning_rate': 3.7546486870120657e-06, 'epoch': 0.81}


 81%|████████▏ | 229500/281800 [5:39:22<1:27:16,  9.99it/s]

{'loss': 0.489, 'grad_norm': 1.1690700054168701, 'learning_rate': 3.7191625266146208e-06, 'epoch': 0.81}


 82%|████████▏ | 230000/281800 [5:40:13<1:28:06,  9.80it/s]

{'loss': 0.4908, 'grad_norm': 0.9828668236732483, 'learning_rate': 3.6836763662171755e-06, 'epoch': 0.82}


 82%|████████▏ | 230500/281800 [5:41:05<1:24:28, 10.12it/s]

{'loss': 0.4879, 'grad_norm': 1.1192076206207275, 'learning_rate': 3.648261178140525e-06, 'epoch': 0.82}


 82%|████████▏ | 231000/281800 [5:41:56<1:22:11, 10.30it/s]

{'loss': 0.4863, 'grad_norm': 1.0616945028305054, 'learning_rate': 3.6127750177430803e-06, 'epoch': 0.82}


 82%|████████▏ | 231500/281800 [5:42:48<1:24:54,  9.87it/s]

{'loss': 0.4985, 'grad_norm': 1.1291587352752686, 'learning_rate': 3.5772888573456354e-06, 'epoch': 0.82}


 82%|████████▏ | 232000/281800 [5:43:38<1:24:16,  9.85it/s]

{'loss': 0.4842, 'grad_norm': 0.9941397905349731, 'learning_rate': 3.5418026969481904e-06, 'epoch': 0.82}


 83%|████████▎ | 232500/281800 [5:44:30<1:21:15, 10.11it/s]

{'loss': 0.4876, 'grad_norm': 0.9441444277763367, 'learning_rate': 3.5063875088715406e-06, 'epoch': 0.83}


 83%|████████▎ | 233000/281800 [5:45:20<1:15:59, 10.70it/s]

{'loss': 0.4922, 'grad_norm': 0.9254782199859619, 'learning_rate': 3.4709013484740957e-06, 'epoch': 0.83}


 83%|████████▎ | 233500/281800 [5:45:59<54:38, 14.73it/s]  

{'loss': 0.4852, 'grad_norm': 0.8850182294845581, 'learning_rate': 3.4354151880766508e-06, 'epoch': 0.83}


 83%|████████▎ | 234000/281800 [5:46:34<53:44, 14.82it/s]  

{'loss': 0.4806, 'grad_norm': 1.0339523553848267, 'learning_rate': 3.399929027679205e-06, 'epoch': 0.83}


 83%|████████▎ | 234500/281800 [5:47:09<52:34, 15.00it/s]  

{'loss': 0.4827, 'grad_norm': 0.9092094302177429, 'learning_rate': 3.364513839602555e-06, 'epoch': 0.83}


 83%|████████▎ | 235000/281800 [5:47:43<52:35, 14.83it/s]  

{'loss': 0.492, 'grad_norm': 1.018452763557434, 'learning_rate': 3.3290276792051102e-06, 'epoch': 0.83}


 84%|████████▎ | 235500/281800 [5:48:17<52:32, 14.69it/s]  

{'loss': 0.4863, 'grad_norm': 0.9740843176841736, 'learning_rate': 3.2935415188076653e-06, 'epoch': 0.84}


 84%|████████▎ | 236000/281800 [5:48:52<50:36, 15.08it/s]  

{'loss': 0.4801, 'grad_norm': 1.1358964443206787, 'learning_rate': 3.2580553584102204e-06, 'epoch': 0.84}


 84%|████████▍ | 236500/281800 [5:49:27<51:19, 14.71it/s]  

{'loss': 0.4826, 'grad_norm': 1.1904046535491943, 'learning_rate': 3.22264017033357e-06, 'epoch': 0.84}


 84%|████████▍ | 237000/281800 [5:50:01<52:23, 14.25it/s]  

{'loss': 0.4849, 'grad_norm': 1.1011172533035278, 'learning_rate': 3.1871540099361252e-06, 'epoch': 0.84}


 84%|████████▍ | 237500/281800 [5:50:36<49:33, 14.90it/s]  

{'loss': 0.4779, 'grad_norm': 1.0770018100738525, 'learning_rate': 3.1516678495386803e-06, 'epoch': 0.84}


 84%|████████▍ | 238000/281800 [5:51:11<48:55, 14.92it/s]  

{'loss': 0.4907, 'grad_norm': 1.0183286666870117, 'learning_rate': 3.116181689141235e-06, 'epoch': 0.84}


 85%|████████▍ | 238500/281800 [5:51:45<48:25, 14.90it/s]  

{'loss': 0.482, 'grad_norm': 0.9519246816635132, 'learning_rate': 3.08069552874379e-06, 'epoch': 0.85}


 85%|████████▍ | 239000/281800 [5:52:20<47:38, 14.97it/s]  

{'loss': 0.4937, 'grad_norm': 1.1419264078140259, 'learning_rate': 3.04528034066714e-06, 'epoch': 0.85}


 85%|████████▍ | 239500/281800 [5:52:55<47:14, 14.92it/s]  

{'loss': 0.482, 'grad_norm': 0.9971700310707092, 'learning_rate': 3.009794180269695e-06, 'epoch': 0.85}


 85%|████████▌ | 240000/281800 [5:53:29<46:32, 14.97it/s]  

{'loss': 0.4821, 'grad_norm': 1.0402754545211792, 'learning_rate': 2.97430801987225e-06, 'epoch': 0.85}


 85%|████████▌ | 240500/281800 [5:54:03<45:37, 15.09it/s]  

{'loss': 0.4815, 'grad_norm': 1.0374261140823364, 'learning_rate': 2.938821859474805e-06, 'epoch': 0.85}


 86%|████████▌ | 241000/281800 [5:54:38<46:46, 14.54it/s]  

{'loss': 0.4861, 'grad_norm': 0.9310044050216675, 'learning_rate': 2.9034066713981552e-06, 'epoch': 0.86}


 86%|████████▌ | 241500/281800 [5:55:13<45:57, 14.61it/s]  

{'loss': 0.49, 'grad_norm': 1.1371692419052124, 'learning_rate': 2.8679205110007103e-06, 'epoch': 0.86}


 86%|████████▌ | 242000/281800 [5:55:47<43:48, 15.14it/s]  

{'loss': 0.4915, 'grad_norm': 1.0425175428390503, 'learning_rate': 2.832434350603265e-06, 'epoch': 0.86}


 86%|████████▌ | 242500/281800 [5:56:22<43:54, 14.92it/s]  

{'loss': 0.4873, 'grad_norm': 1.0025666952133179, 'learning_rate': 2.79694819020582e-06, 'epoch': 0.86}


 86%|████████▌ | 243000/281800 [5:56:57<43:36, 14.83it/s]  

{'loss': 0.4753, 'grad_norm': 0.9536797404289246, 'learning_rate': 2.761462029808375e-06, 'epoch': 0.86}


 86%|████████▋ | 243500/281800 [5:57:31<43:12, 14.77it/s]  

{'loss': 0.4894, 'grad_norm': 1.1393194198608398, 'learning_rate': 2.7259758694109303e-06, 'epoch': 0.86}


 87%|████████▋ | 244000/281800 [5:58:06<42:12, 14.92it/s]  

{'loss': 0.4948, 'grad_norm': 1.2167068719863892, 'learning_rate': 2.69056068133428e-06, 'epoch': 0.87}


 87%|████████▋ | 244500/281800 [5:58:40<41:45, 14.89it/s]  

{'loss': 0.4901, 'grad_norm': 1.1045559644699097, 'learning_rate': 2.655074520936835e-06, 'epoch': 0.87}


 87%|████████▋ | 245000/281800 [5:59:15<41:13, 14.88it/s]  

{'loss': 0.478, 'grad_norm': 1.0188795328140259, 'learning_rate': 2.6195883605393897e-06, 'epoch': 0.87}


 87%|████████▋ | 245500/281800 [5:59:49<41:23, 14.62it/s]  

{'loss': 0.4973, 'grad_norm': 1.2629308700561523, 'learning_rate': 2.584102200141945e-06, 'epoch': 0.87}


 87%|████████▋ | 246000/281800 [6:00:24<39:43, 15.02it/s]  

{'loss': 0.4908, 'grad_norm': 1.0900989770889282, 'learning_rate': 2.5486870120652945e-06, 'epoch': 0.87}


 87%|████████▋ | 246500/281800 [6:00:58<39:57, 14.73it/s]  

{'loss': 0.4884, 'grad_norm': 0.9710904955863953, 'learning_rate': 2.5132008516678496e-06, 'epoch': 0.87}


 88%|████████▊ | 247000/281800 [6:01:33<39:34, 14.66it/s]  

{'loss': 0.4875, 'grad_norm': 1.1863396167755127, 'learning_rate': 2.4777146912704047e-06, 'epoch': 0.88}


 88%|████████▊ | 247500/281800 [6:02:08<38:40, 14.78it/s]  

{'loss': 0.4867, 'grad_norm': 1.1050565242767334, 'learning_rate': 2.44222853087296e-06, 'epoch': 0.88}


 88%|████████▊ | 248000/281800 [6:02:42<38:31, 14.62it/s]  

{'loss': 0.4818, 'grad_norm': 1.2339357137680054, 'learning_rate': 2.4067423704755145e-06, 'epoch': 0.88}


 88%|████████▊ | 248500/281800 [6:03:17<37:13, 14.91it/s]  

{'loss': 0.482, 'grad_norm': 1.1225672960281372, 'learning_rate': 2.3713271823988646e-06, 'epoch': 0.88}


 88%|████████▊ | 249000/281800 [6:03:52<37:27, 14.59it/s]  

{'loss': 0.4826, 'grad_norm': 1.0198825597763062, 'learning_rate': 2.3358410220014197e-06, 'epoch': 0.88}


 89%|████████▊ | 249500/281800 [6:04:27<37:16, 14.44it/s]  

{'loss': 0.4873, 'grad_norm': 1.3326942920684814, 'learning_rate': 2.300354861603975e-06, 'epoch': 0.89}


 89%|████████▊ | 250000/281800 [6:05:02<36:24, 14.56it/s]  

{'loss': 0.4908, 'grad_norm': 0.9244155287742615, 'learning_rate': 2.2648687012065295e-06, 'epoch': 0.89}


 89%|████████▉ | 250500/281800 [6:05:36<34:40, 15.04it/s]  

{'loss': 0.4965, 'grad_norm': 1.0815836191177368, 'learning_rate': 2.2294535131298796e-06, 'epoch': 0.89}


 89%|████████▉ | 251000/281800 [6:06:11<34:01, 15.08it/s]  

{'loss': 0.4798, 'grad_norm': 1.046034574508667, 'learning_rate': 2.1939673527324347e-06, 'epoch': 0.89}


 89%|████████▉ | 251500/281800 [6:06:46<34:29, 14.64it/s]  

{'loss': 0.4779, 'grad_norm': 1.0306756496429443, 'learning_rate': 2.15848119233499e-06, 'epoch': 0.89}


 89%|████████▉ | 252000/281800 [6:07:20<34:22, 14.45it/s]  

{'loss': 0.4826, 'grad_norm': 0.8593090176582336, 'learning_rate': 2.1229950319375445e-06, 'epoch': 0.89}


 90%|████████▉ | 252500/281800 [6:07:55<33:04, 14.77it/s]  

{'loss': 0.4748, 'grad_norm': 1.1683639287948608, 'learning_rate': 2.0875088715400996e-06, 'epoch': 0.9}


 90%|████████▉ | 253000/281800 [6:08:30<32:12, 14.90it/s]  

{'loss': 0.4873, 'grad_norm': 1.1002625226974487, 'learning_rate': 2.0520936834634493e-06, 'epoch': 0.9}


 90%|████████▉ | 253500/281800 [6:09:04<31:17, 15.07it/s]  

{'loss': 0.4794, 'grad_norm': 1.3211708068847656, 'learning_rate': 2.0166075230660044e-06, 'epoch': 0.9}


 90%|█████████ | 254000/281800 [6:09:39<31:18, 14.80it/s]  

{'loss': 0.4842, 'grad_norm': 1.1203234195709229, 'learning_rate': 1.9811213626685595e-06, 'epoch': 0.9}


 90%|█████████ | 254500/281800 [6:10:13<30:35, 14.87it/s]  

{'loss': 0.4813, 'grad_norm': 1.1502676010131836, 'learning_rate': 1.9456352022711146e-06, 'epoch': 0.9}


 90%|█████████ | 255000/281800 [6:10:48<29:52, 14.95it/s]  

{'loss': 0.4842, 'grad_norm': 1.0004339218139648, 'learning_rate': 1.9102200141944643e-06, 'epoch': 0.9}


 91%|█████████ | 255500/281800 [6:11:23<29:41, 14.77it/s]  

{'loss': 0.4886, 'grad_norm': 1.0917730331420898, 'learning_rate': 1.8747338537970194e-06, 'epoch': 0.91}


 91%|█████████ | 256000/281800 [6:11:58<29:13, 14.72it/s]  

{'loss': 0.4848, 'grad_norm': 1.0438523292541504, 'learning_rate': 1.8392476933995742e-06, 'epoch': 0.91}


 91%|█████████ | 256500/281800 [6:12:33<28:49, 14.63it/s]  

{'loss': 0.4929, 'grad_norm': 0.9424691796302795, 'learning_rate': 1.8037615330021293e-06, 'epoch': 0.91}


 91%|█████████ | 257000/281800 [6:13:08<28:09, 14.68it/s]  

{'loss': 0.4971, 'grad_norm': 0.9186416268348694, 'learning_rate': 1.7683463449254793e-06, 'epoch': 0.91}


 91%|█████████▏| 257500/281800 [6:13:42<27:02, 14.97it/s]  

{'loss': 0.4815, 'grad_norm': 1.1927155256271362, 'learning_rate': 1.7328601845280344e-06, 'epoch': 0.91}


 92%|█████████▏| 258000/281800 [6:14:17<26:28, 14.98it/s]  

{'loss': 0.4859, 'grad_norm': 0.916533350944519, 'learning_rate': 1.6973740241305892e-06, 'epoch': 0.92}


 92%|█████████▏| 258500/281800 [6:14:52<26:24, 14.70it/s]  

{'loss': 0.4923, 'grad_norm': 1.0675932168960571, 'learning_rate': 1.6618878637331443e-06, 'epoch': 0.92}


 92%|█████████▏| 259000/281800 [6:15:26<25:22, 14.98it/s]  

{'loss': 0.4771, 'grad_norm': 0.9799609780311584, 'learning_rate': 1.626401703335699e-06, 'epoch': 0.92}


 92%|█████████▏| 259500/281800 [6:16:01<24:46, 15.00it/s]  

{'loss': 0.489, 'grad_norm': 1.2520945072174072, 'learning_rate': 1.5909865152590491e-06, 'epoch': 0.92}


 92%|█████████▏| 260000/281800 [6:16:35<24:44, 14.69it/s]  

{'loss': 0.4909, 'grad_norm': 1.2329450845718384, 'learning_rate': 1.555500354861604e-06, 'epoch': 0.92}


 92%|█████████▏| 260500/281800 [6:17:10<23:51, 14.88it/s]  

{'loss': 0.4791, 'grad_norm': 0.9622825384140015, 'learning_rate': 1.5200141944641591e-06, 'epoch': 0.92}


 93%|█████████▎| 261000/281800 [6:17:45<23:27, 14.78it/s]  

{'loss': 0.4877, 'grad_norm': 0.8620848655700684, 'learning_rate': 1.484528034066714e-06, 'epoch': 0.93}


 93%|█████████▎| 261500/281800 [6:18:20<23:06, 14.64it/s]  

{'loss': 0.472, 'grad_norm': 1.0679471492767334, 'learning_rate': 1.4491128459900641e-06, 'epoch': 0.93}


 93%|█████████▎| 262000/281800 [6:18:55<22:32, 14.64it/s]  

{'loss': 0.4762, 'grad_norm': 0.9509482979774475, 'learning_rate': 1.413626685592619e-06, 'epoch': 0.93}


 93%|█████████▎| 262500/281800 [6:19:29<21:35, 14.89it/s]  

{'loss': 0.4878, 'grad_norm': 1.001638650894165, 'learning_rate': 1.378140525195174e-06, 'epoch': 0.93}


 93%|█████████▎| 263000/281800 [6:20:04<21:06, 14.85it/s]  

{'loss': 0.4792, 'grad_norm': 1.2719241380691528, 'learning_rate': 1.3426543647977288e-06, 'epoch': 0.93}


 94%|█████████▎| 263500/281800 [6:20:39<21:12, 14.38it/s]

{'loss': 0.4762, 'grad_norm': 1.1156750917434692, 'learning_rate': 1.3072391767210787e-06, 'epoch': 0.94}


 94%|█████████▎| 264000/281800 [6:21:14<19:51, 14.94it/s]  

{'loss': 0.4813, 'grad_norm': 1.0452343225479126, 'learning_rate': 1.2717530163236338e-06, 'epoch': 0.94}


 94%|█████████▍| 264500/281800 [6:21:49<19:19, 14.92it/s]

{'loss': 0.4958, 'grad_norm': 1.0330921411514282, 'learning_rate': 1.2362668559261889e-06, 'epoch': 0.94}


 94%|█████████▍| 265000/281800 [6:22:23<19:15, 14.54it/s]

{'loss': 0.4773, 'grad_norm': 1.065355896949768, 'learning_rate': 1.200780695528744e-06, 'epoch': 0.94}


 94%|█████████▍| 265500/281800 [6:22:58<18:29, 14.68it/s]

{'loss': 0.471, 'grad_norm': 0.8747581839561462, 'learning_rate': 1.1653655074520937e-06, 'epoch': 0.94}


 94%|█████████▍| 266000/281800 [6:23:33<17:52, 14.73it/s]

{'loss': 0.4956, 'grad_norm': 1.0617005825042725, 'learning_rate': 1.1298793470546488e-06, 'epoch': 0.94}


 95%|█████████▍| 266500/281800 [6:24:07<17:10, 14.85it/s]

{'loss': 0.4917, 'grad_norm': 0.8997498750686646, 'learning_rate': 1.0943931866572039e-06, 'epoch': 0.95}


 95%|█████████▍| 267000/281800 [6:24:42<16:44, 14.73it/s]

{'loss': 0.4803, 'grad_norm': 0.9233473539352417, 'learning_rate': 1.0589070262597588e-06, 'epoch': 0.95}


 95%|█████████▍| 267500/281800 [6:25:17<15:54, 14.98it/s]

{'loss': 0.4812, 'grad_norm': 1.1314246654510498, 'learning_rate': 1.0234918381831087e-06, 'epoch': 0.95}


 95%|█████████▌| 268000/281800 [6:25:51<15:20, 15.00it/s]

{'loss': 0.4884, 'grad_norm': 1.088498592376709, 'learning_rate': 9.880056777856636e-07, 'epoch': 0.95}


 95%|█████████▌| 268500/281800 [6:26:32<21:04, 10.52it/s]

{'loss': 0.4784, 'grad_norm': 1.1444565057754517, 'learning_rate': 9.525195173882188e-07, 'epoch': 0.95}


 95%|█████████▌| 269000/281800 [6:27:22<19:59, 10.67it/s]  

{'loss': 0.4782, 'grad_norm': 1.0164300203323364, 'learning_rate': 9.170333569907737e-07, 'epoch': 0.95}


 96%|█████████▌| 269500/281800 [6:28:11<20:02, 10.22it/s]  

{'loss': 0.4826, 'grad_norm': 1.0610593557357788, 'learning_rate': 8.816181689141236e-07, 'epoch': 0.96}


 96%|█████████▌| 270000/281800 [6:29:00<19:12, 10.24it/s]  

{'loss': 0.477, 'grad_norm': 1.1988452672958374, 'learning_rate': 8.461320085166786e-07, 'epoch': 0.96}


 96%|█████████▌| 270500/281800 [6:29:49<17:54, 10.51it/s]  

{'loss': 0.4877, 'grad_norm': 0.7253214716911316, 'learning_rate': 8.106458481192335e-07, 'epoch': 0.96}


 96%|█████████▌| 271000/281800 [6:30:39<17:26, 10.32it/s]

{'loss': 0.494, 'grad_norm': 1.043880820274353, 'learning_rate': 7.751596877217886e-07, 'epoch': 0.96}


 96%|█████████▋| 271500/281800 [6:31:28<15:49, 10.85it/s]

{'loss': 0.4858, 'grad_norm': 1.187256097793579, 'learning_rate': 7.396735273243435e-07, 'epoch': 0.96}


 97%|█████████▋| 272000/281800 [6:32:18<15:18, 10.67it/s]

{'loss': 0.4936, 'grad_norm': 1.1493401527404785, 'learning_rate': 7.042583392476934e-07, 'epoch': 0.97}


 97%|█████████▋| 272500/281800 [6:33:07<15:10, 10.21it/s]

{'loss': 0.4768, 'grad_norm': 1.0300171375274658, 'learning_rate': 6.687721788502484e-07, 'epoch': 0.97}


 97%|█████████▋| 273000/281800 [6:33:56<13:29, 10.87it/s]

{'loss': 0.4922, 'grad_norm': 0.9953418374061584, 'learning_rate': 6.332860184528035e-07, 'epoch': 0.97}


 97%|█████████▋| 273500/281800 [6:34:45<13:32, 10.22it/s]

{'loss': 0.4838, 'grad_norm': 1.073884129524231, 'learning_rate': 5.977998580553584e-07, 'epoch': 0.97}


 97%|█████████▋| 274000/281800 [6:35:34<12:32, 10.36it/s]

{'loss': 0.4777, 'grad_norm': 0.954839825630188, 'learning_rate': 5.623846699787083e-07, 'epoch': 0.97}


 97%|█████████▋| 274500/281800 [6:36:23<11:18, 10.75it/s]

{'loss': 0.4831, 'grad_norm': 1.0922138690948486, 'learning_rate': 5.268985095812633e-07, 'epoch': 0.97}


 98%|█████████▊| 275000/281800 [6:37:12<10:36, 10.68it/s]

{'loss': 0.4897, 'grad_norm': 1.1085866689682007, 'learning_rate': 4.914123491838183e-07, 'epoch': 0.98}


 98%|█████████▊| 275500/281800 [6:38:01<10:01, 10.47it/s]

{'loss': 0.4965, 'grad_norm': 0.9846919178962708, 'learning_rate': 4.5592618878637334e-07, 'epoch': 0.98}


 98%|█████████▊| 276000/281800 [6:38:50<09:35, 10.07it/s]

{'loss': 0.4879, 'grad_norm': 0.8957970142364502, 'learning_rate': 4.204400283889284e-07, 'epoch': 0.98}


 98%|█████████▊| 276500/281800 [6:39:39<08:20, 10.59it/s]

{'loss': 0.5017, 'grad_norm': 1.123740553855896, 'learning_rate': 3.8502484031227825e-07, 'epoch': 0.98}


 98%|█████████▊| 277000/281800 [6:40:28<07:36, 10.52it/s]

{'loss': 0.4881, 'grad_norm': 1.1934735774993896, 'learning_rate': 3.4953867991483324e-07, 'epoch': 0.98}


 98%|█████████▊| 277500/281800 [6:41:17<06:50, 10.47it/s]

{'loss': 0.4834, 'grad_norm': 1.0273939371109009, 'learning_rate': 3.1405251951738823e-07, 'epoch': 0.98}


 99%|█████████▊| 278000/281800 [6:42:06<05:58, 10.60it/s]

{'loss': 0.4843, 'grad_norm': 0.7226535081863403, 'learning_rate': 2.785663591199432e-07, 'epoch': 0.99}


 99%|█████████▉| 278500/281800 [6:42:56<05:21, 10.27it/s]

{'loss': 0.479, 'grad_norm': 0.7980153560638428, 'learning_rate': 2.4315117104329314e-07, 'epoch': 0.99}


 99%|█████████▉| 279000/281800 [6:43:45<04:34, 10.21it/s]

{'loss': 0.4876, 'grad_norm': 0.9969973564147949, 'learning_rate': 2.0766501064584815e-07, 'epoch': 0.99}


 99%|█████████▉| 279500/281800 [6:44:34<03:42, 10.32it/s]

{'loss': 0.4854, 'grad_norm': 1.062855839729309, 'learning_rate': 1.7217885024840314e-07, 'epoch': 0.99}


 99%|█████████▉| 280000/281800 [6:45:23<02:50, 10.57it/s]

{'loss': 0.4876, 'grad_norm': 1.1928625106811523, 'learning_rate': 1.3669268985095813e-07, 'epoch': 0.99}


100%|█████████▉| 280500/281800 [6:46:12<02:04, 10.46it/s]

{'loss': 0.4778, 'grad_norm': 1.1356093883514404, 'learning_rate': 1.0127750177430803e-07, 'epoch': 1.0}


100%|█████████▉| 281000/281800 [6:47:01<01:16, 10.45it/s]

{'loss': 0.4973, 'grad_norm': 1.1282038688659668, 'learning_rate': 6.579134137686303e-08, 'epoch': 1.0}


100%|█████████▉| 281500/281800 [6:47:50<00:28, 10.61it/s]

{'loss': 0.4752, 'grad_norm': 1.1099728345870972, 'learning_rate': 3.0305180979418026e-08, 'epoch': 1.0}


                                                         
100%|██████████| 281800/281800 [6:48:29<00:00, 11.50it/s]

{'eval_loss': 0.32258763909339905, 'eval_runtime': 7.401, 'eval_samples_per_second': 405.351, 'eval_steps_per_second': 25.402, 'epoch': 1.0}
{'train_runtime': 24509.8665, 'train_samples_per_second': 183.958, 'train_steps_per_second': 11.497, 'train_loss': 0.5145373834592387, 'epoch': 1.0}





TrainOutput(global_step=281800, training_loss=0.5145373834592387, metrics={'train_runtime': 24509.8665, 'train_samples_per_second': 183.958, 'train_steps_per_second': 11.497, 'total_flos': 1.5284043931189248e+17, 'train_loss': 0.5145373834592387, 'epoch': 1.0})

## Save the Fine-Tuned Model

In [10]:
model.save_pretrained("./opus-mt-de-en-finetuned")
tokenizer.save_pretrained("./opus-mt-de-en-finetuned")

('./opus-mt-de-en-finetuned\\tokenizer_config.json',
 './opus-mt-de-en-finetuned\\special_tokens_map.json',
 './opus-mt-de-en-finetuned\\vocab.json',
 './opus-mt-de-en-finetuned\\source.spm',
 './opus-mt-de-en-finetuned\\target.spm',
 './opus-mt-de-en-finetuned\\added_tokens.json')