In [3]:
import os
import pandas as pd
import json

# Define the path to the folder containing JSON files
folder_path = '/Users/kritigupta/mlp project/Article-Bias-Prediction/data/jsons'

# List all JSON files in the folder
json_files = [pos_json for pos_json in os.listdir(folder_path) if pos_json.endswith('.json')]

# Initialize an empty list to store data
data_list = []

# Loop through each JSON file and read its content
for json_file in json_files:
    file_path = os.path.join(folder_path, json_file)
    with open(file_path, 'r') as file:
        data = json.load(file)
        data_list.append(data)

# Convert the list of JSON data to a DataFrame
df = pd.DataFrame(data_list)

# # Display the DataFrame
# print(df)

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
train_dataset = df.iloc[:round(0.8*len(df))]
valid_dataset = df.iloc[round(0.8*len(df)):round(0.9*len(df))]
test_dataset = df.iloc[round(0.9*len(df)):]

In [6]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [7]:
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)


In [6]:
train_encodings = tokenize_function(train_dataset['content'].to_list())
valid_encodings = tokenize_function(valid_dataset['content'].to_list())
test_encodings = tokenize_function(test_dataset['content'].to_list())

In [13]:
train_dataset = HFDataset.from_dict({"text": train_dataset['content'].to_list(), "label": train_dataset['bias'].to_list()})
val_dataset = HFDataset.from_dict({"text": valid_dataset['content'].to_list(), "label": valid_dataset['bias'].to_list()})
test_dataset = HFDataset.from_dict({"text": test_dataset['content'].to_list(), "label": test_dataset['bias'].to_list()})

In [14]:
train_dataset = train_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=512), batched=True)
val_dataset = val_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=512), batched=True)
test_dataset = test_dataset.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=512), batched=True)

Map: 100%|██████████| 30043/30043 [01:18<00:00, 383.43 examples/s]
Map: 100%|██████████| 3756/3756 [00:09<00:00, 386.98 examples/s]
Map: 100%|██████████| 3755/3755 [00:09<00:00, 378.32 examples/s]


In [15]:
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])

In [16]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    max_grad_norm=1.0,
    fp16=False
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [19]:
trainer.train()

  0%|          | 10/11268 [00:18<5:21:46,  1.71s/it]

{'loss': 1.101, 'grad_norm': 2.180868148803711, 'learning_rate': 4.9955626553070645e-06, 'epoch': 0.0}


  0%|          | 20/11268 [00:35<5:15:27,  1.68s/it]

{'loss': 1.1047, 'grad_norm': 20.548362731933594, 'learning_rate': 4.991125310614129e-06, 'epoch': 0.01}


  0%|          | 30/11268 [00:52<5:22:15,  1.72s/it]

{'loss': 1.1001, 'grad_norm': 2.2688851356506348, 'learning_rate': 4.986687965921193e-06, 'epoch': 0.01}


  0%|          | 40/11268 [01:09<5:31:37,  1.77s/it]

{'loss': 1.0965, 'grad_norm': 1.3329694271087646, 'learning_rate': 4.982250621228258e-06, 'epoch': 0.01}


  0%|          | 50/11268 [01:27<5:36:39,  1.80s/it]

{'loss': 1.0884, 'grad_norm': 3.3356387615203857, 'learning_rate': 4.977813276535322e-06, 'epoch': 0.01}


  1%|          | 60/11268 [01:47<6:37:52,  2.13s/it]

{'loss': 1.0968, 'grad_norm': 3.10524582862854, 'learning_rate': 4.973375931842386e-06, 'epoch': 0.02}


  1%|          | 70/11268 [02:14<8:40:05,  2.79s/it]

{'loss': 1.1149, 'grad_norm': 3.935608386993408, 'learning_rate': 4.96893858714945e-06, 'epoch': 0.02}


  1%|          | 80/11268 [02:46<9:51:38,  3.17s/it] 

{'loss': 1.0865, 'grad_norm': 4.374054908752441, 'learning_rate': 4.964501242456514e-06, 'epoch': 0.02}


  1%|          | 90/11268 [03:12<8:01:20,  2.58s/it]

{'loss': 1.093, 'grad_norm': 2.463519811630249, 'learning_rate': 4.960063897763578e-06, 'epoch': 0.02}


  1%|          | 100/11268 [03:38<7:49:26,  2.52s/it]

{'loss': 1.076, 'grad_norm': 3.622931718826294, 'learning_rate': 4.955626553070643e-06, 'epoch': 0.03}


  1%|          | 110/11268 [04:02<7:43:20,  2.49s/it]

{'loss': 1.0658, 'grad_norm': 4.1382036209106445, 'learning_rate': 4.951189208377707e-06, 'epoch': 0.03}


  1%|          | 120/11268 [04:28<7:54:38,  2.55s/it]

{'loss': 1.0998, 'grad_norm': 5.006036758422852, 'learning_rate': 4.946751863684771e-06, 'epoch': 0.03}


  1%|          | 130/11268 [04:53<7:47:32,  2.52s/it]

{'loss': 1.0782, 'grad_norm': 2.9711780548095703, 'learning_rate': 4.942314518991836e-06, 'epoch': 0.03}


  1%|          | 140/11268 [05:19<8:13:49,  2.66s/it]

{'loss': 1.1043, 'grad_norm': 3.5013747215270996, 'learning_rate': 4.9378771742989e-06, 'epoch': 0.04}


  1%|▏         | 150/11268 [05:48<9:32:57,  3.09s/it]

{'loss': 1.0854, 'grad_norm': 3.3159549236297607, 'learning_rate': 4.9334398296059644e-06, 'epoch': 0.04}


  1%|▏         | 160/11268 [06:18<9:19:44,  3.02s/it]

{'loss': 1.0969, 'grad_norm': 3.0417370796203613, 'learning_rate': 4.9290024849130285e-06, 'epoch': 0.04}


  2%|▏         | 170/11268 [06:47<8:53:37,  2.88s/it]

{'loss': 1.064, 'grad_norm': 4.1927995681762695, 'learning_rate': 4.924565140220093e-06, 'epoch': 0.05}


  2%|▏         | 180/11268 [07:15<8:34:00,  2.78s/it]

{'loss': 1.0867, 'grad_norm': 6.60683536529541, 'learning_rate': 4.920127795527157e-06, 'epoch': 0.05}


  2%|▏         | 190/11268 [07:43<8:21:26,  2.72s/it]

{'loss': 1.0379, 'grad_norm': 5.03337287902832, 'learning_rate': 4.915690450834222e-06, 'epoch': 0.05}


  2%|▏         | 200/11268 [08:10<8:17:14,  2.70s/it]

{'loss': 1.0063, 'grad_norm': 20.25901985168457, 'learning_rate': 4.911253106141286e-06, 'epoch': 0.05}


  2%|▏         | 210/11268 [08:38<8:37:34,  2.81s/it]

{'loss': 1.0068, 'grad_norm': 16.945417404174805, 'learning_rate': 4.90681576144835e-06, 'epoch': 0.06}


  2%|▏         | 220/11268 [09:06<8:21:41,  2.72s/it]

{'loss': 1.0226, 'grad_norm': 16.699810028076172, 'learning_rate': 4.902378416755414e-06, 'epoch': 0.06}


  2%|▏         | 230/11268 [09:32<8:05:43,  2.64s/it]

{'loss': 1.0968, 'grad_norm': 17.65724754333496, 'learning_rate': 4.897941072062478e-06, 'epoch': 0.06}


  2%|▏         | 240/11268 [09:59<8:09:22,  2.66s/it]

{'loss': 1.0512, 'grad_norm': 16.281299591064453, 'learning_rate': 4.893503727369542e-06, 'epoch': 0.06}


  2%|▏         | 250/11268 [10:26<8:16:36,  2.70s/it]

{'loss': 1.0198, 'grad_norm': 9.094976425170898, 'learning_rate': 4.889066382676606e-06, 'epoch': 0.07}


  2%|▏         | 260/11268 [10:54<8:16:45,  2.71s/it]

{'loss': 0.9963, 'grad_norm': 14.096275329589844, 'learning_rate': 4.884629037983671e-06, 'epoch': 0.07}


  2%|▏         | 270/11268 [11:21<8:12:45,  2.69s/it]

{'loss': 0.9744, 'grad_norm': 9.90117073059082, 'learning_rate': 4.880191693290735e-06, 'epoch': 0.07}


  2%|▏         | 280/11268 [11:48<8:21:00,  2.74s/it]

{'loss': 0.9397, 'grad_norm': 32.20856475830078, 'learning_rate': 4.875754348597799e-06, 'epoch': 0.07}


  3%|▎         | 290/11268 [12:15<8:16:28,  2.71s/it]

{'loss': 0.9559, 'grad_norm': 24.709117889404297, 'learning_rate': 4.8713170039048635e-06, 'epoch': 0.08}


  3%|▎         | 300/11268 [12:43<8:24:49,  2.76s/it]

{'loss': 1.0161, 'grad_norm': 34.5052490234375, 'learning_rate': 4.866879659211928e-06, 'epoch': 0.08}


  3%|▎         | 310/11268 [13:10<8:08:51,  2.68s/it]

{'loss': 0.9021, 'grad_norm': 24.2115535736084, 'learning_rate': 4.862442314518992e-06, 'epoch': 0.08}


  3%|▎         | 320/11268 [13:37<8:11:13,  2.69s/it]

{'loss': 0.8696, 'grad_norm': 21.005727767944336, 'learning_rate': 4.858004969826057e-06, 'epoch': 0.09}


  3%|▎         | 330/11268 [14:04<8:12:52,  2.70s/it]

{'loss': 0.9507, 'grad_norm': 15.019360542297363, 'learning_rate': 4.853567625133121e-06, 'epoch': 0.09}


  3%|▎         | 340/11268 [14:31<8:16:16,  2.72s/it]

{'loss': 0.9447, 'grad_norm': 16.113773345947266, 'learning_rate': 4.849130280440185e-06, 'epoch': 0.09}


  3%|▎         | 350/11268 [14:59<8:25:44,  2.78s/it]

{'loss': 0.9346, 'grad_norm': 14.285008430480957, 'learning_rate': 4.84469293574725e-06, 'epoch': 0.09}


  3%|▎         | 360/11268 [15:27<8:25:54,  2.78s/it]

{'loss': 0.8827, 'grad_norm': 23.993881225585938, 'learning_rate': 4.840255591054314e-06, 'epoch': 0.1}


  3%|▎         | 370/11268 [15:54<8:21:11,  2.76s/it]

{'loss': 0.8679, 'grad_norm': 17.099952697753906, 'learning_rate': 4.835818246361378e-06, 'epoch': 0.1}


  3%|▎         | 380/11268 [16:22<8:15:44,  2.73s/it]

{'loss': 0.9071, 'grad_norm': 15.823094367980957, 'learning_rate': 4.831380901668442e-06, 'epoch': 0.1}


  3%|▎         | 390/11268 [16:50<8:16:42,  2.74s/it]

{'loss': 0.8569, 'grad_norm': 23.060998916625977, 'learning_rate': 4.826943556975506e-06, 'epoch': 0.1}


  4%|▎         | 400/11268 [17:17<8:16:44,  2.74s/it]

{'loss': 0.8119, 'grad_norm': 47.94389724731445, 'learning_rate': 4.82250621228257e-06, 'epoch': 0.11}


  4%|▎         | 410/11268 [17:44<8:13:40,  2.73s/it]

{'loss': 0.7319, 'grad_norm': 26.12384796142578, 'learning_rate': 4.818068867589634e-06, 'epoch': 0.11}


  4%|▎         | 420/11268 [18:11<7:50:43,  2.60s/it]

{'loss': 0.6861, 'grad_norm': 13.601881980895996, 'learning_rate': 4.813631522896699e-06, 'epoch': 0.11}


  4%|▍         | 430/11268 [18:36<7:22:02,  2.45s/it]

{'loss': 0.8515, 'grad_norm': 24.602535247802734, 'learning_rate': 4.809194178203763e-06, 'epoch': 0.11}


  4%|▍         | 440/11268 [19:00<7:15:48,  2.41s/it]

{'loss': 0.9217, 'grad_norm': 27.71798324584961, 'learning_rate': 4.8047568335108275e-06, 'epoch': 0.12}


  4%|▍         | 450/11268 [19:24<7:33:37,  2.52s/it]

{'loss': 0.708, 'grad_norm': 14.012025833129883, 'learning_rate': 4.800319488817892e-06, 'epoch': 0.12}


  4%|▍         | 460/11268 [19:50<7:35:42,  2.53s/it]

{'loss': 0.861, 'grad_norm': 13.043439865112305, 'learning_rate': 4.795882144124956e-06, 'epoch': 0.12}


  4%|▍         | 470/11268 [20:16<7:43:22,  2.57s/it]

{'loss': 0.6439, 'grad_norm': 21.09777069091797, 'learning_rate': 4.79144479943202e-06, 'epoch': 0.13}


  4%|▍         | 480/11268 [20:42<7:59:19,  2.67s/it]

{'loss': 0.8182, 'grad_norm': 33.561073303222656, 'learning_rate': 4.787007454739085e-06, 'epoch': 0.13}


  4%|▍         | 490/11268 [21:08<7:54:07,  2.64s/it]

{'loss': 0.6582, 'grad_norm': 47.77071762084961, 'learning_rate': 4.782570110046149e-06, 'epoch': 0.13}


  4%|▍         | 500/11268 [21:36<8:01:49,  2.68s/it]

{'loss': 0.626, 'grad_norm': 26.4221248626709, 'learning_rate': 4.778132765353213e-06, 'epoch': 0.13}


  5%|▍         | 510/11268 [22:02<8:01:27,  2.69s/it]

{'loss': 0.7418, 'grad_norm': 25.370988845825195, 'learning_rate': 4.773695420660277e-06, 'epoch': 0.14}


  5%|▍         | 520/11268 [22:30<8:11:49,  2.75s/it]

{'loss': 0.8102, 'grad_norm': 21.20345115661621, 'learning_rate': 4.769258075967342e-06, 'epoch': 0.14}


  5%|▍         | 530/11268 [22:57<8:05:03,  2.71s/it]

{'loss': 0.674, 'grad_norm': 31.8303279876709, 'learning_rate': 4.764820731274406e-06, 'epoch': 0.14}


  5%|▍         | 540/11268 [23:25<8:08:23,  2.73s/it]

{'loss': 0.8384, 'grad_norm': 44.16394805908203, 'learning_rate': 4.76038338658147e-06, 'epoch': 0.14}


  5%|▍         | 550/11268 [23:52<8:01:19,  2.69s/it]

{'loss': 0.7557, 'grad_norm': 23.01811981201172, 'learning_rate': 4.755946041888534e-06, 'epoch': 0.15}


  5%|▍         | 560/11268 [24:19<8:07:38,  2.73s/it]

{'loss': 0.7821, 'grad_norm': 57.59928512573242, 'learning_rate': 4.751508697195598e-06, 'epoch': 0.15}


  5%|▌         | 570/11268 [24:47<8:19:15,  2.80s/it]

{'loss': 0.8087, 'grad_norm': 26.099145889282227, 'learning_rate': 4.747071352502663e-06, 'epoch': 0.15}


  5%|▌         | 580/11268 [25:15<8:25:27,  2.84s/it]

{'loss': 0.7424, 'grad_norm': 17.113813400268555, 'learning_rate': 4.7426340078097274e-06, 'epoch': 0.15}


  5%|▌         | 590/11268 [25:44<8:39:25,  2.92s/it]

{'loss': 0.7406, 'grad_norm': 17.72032356262207, 'learning_rate': 4.7381966631167915e-06, 'epoch': 0.16}


  5%|▌         | 600/11268 [26:13<8:37:11,  2.91s/it]

{'loss': 0.6136, 'grad_norm': 20.37584114074707, 'learning_rate': 4.733759318423856e-06, 'epoch': 0.16}


  5%|▌         | 610/11268 [26:42<8:19:02,  2.81s/it]

{'loss': 0.7863, 'grad_norm': 14.854445457458496, 'learning_rate': 4.72932197373092e-06, 'epoch': 0.16}


  6%|▌         | 620/11268 [27:10<8:24:51,  2.84s/it]

{'loss': 0.7957, 'grad_norm': 32.50448989868164, 'learning_rate': 4.724884629037984e-06, 'epoch': 0.17}


  6%|▌         | 630/11268 [27:40<8:53:59,  3.01s/it]

{'loss': 0.6076, 'grad_norm': 29.92447853088379, 'learning_rate': 4.720447284345048e-06, 'epoch': 0.17}


  6%|▌         | 640/11268 [28:10<8:48:05,  2.98s/it]

{'loss': 0.7537, 'grad_norm': 12.294814109802246, 'learning_rate': 4.716009939652113e-06, 'epoch': 0.17}


  6%|▌         | 650/11268 [28:40<8:44:59,  2.97s/it]

{'loss': 0.6871, 'grad_norm': 28.16832160949707, 'learning_rate': 4.711572594959177e-06, 'epoch': 0.17}


  6%|▌         | 660/11268 [29:08<7:45:39,  2.63s/it]

{'loss': 0.5651, 'grad_norm': 90.14241027832031, 'learning_rate': 4.707135250266241e-06, 'epoch': 0.18}


  6%|▌         | 670/11268 [29:32<6:52:51,  2.34s/it]

{'loss': 0.8134, 'grad_norm': 48.91092300415039, 'learning_rate': 4.702697905573305e-06, 'epoch': 0.18}


  6%|▌         | 680/11268 [29:54<6:25:13,  2.18s/it]

{'loss': 0.8326, 'grad_norm': 27.665481567382812, 'learning_rate': 4.698260560880369e-06, 'epoch': 0.18}


  6%|▌         | 690/11268 [30:15<6:15:58,  2.13s/it]

{'loss': 0.7091, 'grad_norm': 33.83591842651367, 'learning_rate': 4.693823216187433e-06, 'epoch': 0.18}


  6%|▌         | 700/11268 [30:37<6:23:54,  2.18s/it]

{'loss': 0.7269, 'grad_norm': 25.907155990600586, 'learning_rate': 4.6893858714944975e-06, 'epoch': 0.19}


  6%|▋         | 710/11268 [30:59<6:22:41,  2.17s/it]

{'loss': 0.6516, 'grad_norm': 19.19723129272461, 'learning_rate': 4.684948526801562e-06, 'epoch': 0.19}


  6%|▋         | 720/11268 [31:21<6:26:47,  2.20s/it]

{'loss': 0.6718, 'grad_norm': 20.81375503540039, 'learning_rate': 4.6805111821086265e-06, 'epoch': 0.19}


  6%|▋         | 730/11268 [31:43<6:24:43,  2.19s/it]

{'loss': 0.7673, 'grad_norm': 49.074684143066406, 'learning_rate': 4.6760738374156914e-06, 'epoch': 0.19}


  7%|▋         | 740/11268 [32:05<6:26:09,  2.20s/it]

{'loss': 0.6633, 'grad_norm': 31.557104110717773, 'learning_rate': 4.6716364927227555e-06, 'epoch': 0.2}


  7%|▋         | 750/11268 [32:27<6:27:10,  2.21s/it]

{'loss': 0.6651, 'grad_norm': 33.61763000488281, 'learning_rate': 4.66719914802982e-06, 'epoch': 0.2}


  7%|▋         | 760/11268 [32:49<6:25:50,  2.20s/it]

{'loss': 0.7033, 'grad_norm': 29.612642288208008, 'learning_rate': 4.662761803336884e-06, 'epoch': 0.2}


  7%|▋         | 770/11268 [33:11<6:23:04,  2.19s/it]

{'loss': 0.6713, 'grad_norm': 35.30272674560547, 'learning_rate': 4.658324458643948e-06, 'epoch': 0.21}


  7%|▋         | 780/11268 [33:33<6:31:03,  2.24s/it]

{'loss': 0.5338, 'grad_norm': 56.90960693359375, 'learning_rate': 4.653887113951012e-06, 'epoch': 0.21}


  7%|▋         | 790/11268 [33:56<6:29:35,  2.23s/it]

{'loss': 0.6004, 'grad_norm': 42.410377502441406, 'learning_rate': 4.649449769258076e-06, 'epoch': 0.21}


  7%|▋         | 800/11268 [34:18<6:34:35,  2.26s/it]

{'loss': 0.7435, 'grad_norm': 25.92689323425293, 'learning_rate': 4.645012424565141e-06, 'epoch': 0.21}


  7%|▋         | 810/11268 [34:41<6:41:40,  2.30s/it]

{'loss': 0.6701, 'grad_norm': 21.87204933166504, 'learning_rate': 4.640575079872205e-06, 'epoch': 0.22}


  7%|▋         | 820/11268 [35:04<6:29:36,  2.24s/it]

{'loss': 0.5621, 'grad_norm': 29.98390007019043, 'learning_rate': 4.636137735179269e-06, 'epoch': 0.22}


  7%|▋         | 830/11268 [35:26<6:33:57,  2.26s/it]

{'loss': 0.7574, 'grad_norm': 45.0776481628418, 'learning_rate': 4.631700390486333e-06, 'epoch': 0.22}


  7%|▋         | 840/11268 [35:49<6:41:13,  2.31s/it]

{'loss': 0.5998, 'grad_norm': 22.044301986694336, 'learning_rate': 4.627263045793397e-06, 'epoch': 0.22}


  8%|▊         | 850/11268 [36:12<6:26:29,  2.23s/it]

{'loss': 0.6442, 'grad_norm': 19.954084396362305, 'learning_rate': 4.6228257011004615e-06, 'epoch': 0.23}


  8%|▊         | 860/11268 [36:34<6:27:47,  2.24s/it]

{'loss': 0.5442, 'grad_norm': 46.29410171508789, 'learning_rate': 4.618388356407526e-06, 'epoch': 0.23}


  8%|▊         | 870/11268 [36:57<6:32:09,  2.26s/it]

{'loss': 0.4968, 'grad_norm': 46.635860443115234, 'learning_rate': 4.6139510117145905e-06, 'epoch': 0.23}


  8%|▊         | 880/11268 [37:20<6:34:24,  2.28s/it]

{'loss': 0.6694, 'grad_norm': 61.50525665283203, 'learning_rate': 4.609513667021655e-06, 'epoch': 0.23}


  8%|▊         | 890/11268 [37:42<6:23:36,  2.22s/it]

{'loss': 0.4651, 'grad_norm': 22.6109619140625, 'learning_rate': 4.605076322328719e-06, 'epoch': 0.24}


  8%|▊         | 900/11268 [38:04<6:20:50,  2.20s/it]

{'loss': 0.4204, 'grad_norm': 35.642433166503906, 'learning_rate': 4.600638977635783e-06, 'epoch': 0.24}


  8%|▊         | 910/11268 [38:27<6:28:51,  2.25s/it]

{'loss': 0.5079, 'grad_norm': 40.22004699707031, 'learning_rate': 4.596201632942847e-06, 'epoch': 0.24}


  8%|▊         | 920/11268 [38:49<6:30:45,  2.27s/it]

{'loss': 0.6034, 'grad_norm': 54.28346633911133, 'learning_rate': 4.591764288249911e-06, 'epoch': 0.24}


  8%|▊         | 930/11268 [39:12<6:35:34,  2.30s/it]

{'loss': 0.6961, 'grad_norm': 14.682561874389648, 'learning_rate': 4.587326943556976e-06, 'epoch': 0.25}


  8%|▊         | 940/11268 [39:35<6:30:06,  2.27s/it]

{'loss': 0.5714, 'grad_norm': 46.71699142456055, 'learning_rate': 4.58288959886404e-06, 'epoch': 0.25}


  8%|▊         | 950/11268 [39:58<6:30:22,  2.27s/it]

{'loss': 0.5371, 'grad_norm': 19.711076736450195, 'learning_rate': 4.578452254171105e-06, 'epoch': 0.25}


  9%|▊         | 960/11268 [40:20<6:33:13,  2.29s/it]

{'loss': 0.6029, 'grad_norm': 44.26546096801758, 'learning_rate': 4.574014909478169e-06, 'epoch': 0.26}


  9%|▊         | 970/11268 [40:43<6:28:36,  2.26s/it]

{'loss': 0.6433, 'grad_norm': 30.732563018798828, 'learning_rate': 4.569577564785233e-06, 'epoch': 0.26}


  9%|▊         | 980/11268 [41:06<6:25:33,  2.25s/it]

{'loss': 0.5183, 'grad_norm': 36.73834991455078, 'learning_rate': 4.565140220092297e-06, 'epoch': 0.26}


  9%|▉         | 990/11268 [41:28<6:28:21,  2.27s/it]

{'loss': 0.7053, 'grad_norm': 33.021827697753906, 'learning_rate': 4.560702875399361e-06, 'epoch': 0.26}


  9%|▉         | 1000/11268 [41:51<6:29:24,  2.28s/it]

{'loss': 0.5576, 'grad_norm': 15.916735649108887, 'learning_rate': 4.5562655307064255e-06, 'epoch': 0.27}


  9%|▉         | 1010/11268 [42:14<6:26:42,  2.26s/it]

{'loss': 0.5954, 'grad_norm': 33.52116775512695, 'learning_rate': 4.55182818601349e-06, 'epoch': 0.27}


  9%|▉         | 1020/11268 [42:37<6:27:41,  2.27s/it]

{'loss': 0.6136, 'grad_norm': 71.4487533569336, 'learning_rate': 4.5473908413205545e-06, 'epoch': 0.27}


  9%|▉         | 1030/11268 [42:59<6:26:16,  2.26s/it]

{'loss': 0.6643, 'grad_norm': 51.97545623779297, 'learning_rate': 4.542953496627619e-06, 'epoch': 0.27}


  9%|▉         | 1040/11268 [43:22<6:28:24,  2.28s/it]

{'loss': 0.4944, 'grad_norm': 16.853572845458984, 'learning_rate': 4.538516151934683e-06, 'epoch': 0.28}


  9%|▉         | 1050/11268 [43:45<6:23:40,  2.25s/it]

{'loss': 0.671, 'grad_norm': 54.41709518432617, 'learning_rate': 4.534078807241747e-06, 'epoch': 0.28}


  9%|▉         | 1060/11268 [44:07<6:22:51,  2.25s/it]

{'loss': 0.3649, 'grad_norm': 27.009441375732422, 'learning_rate': 4.529641462548811e-06, 'epoch': 0.28}


  9%|▉         | 1070/11268 [44:30<6:23:19,  2.26s/it]

{'loss': 0.6091, 'grad_norm': 58.78076171875, 'learning_rate': 4.525204117855875e-06, 'epoch': 0.28}


 10%|▉         | 1080/11268 [44:53<6:25:02,  2.27s/it]

{'loss': 0.5505, 'grad_norm': 44.915767669677734, 'learning_rate': 4.52076677316294e-06, 'epoch': 0.29}


 10%|▉         | 1090/11268 [45:15<6:23:07,  2.26s/it]

{'loss': 0.5749, 'grad_norm': 40.530338287353516, 'learning_rate': 4.516329428470004e-06, 'epoch': 0.29}


 10%|▉         | 1100/11268 [45:38<6:22:17,  2.26s/it]

{'loss': 0.7549, 'grad_norm': 100.83206939697266, 'learning_rate': 4.511892083777068e-06, 'epoch': 0.29}


 10%|▉         | 1110/11268 [46:02<6:41:08,  2.37s/it]

{'loss': 0.4467, 'grad_norm': 20.176633834838867, 'learning_rate': 4.507454739084132e-06, 'epoch': 0.3}


 10%|▉         | 1120/11268 [46:25<6:27:55,  2.29s/it]

{'loss': 0.5694, 'grad_norm': 6.653196811676025, 'learning_rate': 4.503017394391196e-06, 'epoch': 0.3}


 10%|█         | 1130/11268 [46:48<6:29:06,  2.30s/it]

{'loss': 0.6137, 'grad_norm': 30.77353858947754, 'learning_rate': 4.4985800496982605e-06, 'epoch': 0.3}


 10%|█         | 1140/11268 [47:11<6:25:04,  2.28s/it]

{'loss': 0.7237, 'grad_norm': 47.246639251708984, 'learning_rate': 4.494142705005325e-06, 'epoch': 0.3}


 10%|█         | 1150/11268 [47:34<6:23:50,  2.28s/it]

{'loss': 0.5938, 'grad_norm': 19.615482330322266, 'learning_rate': 4.4897053603123895e-06, 'epoch': 0.31}


 10%|█         | 1160/11268 [47:57<6:30:06,  2.32s/it]

{'loss': 0.5076, 'grad_norm': 43.58363342285156, 'learning_rate': 4.485268015619454e-06, 'epoch': 0.31}


 10%|█         | 1170/11268 [48:20<6:29:07,  2.31s/it]

{'loss': 0.5314, 'grad_norm': 19.3497257232666, 'learning_rate': 4.4808306709265185e-06, 'epoch': 0.31}


 10%|█         | 1180/11268 [48:44<6:27:50,  2.31s/it]

{'loss': 0.6132, 'grad_norm': 34.415191650390625, 'learning_rate': 4.476393326233583e-06, 'epoch': 0.31}


 11%|█         | 1190/11268 [49:07<6:33:01,  2.34s/it]

{'loss': 0.5762, 'grad_norm': 26.902507781982422, 'learning_rate': 4.471955981540647e-06, 'epoch': 0.32}


 11%|█         | 1200/11268 [49:30<6:34:51,  2.35s/it]

{'loss': 0.5765, 'grad_norm': 12.590169906616211, 'learning_rate': 4.467518636847711e-06, 'epoch': 0.32}


 11%|█         | 1210/11268 [49:54<6:38:19,  2.38s/it]

{'loss': 0.56, 'grad_norm': 60.077022552490234, 'learning_rate': 4.463081292154775e-06, 'epoch': 0.32}


 11%|█         | 1220/11268 [50:17<6:28:01,  2.32s/it]

{'loss': 0.6197, 'grad_norm': 38.976280212402344, 'learning_rate': 4.458643947461839e-06, 'epoch': 0.32}


 11%|█         | 1230/11268 [50:41<6:27:27,  2.32s/it]

{'loss': 0.5782, 'grad_norm': 98.09929656982422, 'learning_rate': 4.454206602768903e-06, 'epoch': 0.33}


 11%|█         | 1240/11268 [51:04<6:25:12,  2.30s/it]

{'loss': 0.5094, 'grad_norm': 38.21907424926758, 'learning_rate': 4.449769258075968e-06, 'epoch': 0.33}


 11%|█         | 1250/11268 [51:27<6:15:27,  2.25s/it]

{'loss': 0.5482, 'grad_norm': 33.70249557495117, 'learning_rate': 4.445331913383032e-06, 'epoch': 0.33}


 11%|█         | 1260/11268 [51:49<6:22:14,  2.29s/it]

{'loss': 0.5706, 'grad_norm': 19.78879737854004, 'learning_rate': 4.440894568690096e-06, 'epoch': 0.34}


 11%|█▏        | 1270/11268 [52:12<6:14:12,  2.25s/it]

{'loss': 0.6004, 'grad_norm': 52.534610748291016, 'learning_rate': 4.43645722399716e-06, 'epoch': 0.34}


 11%|█▏        | 1280/11268 [52:35<6:39:36,  2.40s/it]

{'loss': 0.5816, 'grad_norm': 27.925262451171875, 'learning_rate': 4.4320198793042245e-06, 'epoch': 0.34}


 11%|█▏        | 1290/11268 [53:01<7:00:21,  2.53s/it]

{'loss': 0.6185, 'grad_norm': 69.1347885131836, 'learning_rate': 4.4275825346112886e-06, 'epoch': 0.34}


 12%|█▏        | 1300/11268 [53:27<7:16:13,  2.63s/it]

{'loss': 0.6491, 'grad_norm': 31.98657989501953, 'learning_rate': 4.423145189918353e-06, 'epoch': 0.35}


 12%|█▏        | 1310/11268 [53:52<6:50:06,  2.47s/it]

{'loss': 0.4388, 'grad_norm': 16.68319320678711, 'learning_rate': 4.418707845225418e-06, 'epoch': 0.35}


 12%|█▏        | 1320/11268 [54:17<6:59:29,  2.53s/it]

{'loss': 0.6739, 'grad_norm': 39.13215255737305, 'learning_rate': 4.414270500532482e-06, 'epoch': 0.35}


 12%|█▏        | 1330/11268 [54:43<7:00:18,  2.54s/it]

{'loss': 0.5749, 'grad_norm': 36.07078552246094, 'learning_rate': 4.409833155839546e-06, 'epoch': 0.35}


 12%|█▏        | 1340/11268 [55:09<7:09:38,  2.60s/it]

{'loss': 0.598, 'grad_norm': 55.29554748535156, 'learning_rate': 4.40539581114661e-06, 'epoch': 0.36}


 12%|█▏        | 1350/11268 [55:36<7:11:11,  2.61s/it]

{'loss': 0.6119, 'grad_norm': 38.93369674682617, 'learning_rate': 4.400958466453675e-06, 'epoch': 0.36}


 12%|█▏        | 1360/11268 [56:01<7:09:11,  2.60s/it]

{'loss': 0.462, 'grad_norm': 35.16259002685547, 'learning_rate': 4.396521121760739e-06, 'epoch': 0.36}


 12%|█▏        | 1370/11268 [56:28<7:19:30,  2.66s/it]

{'loss': 0.5884, 'grad_norm': 51.75934982299805, 'learning_rate': 4.392083777067803e-06, 'epoch': 0.36}


 12%|█▏        | 1380/11268 [56:54<7:14:46,  2.64s/it]

{'loss': 0.5186, 'grad_norm': 24.179126739501953, 'learning_rate': 4.387646432374867e-06, 'epoch': 0.37}


 12%|█▏        | 1390/11268 [57:20<7:12:27,  2.63s/it]

{'loss': 0.4809, 'grad_norm': 16.37703514099121, 'learning_rate': 4.383209087681931e-06, 'epoch': 0.37}


 12%|█▏        | 1400/11268 [57:47<7:13:39,  2.64s/it]

{'loss': 0.5701, 'grad_norm': 60.02155303955078, 'learning_rate': 4.378771742988996e-06, 'epoch': 0.37}


 13%|█▎        | 1410/11268 [58:14<7:21:46,  2.69s/it]

{'loss': 0.5422, 'grad_norm': 32.391414642333984, 'learning_rate': 4.37433439829606e-06, 'epoch': 0.38}


 13%|█▎        | 1420/11268 [58:42<7:22:05,  2.69s/it]

{'loss': 0.4741, 'grad_norm': 32.528743743896484, 'learning_rate': 4.369897053603124e-06, 'epoch': 0.38}


 13%|█▎        | 1430/11268 [59:08<7:17:04,  2.67s/it]

{'loss': 0.4124, 'grad_norm': 33.44760513305664, 'learning_rate': 4.3654597089101885e-06, 'epoch': 0.38}


 13%|█▎        | 1440/11268 [59:35<7:18:03,  2.67s/it]

{'loss': 0.409, 'grad_norm': 30.302766799926758, 'learning_rate': 4.361022364217253e-06, 'epoch': 0.38}


 13%|█▎        | 1450/11268 [1:00:02<7:30:20,  2.75s/it]

{'loss': 0.4647, 'grad_norm': 46.419212341308594, 'learning_rate': 4.356585019524317e-06, 'epoch': 0.39}


 13%|█▎        | 1460/11268 [1:00:29<7:32:07,  2.77s/it]

{'loss': 0.6913, 'grad_norm': 43.90689468383789, 'learning_rate': 4.352147674831382e-06, 'epoch': 0.39}


 13%|█▎        | 1470/11268 [1:00:57<7:23:57,  2.72s/it]

{'loss': 0.5771, 'grad_norm': 6.017773151397705, 'learning_rate': 4.347710330138446e-06, 'epoch': 0.39}


 13%|█▎        | 1480/11268 [1:01:25<7:37:32,  2.80s/it]

{'loss': 0.601, 'grad_norm': 14.032795906066895, 'learning_rate': 4.34327298544551e-06, 'epoch': 0.39}


 13%|█▎        | 1490/11268 [1:01:52<7:23:54,  2.72s/it]

{'loss': 0.4624, 'grad_norm': 20.390214920043945, 'learning_rate': 4.338835640752574e-06, 'epoch': 0.4}


 13%|█▎        | 1500/11268 [1:02:20<7:28:22,  2.75s/it]

{'loss': 0.5423, 'grad_norm': 58.51810836791992, 'learning_rate': 4.334398296059638e-06, 'epoch': 0.4}


 13%|█▎        | 1510/11268 [1:02:48<7:28:08,  2.76s/it]

{'loss': 0.5219, 'grad_norm': 56.609100341796875, 'learning_rate': 4.329960951366702e-06, 'epoch': 0.4}


 13%|█▎        | 1520/11268 [1:03:15<7:23:16,  2.73s/it]

{'loss': 0.5343, 'grad_norm': 65.66415405273438, 'learning_rate': 4.325523606673766e-06, 'epoch': 0.4}


 14%|█▎        | 1530/11268 [1:03:43<7:38:52,  2.83s/it]

{'loss': 0.4656, 'grad_norm': 32.781795501708984, 'learning_rate': 4.321086261980831e-06, 'epoch': 0.41}


 14%|█▎        | 1540/11268 [1:04:11<7:33:39,  2.80s/it]

{'loss': 0.5398, 'grad_norm': 34.07284927368164, 'learning_rate': 4.316648917287895e-06, 'epoch': 0.41}


 14%|█▍        | 1550/11268 [1:04:39<7:31:06,  2.79s/it]

{'loss': 0.5753, 'grad_norm': 34.39366912841797, 'learning_rate': 4.312211572594959e-06, 'epoch': 0.41}


 14%|█▍        | 1560/11268 [1:05:07<7:30:55,  2.79s/it]

{'loss': 0.3905, 'grad_norm': 11.670945167541504, 'learning_rate': 4.307774227902024e-06, 'epoch': 0.42}


 14%|█▍        | 1570/11268 [1:05:36<7:37:31,  2.83s/it]

{'loss': 0.662, 'grad_norm': 48.68906784057617, 'learning_rate': 4.303336883209088e-06, 'epoch': 0.42}


 14%|█▍        | 1580/11268 [1:06:04<7:32:27,  2.80s/it]

{'loss': 0.4915, 'grad_norm': 16.71038818359375, 'learning_rate': 4.2988995385161525e-06, 'epoch': 0.42}


 14%|█▍        | 1590/11268 [1:06:32<7:39:23,  2.85s/it]

{'loss': 0.6037, 'grad_norm': 24.46503448486328, 'learning_rate': 4.294462193823217e-06, 'epoch': 0.42}


 14%|█▍        | 1600/11268 [1:07:00<7:31:53,  2.80s/it]

{'loss': 0.5119, 'grad_norm': 42.81535720825195, 'learning_rate': 4.290024849130281e-06, 'epoch': 0.43}


 14%|█▍        | 1610/11268 [1:07:29<7:33:46,  2.82s/it]

{'loss': 0.384, 'grad_norm': 33.23320007324219, 'learning_rate': 4.285587504437345e-06, 'epoch': 0.43}


 14%|█▍        | 1620/11268 [1:07:57<7:37:36,  2.85s/it]

{'loss': 0.561, 'grad_norm': 36.269622802734375, 'learning_rate': 4.28115015974441e-06, 'epoch': 0.43}


 14%|█▍        | 1630/11268 [1:08:25<7:43:29,  2.89s/it]

{'loss': 0.4301, 'grad_norm': 93.0941162109375, 'learning_rate': 4.276712815051474e-06, 'epoch': 0.43}


 15%|█▍        | 1640/11268 [1:08:53<7:29:17,  2.80s/it]

{'loss': 0.4352, 'grad_norm': 51.127262115478516, 'learning_rate': 4.272275470358538e-06, 'epoch': 0.44}


 15%|█▍        | 1650/11268 [1:09:22<7:33:03,  2.83s/it]

{'loss': 0.518, 'grad_norm': 35.915279388427734, 'learning_rate': 4.267838125665602e-06, 'epoch': 0.44}


 15%|█▍        | 1660/11268 [1:09:51<7:31:22,  2.82s/it]

{'loss': 0.586, 'grad_norm': 31.928068161010742, 'learning_rate': 4.263400780972666e-06, 'epoch': 0.44}


 15%|█▍        | 1670/11268 [1:10:19<7:36:26,  2.85s/it]

{'loss': 0.4898, 'grad_norm': 43.50398635864258, 'learning_rate': 4.25896343627973e-06, 'epoch': 0.44}


 15%|█▍        | 1680/11268 [1:10:48<7:38:40,  2.87s/it]

{'loss': 0.6626, 'grad_norm': 16.387245178222656, 'learning_rate': 4.254526091586794e-06, 'epoch': 0.45}


 15%|█▍        | 1690/11268 [1:11:16<7:30:02,  2.82s/it]

{'loss': 0.4814, 'grad_norm': 25.265329360961914, 'learning_rate': 4.250088746893859e-06, 'epoch': 0.45}


 15%|█▌        | 1700/11268 [1:11:45<7:37:04,  2.87s/it]

{'loss': 0.4817, 'grad_norm': 59.30329895019531, 'learning_rate': 4.245651402200923e-06, 'epoch': 0.45}


 15%|█▌        | 1710/11268 [1:12:13<7:31:22,  2.83s/it]

{'loss': 0.5139, 'grad_norm': 36.43183135986328, 'learning_rate': 4.2412140575079875e-06, 'epoch': 0.46}


 15%|█▌        | 1720/11268 [1:12:42<7:41:19,  2.90s/it]

{'loss': 0.6202, 'grad_norm': 35.189395904541016, 'learning_rate': 4.2367767128150516e-06, 'epoch': 0.46}


 15%|█▌        | 1730/11268 [1:13:11<7:32:08,  2.84s/it]

{'loss': 0.5062, 'grad_norm': 36.17230987548828, 'learning_rate': 4.232339368122116e-06, 'epoch': 0.46}


 15%|█▌        | 1740/11268 [1:13:39<7:33:04,  2.85s/it]

{'loss': 0.5371, 'grad_norm': 29.44896697998047, 'learning_rate': 4.22790202342918e-06, 'epoch': 0.46}


 16%|█▌        | 1750/11268 [1:14:08<7:35:07,  2.87s/it]

{'loss': 0.4877, 'grad_norm': 29.84225082397461, 'learning_rate': 4.223464678736245e-06, 'epoch': 0.47}


 16%|█▌        | 1760/11268 [1:14:37<7:33:35,  2.86s/it]

{'loss': 0.368, 'grad_norm': 67.50408935546875, 'learning_rate': 4.219027334043309e-06, 'epoch': 0.47}


 16%|█▌        | 1770/11268 [1:15:05<7:31:16,  2.85s/it]

{'loss': 0.6171, 'grad_norm': 35.0632209777832, 'learning_rate': 4.214589989350373e-06, 'epoch': 0.47}


 16%|█▌        | 1780/11268 [1:15:34<7:36:09,  2.88s/it]

{'loss': 0.4611, 'grad_norm': 67.43580627441406, 'learning_rate': 4.210152644657438e-06, 'epoch': 0.47}


 16%|█▌        | 1790/11268 [1:16:03<7:38:02,  2.90s/it]

{'loss': 0.3923, 'grad_norm': 14.031696319580078, 'learning_rate': 4.205715299964502e-06, 'epoch': 0.48}


 16%|█▌        | 1800/11268 [1:16:33<7:42:55,  2.93s/it]

{'loss': 0.5624, 'grad_norm': 34.448787689208984, 'learning_rate': 4.201277955271566e-06, 'epoch': 0.48}


 16%|█▌        | 1810/11268 [1:17:02<7:35:17,  2.89s/it]

{'loss': 0.5367, 'grad_norm': 22.780643463134766, 'learning_rate': 4.19684061057863e-06, 'epoch': 0.48}


 16%|█▌        | 1820/11268 [1:17:31<7:33:24,  2.88s/it]

{'loss': 0.5435, 'grad_norm': 23.203998565673828, 'learning_rate': 4.192403265885694e-06, 'epoch': 0.48}


 16%|█▌        | 1830/11268 [1:18:00<7:36:23,  2.90s/it]

{'loss': 0.429, 'grad_norm': 29.038433074951172, 'learning_rate': 4.187965921192758e-06, 'epoch': 0.49}


 16%|█▋        | 1840/11268 [1:18:29<7:47:07,  2.97s/it]

{'loss': 0.6275, 'grad_norm': 46.48039245605469, 'learning_rate': 4.183528576499823e-06, 'epoch': 0.49}


 16%|█▋        | 1850/11268 [1:18:58<7:27:53,  2.85s/it]

{'loss': 0.5386, 'grad_norm': 25.627628326416016, 'learning_rate': 4.179091231806887e-06, 'epoch': 0.49}


 17%|█▋        | 1860/11268 [1:19:27<7:38:27,  2.92s/it]

{'loss': 0.5058, 'grad_norm': 16.85994529724121, 'learning_rate': 4.1746538871139515e-06, 'epoch': 0.5}


 17%|█▋        | 1870/11268 [1:19:57<7:37:24,  2.92s/it]

{'loss': 0.2934, 'grad_norm': 25.239809036254883, 'learning_rate': 4.170216542421016e-06, 'epoch': 0.5}


 17%|█▋        | 1880/11268 [1:20:26<7:37:01,  2.92s/it]

{'loss': 0.3887, 'grad_norm': 26.714866638183594, 'learning_rate': 4.16577919772808e-06, 'epoch': 0.5}


 17%|█▋        | 1890/11268 [1:20:56<7:41:49,  2.95s/it]

{'loss': 0.5879, 'grad_norm': 29.640674591064453, 'learning_rate': 4.161341853035144e-06, 'epoch': 0.5}


 17%|█▋        | 1900/11268 [1:21:25<7:40:23,  2.95s/it]

{'loss': 0.4893, 'grad_norm': 18.185070037841797, 'learning_rate': 4.156904508342208e-06, 'epoch': 0.51}


 17%|█▋        | 1910/11268 [1:21:55<7:33:33,  2.91s/it]

{'loss': 0.4306, 'grad_norm': 35.684303283691406, 'learning_rate': 4.152467163649273e-06, 'epoch': 0.51}


 17%|█▋        | 1920/11268 [1:22:24<7:34:37,  2.92s/it]

{'loss': 0.7325, 'grad_norm': 40.58506393432617, 'learning_rate': 4.148029818956337e-06, 'epoch': 0.51}


 17%|█▋        | 1930/11268 [1:22:53<7:37:53,  2.94s/it]

{'loss': 0.5623, 'grad_norm': 17.87203025817871, 'learning_rate': 4.143592474263401e-06, 'epoch': 0.51}


 17%|█▋        | 1940/11268 [1:23:23<7:30:50,  2.90s/it]

{'loss': 0.6075, 'grad_norm': 71.33222961425781, 'learning_rate': 4.139155129570465e-06, 'epoch': 0.52}


 17%|█▋        | 1950/11268 [1:23:50<7:06:20,  2.75s/it]

{'loss': 0.6177, 'grad_norm': 26.079328536987305, 'learning_rate': 4.134717784877529e-06, 'epoch': 0.52}


 17%|█▋        | 1960/11268 [1:24:18<7:10:53,  2.78s/it]

{'loss': 0.4658, 'grad_norm': 25.3037166595459, 'learning_rate': 4.130280440184593e-06, 'epoch': 0.52}


 17%|█▋        | 1970/11268 [1:24:46<7:18:29,  2.83s/it]

{'loss': 0.5115, 'grad_norm': 46.988792419433594, 'learning_rate': 4.125843095491658e-06, 'epoch': 0.52}


 18%|█▊        | 1980/11268 [1:25:15<7:20:34,  2.85s/it]

{'loss': 0.5386, 'grad_norm': 36.285133361816406, 'learning_rate': 4.121405750798722e-06, 'epoch': 0.53}


 18%|█▊        | 1990/11268 [1:25:43<7:15:52,  2.82s/it]

{'loss': 0.4324, 'grad_norm': 37.87679672241211, 'learning_rate': 4.1169684061057865e-06, 'epoch': 0.53}


 18%|█▊        | 2000/11268 [1:26:12<7:24:38,  2.88s/it]

{'loss': 0.6276, 'grad_norm': 25.39067840576172, 'learning_rate': 4.112531061412851e-06, 'epoch': 0.53}


 18%|█▊        | 2010/11268 [1:26:41<7:33:58,  2.94s/it]

{'loss': 0.4346, 'grad_norm': 20.298128128051758, 'learning_rate': 4.1080937167199155e-06, 'epoch': 0.54}


 18%|█▊        | 2020/11268 [1:27:11<7:25:15,  2.89s/it]

{'loss': 0.5676, 'grad_norm': 20.097126007080078, 'learning_rate': 4.10365637202698e-06, 'epoch': 0.54}


 18%|█▊        | 2030/11268 [1:27:39<7:18:08,  2.85s/it]

{'loss': 0.513, 'grad_norm': 30.568199157714844, 'learning_rate': 4.099219027334044e-06, 'epoch': 0.54}


 18%|█▊        | 2040/11268 [1:28:08<7:22:22,  2.88s/it]

{'loss': 0.5528, 'grad_norm': 24.185678482055664, 'learning_rate': 4.094781682641108e-06, 'epoch': 0.54}


 18%|█▊        | 2050/11268 [1:28:38<7:37:21,  2.98s/it]

{'loss': 0.4815, 'grad_norm': 15.934721946716309, 'learning_rate': 4.090344337948172e-06, 'epoch': 0.55}


 18%|█▊        | 2060/11268 [1:29:07<7:27:01,  2.91s/it]

{'loss': 0.5723, 'grad_norm': 41.82506561279297, 'learning_rate': 4.085906993255237e-06, 'epoch': 0.55}


 18%|█▊        | 2070/11268 [1:29:36<7:17:03,  2.85s/it]

{'loss': 0.4566, 'grad_norm': 36.954341888427734, 'learning_rate': 4.081469648562301e-06, 'epoch': 0.55}


 18%|█▊        | 2080/11268 [1:30:04<7:16:21,  2.85s/it]

{'loss': 0.6509, 'grad_norm': 49.04082489013672, 'learning_rate': 4.077032303869365e-06, 'epoch': 0.55}


 19%|█▊        | 2090/11268 [1:30:33<7:16:08,  2.85s/it]

{'loss': 0.5015, 'grad_norm': 17.157665252685547, 'learning_rate': 4.072594959176429e-06, 'epoch': 0.56}


 19%|█▊        | 2100/11268 [1:31:02<7:24:34,  2.91s/it]

{'loss': 0.5426, 'grad_norm': 21.952617645263672, 'learning_rate': 4.068157614483493e-06, 'epoch': 0.56}


 19%|█▊        | 2110/11268 [1:31:32<7:31:01,  2.95s/it]

{'loss': 0.4363, 'grad_norm': 23.275299072265625, 'learning_rate': 4.063720269790557e-06, 'epoch': 0.56}


 19%|█▉        | 2120/11268 [1:32:00<7:16:28,  2.86s/it]

{'loss': 0.475, 'grad_norm': 16.434040069580078, 'learning_rate': 4.0592829250976214e-06, 'epoch': 0.56}


 19%|█▉        | 2130/11268 [1:32:29<7:17:31,  2.87s/it]

{'loss': 0.5098, 'grad_norm': 37.54704666137695, 'learning_rate': 4.054845580404686e-06, 'epoch': 0.57}


 19%|█▉        | 2140/11268 [1:32:58<7:17:06,  2.87s/it]

{'loss': 0.4563, 'grad_norm': 9.03038501739502, 'learning_rate': 4.0504082357117505e-06, 'epoch': 0.57}


 19%|█▉        | 2150/11268 [1:33:27<7:19:03,  2.89s/it]

{'loss': 0.513, 'grad_norm': 87.7586669921875, 'learning_rate': 4.0459708910188146e-06, 'epoch': 0.57}


 19%|█▉        | 2160/11268 [1:33:56<7:18:29,  2.89s/it]

{'loss': 0.5034, 'grad_norm': 29.9242000579834, 'learning_rate': 4.041533546325879e-06, 'epoch': 0.58}


 19%|█▉        | 2170/11268 [1:34:25<7:13:49,  2.86s/it]

{'loss': 0.5702, 'grad_norm': 52.17824172973633, 'learning_rate': 4.037096201632943e-06, 'epoch': 0.58}


 19%|█▉        | 2180/11268 [1:34:55<7:24:00,  2.93s/it]

{'loss': 0.3469, 'grad_norm': 38.12932586669922, 'learning_rate': 4.032658856940008e-06, 'epoch': 0.58}


 19%|█▉        | 2190/11268 [1:35:24<7:16:12,  2.88s/it]

{'loss': 0.4787, 'grad_norm': 39.25871276855469, 'learning_rate': 4.028221512247072e-06, 'epoch': 0.58}


 20%|█▉        | 2200/11268 [1:35:53<7:18:52,  2.90s/it]

{'loss': 0.4713, 'grad_norm': 22.107851028442383, 'learning_rate': 4.023784167554136e-06, 'epoch': 0.59}


 20%|█▉        | 2210/11268 [1:36:23<7:35:47,  3.02s/it]

{'loss': 0.4283, 'grad_norm': 44.968650817871094, 'learning_rate': 4.0193468228612e-06, 'epoch': 0.59}


 20%|█▉        | 2220/11268 [1:36:55<7:35:42,  3.02s/it]

{'loss': 0.4444, 'grad_norm': 29.94749641418457, 'learning_rate': 4.014909478168265e-06, 'epoch': 0.59}


 20%|█▉        | 2230/11268 [1:37:25<7:36:52,  3.03s/it]

{'loss': 0.3339, 'grad_norm': 27.584457397460938, 'learning_rate': 4.010472133475329e-06, 'epoch': 0.59}


 20%|█▉        | 2240/11268 [1:37:55<7:29:19,  2.99s/it]

{'loss': 0.5293, 'grad_norm': 42.6247673034668, 'learning_rate': 4.006034788782393e-06, 'epoch': 0.6}


 20%|█▉        | 2250/11268 [1:38:24<7:25:18,  2.96s/it]

{'loss': 0.4374, 'grad_norm': 52.9679069519043, 'learning_rate': 4.001597444089457e-06, 'epoch': 0.6}


 20%|██        | 2260/11268 [1:38:54<7:24:34,  2.96s/it]

{'loss': 0.4075, 'grad_norm': 26.44418716430664, 'learning_rate': 3.997160099396521e-06, 'epoch': 0.6}


 20%|██        | 2270/11268 [1:39:24<7:16:53,  2.91s/it]

{'loss': 0.6161, 'grad_norm': 30.389057159423828, 'learning_rate': 3.9927227547035854e-06, 'epoch': 0.6}


 20%|██        | 2280/11268 [1:39:53<7:08:18,  2.86s/it]

{'loss': 0.6018, 'grad_norm': 56.16935348510742, 'learning_rate': 3.9882854100106495e-06, 'epoch': 0.61}


 20%|██        | 2290/11268 [1:40:24<7:48:40,  3.13s/it]

{'loss': 0.4651, 'grad_norm': 29.757858276367188, 'learning_rate': 3.9838480653177145e-06, 'epoch': 0.61}


 20%|██        | 2300/11268 [1:40:55<7:45:50,  3.12s/it]

{'loss': 0.5459, 'grad_norm': 69.78173828125, 'learning_rate': 3.979410720624779e-06, 'epoch': 0.61}


 21%|██        | 2310/11268 [1:41:27<8:09:59,  3.28s/it]

{'loss': 0.5168, 'grad_norm': 11.894743919372559, 'learning_rate': 3.974973375931843e-06, 'epoch': 0.62}


 21%|██        | 2320/11268 [1:42:00<8:23:47,  3.38s/it]

{'loss': 0.4355, 'grad_norm': 48.09218215942383, 'learning_rate': 3.970536031238907e-06, 'epoch': 0.62}


 21%|██        | 2330/11268 [1:42:34<8:32:20,  3.44s/it]

{'loss': 0.4098, 'grad_norm': 44.30209732055664, 'learning_rate': 3.966098686545971e-06, 'epoch': 0.62}


 21%|██        | 2340/11268 [1:43:06<7:47:53,  3.14s/it]

{'loss': 0.7032, 'grad_norm': 21.9451961517334, 'learning_rate': 3.961661341853035e-06, 'epoch': 0.62}


 21%|██        | 2350/11268 [1:43:37<7:34:28,  3.06s/it]

{'loss': 0.458, 'grad_norm': 24.848848342895508, 'learning_rate': 3.9572239971601e-06, 'epoch': 0.63}


 21%|██        | 2360/11268 [1:44:07<7:31:20,  3.04s/it]

{'loss': 0.5756, 'grad_norm': 34.55842971801758, 'learning_rate': 3.952786652467164e-06, 'epoch': 0.63}


 21%|██        | 2370/11268 [1:44:39<7:49:25,  3.17s/it]

{'loss': 0.3295, 'grad_norm': 33.10586166381836, 'learning_rate': 3.948349307774228e-06, 'epoch': 0.63}


 21%|██        | 2380/11268 [1:45:12<7:42:02,  3.12s/it]

{'loss': 0.3606, 'grad_norm': 18.658554077148438, 'learning_rate': 3.943911963081292e-06, 'epoch': 0.63}


 21%|██        | 2390/11268 [1:45:43<7:34:45,  3.07s/it]

{'loss': 0.3052, 'grad_norm': 49.64012145996094, 'learning_rate': 3.939474618388357e-06, 'epoch': 0.64}


 21%|██▏       | 2400/11268 [1:46:17<8:27:28,  3.43s/it]

{'loss': 0.2831, 'grad_norm': 16.022010803222656, 'learning_rate': 3.935037273695421e-06, 'epoch': 0.64}


 21%|██▏       | 2410/11268 [1:46:48<7:32:15,  3.06s/it]

{'loss': 0.5239, 'grad_norm': 56.64069366455078, 'learning_rate': 3.930599929002485e-06, 'epoch': 0.64}


 21%|██▏       | 2420/11268 [1:47:19<7:22:41,  3.00s/it]

{'loss': 0.4597, 'grad_norm': 57.587303161621094, 'learning_rate': 3.9261625843095495e-06, 'epoch': 0.64}


 22%|██▏       | 2430/11268 [1:47:50<7:55:01,  3.22s/it]

{'loss': 0.6487, 'grad_norm': 42.01318359375, 'learning_rate': 3.9217252396166136e-06, 'epoch': 0.65}


 22%|██▏       | 2440/11268 [1:48:20<7:19:35,  2.99s/it]

{'loss': 0.6415, 'grad_norm': 104.44071960449219, 'learning_rate': 3.9172878949236785e-06, 'epoch': 0.65}


 22%|██▏       | 2450/11268 [1:48:50<7:12:18,  2.94s/it]

{'loss': 0.4841, 'grad_norm': 16.419157028198242, 'learning_rate': 3.912850550230743e-06, 'epoch': 0.65}


 22%|██▏       | 2460/11268 [1:49:19<7:01:42,  2.87s/it]

{'loss': 0.66, 'grad_norm': 38.34169387817383, 'learning_rate': 3.908413205537807e-06, 'epoch': 0.65}


 22%|██▏       | 2470/11268 [1:49:49<7:06:31,  2.91s/it]

{'loss': 0.5374, 'grad_norm': 14.542204856872559, 'learning_rate': 3.903975860844871e-06, 'epoch': 0.66}


 22%|██▏       | 2480/11268 [1:50:19<7:12:37,  2.95s/it]

{'loss': 0.4279, 'grad_norm': 19.804458618164062, 'learning_rate': 3.899538516151935e-06, 'epoch': 0.66}


 22%|██▏       | 2490/11268 [1:50:48<7:03:57,  2.90s/it]

{'loss': 0.4125, 'grad_norm': 42.396751403808594, 'learning_rate': 3.895101171458999e-06, 'epoch': 0.66}


 22%|██▏       | 2500/11268 [1:51:18<7:21:33,  3.02s/it]

{'loss': 0.5548, 'grad_norm': 46.004024505615234, 'learning_rate': 3.890663826766063e-06, 'epoch': 0.67}


 22%|██▏       | 2510/11268 [1:51:47<7:00:58,  2.88s/it]

{'loss': 0.4561, 'grad_norm': 57.96288299560547, 'learning_rate': 3.886226482073128e-06, 'epoch': 0.67}


 22%|██▏       | 2520/11268 [1:52:15<6:54:03,  2.84s/it]

{'loss': 0.4999, 'grad_norm': 24.315519332885742, 'learning_rate': 3.881789137380192e-06, 'epoch': 0.67}


 22%|██▏       | 2530/11268 [1:52:44<6:58:23,  2.87s/it]

{'loss': 0.5261, 'grad_norm': 13.40101146697998, 'learning_rate': 3.877351792687256e-06, 'epoch': 0.67}


 23%|██▎       | 2540/11268 [1:53:13<6:59:24,  2.88s/it]

{'loss': 0.4626, 'grad_norm': 37.81547164916992, 'learning_rate': 3.87291444799432e-06, 'epoch': 0.68}


 23%|██▎       | 2550/11268 [1:53:42<7:00:32,  2.89s/it]

{'loss': 0.5874, 'grad_norm': 39.67838668823242, 'learning_rate': 3.8684771033013844e-06, 'epoch': 0.68}


 23%|██▎       | 2560/11268 [1:54:10<6:57:55,  2.88s/it]

{'loss': 0.5641, 'grad_norm': 23.241886138916016, 'learning_rate': 3.8640397586084485e-06, 'epoch': 0.68}


 23%|██▎       | 2570/11268 [1:54:39<6:59:55,  2.90s/it]

{'loss': 0.6331, 'grad_norm': 25.078184127807617, 'learning_rate': 3.859602413915513e-06, 'epoch': 0.68}


 23%|██▎       | 2580/11268 [1:55:09<7:05:37,  2.94s/it]

{'loss': 0.4448, 'grad_norm': 20.75146484375, 'learning_rate': 3.855165069222578e-06, 'epoch': 0.69}


 23%|██▎       | 2590/11268 [1:55:39<7:20:07,  3.04s/it]

{'loss': 0.3436, 'grad_norm': 12.216752052307129, 'learning_rate': 3.850727724529642e-06, 'epoch': 0.69}


 23%|██▎       | 2600/11268 [1:56:10<7:21:57,  3.06s/it]

{'loss': 0.6558, 'grad_norm': 41.08386993408203, 'learning_rate': 3.846290379836707e-06, 'epoch': 0.69}


 23%|██▎       | 2610/11268 [1:56:40<7:18:11,  3.04s/it]

{'loss': 0.2737, 'grad_norm': 10.4852933883667, 'learning_rate': 3.841853035143771e-06, 'epoch': 0.69}


 23%|██▎       | 2620/11268 [1:57:09<7:09:03,  2.98s/it]

{'loss': 0.3726, 'grad_norm': 28.873003005981445, 'learning_rate': 3.837415690450835e-06, 'epoch': 0.7}


 23%|██▎       | 2630/11268 [1:57:40<7:19:13,  3.05s/it]

{'loss': 0.5008, 'grad_norm': 39.70866775512695, 'learning_rate': 3.832978345757899e-06, 'epoch': 0.7}


 23%|██▎       | 2640/11268 [1:58:10<7:17:43,  3.04s/it]

{'loss': 0.4126, 'grad_norm': 31.29554557800293, 'learning_rate': 3.828541001064963e-06, 'epoch': 0.7}


 24%|██▎       | 2650/11268 [1:58:40<7:05:19,  2.96s/it]

{'loss': 0.4141, 'grad_norm': 6.893424987792969, 'learning_rate': 3.824103656372027e-06, 'epoch': 0.71}


 24%|██▎       | 2660/11268 [1:59:10<7:13:09,  3.02s/it]

{'loss': 0.6189, 'grad_norm': 74.72613525390625, 'learning_rate': 3.819666311679091e-06, 'epoch': 0.71}


 24%|██▎       | 2670/11268 [1:59:41<7:06:30,  2.98s/it]

{'loss': 0.5298, 'grad_norm': 76.01870727539062, 'learning_rate': 3.815228966986156e-06, 'epoch': 0.71}


 24%|██▍       | 2680/11268 [2:00:11<7:12:26,  3.02s/it]

{'loss': 0.3905, 'grad_norm': 17.088764190673828, 'learning_rate': 3.8107916222932203e-06, 'epoch': 0.71}


 24%|██▍       | 2690/11268 [2:00:41<7:18:05,  3.06s/it]

{'loss': 0.5484, 'grad_norm': 46.16141128540039, 'learning_rate': 3.8063542776002844e-06, 'epoch': 0.72}


 24%|██▍       | 2700/11268 [2:01:14<7:31:44,  3.16s/it]

{'loss': 0.4768, 'grad_norm': 16.93437385559082, 'learning_rate': 3.8019169329073485e-06, 'epoch': 0.72}


 24%|██▍       | 2710/11268 [2:01:46<8:08:55,  3.43s/it]

{'loss': 0.5061, 'grad_norm': 41.89393615722656, 'learning_rate': 3.7974795882144125e-06, 'epoch': 0.72}


 24%|██▍       | 2720/11268 [2:02:16<7:14:01,  3.05s/it]

{'loss': 0.3526, 'grad_norm': 33.18333435058594, 'learning_rate': 3.793042243521477e-06, 'epoch': 0.72}


 24%|██▍       | 2730/11268 [2:02:46<6:53:54,  2.91s/it]

{'loss': 0.6246, 'grad_norm': 10.398252487182617, 'learning_rate': 3.788604898828541e-06, 'epoch': 0.73}


 24%|██▍       | 2740/11268 [2:03:15<6:51:35,  2.90s/it]

{'loss': 0.5033, 'grad_norm': 28.568174362182617, 'learning_rate': 3.7841675541356053e-06, 'epoch': 0.73}


 24%|██▍       | 2750/11268 [2:03:43<6:42:32,  2.84s/it]

{'loss': 0.3727, 'grad_norm': 53.6268196105957, 'learning_rate': 3.77973020944267e-06, 'epoch': 0.73}


 24%|██▍       | 2760/11268 [2:04:12<6:49:31,  2.89s/it]

{'loss': 0.3693, 'grad_norm': 55.8770637512207, 'learning_rate': 3.775292864749734e-06, 'epoch': 0.73}


 25%|██▍       | 2770/11268 [2:04:43<7:15:15,  3.07s/it]

{'loss': 0.2361, 'grad_norm': 39.622161865234375, 'learning_rate': 3.770855520056798e-06, 'epoch': 0.74}


 25%|██▍       | 2780/11268 [2:05:13<7:07:10,  3.02s/it]

{'loss': 0.3064, 'grad_norm': 1.1328420639038086, 'learning_rate': 3.7664181753638625e-06, 'epoch': 0.74}


 25%|██▍       | 2790/11268 [2:05:43<7:05:40,  3.01s/it]

{'loss': 0.4954, 'grad_norm': 49.34321212768555, 'learning_rate': 3.7619808306709266e-06, 'epoch': 0.74}


 25%|██▍       | 2800/11268 [2:06:15<7:14:08,  3.08s/it]

{'loss': 0.482, 'grad_norm': 32.354949951171875, 'learning_rate': 3.7575434859779916e-06, 'epoch': 0.75}


 25%|██▍       | 2810/11268 [2:06:44<6:45:08,  2.87s/it]

{'loss': 0.5572, 'grad_norm': 48.409175872802734, 'learning_rate': 3.7531061412850556e-06, 'epoch': 0.75}


 25%|██▌       | 2820/11268 [2:07:14<7:07:43,  3.04s/it]

{'loss': 0.4575, 'grad_norm': 26.978425979614258, 'learning_rate': 3.7486687965921197e-06, 'epoch': 0.75}


 25%|██▌       | 2830/11268 [2:07:42<6:41:09,  2.85s/it]

{'loss': 0.5782, 'grad_norm': 41.84669494628906, 'learning_rate': 3.744231451899184e-06, 'epoch': 0.75}


 25%|██▌       | 2840/11268 [2:08:11<6:49:27,  2.91s/it]

{'loss': 0.4928, 'grad_norm': 39.736915588378906, 'learning_rate': 3.7397941072062484e-06, 'epoch': 0.76}


 25%|██▌       | 2850/11268 [2:08:41<7:01:50,  3.01s/it]

{'loss': 0.4704, 'grad_norm': 26.33810806274414, 'learning_rate': 3.7353567625133125e-06, 'epoch': 0.76}


 25%|██▌       | 2860/11268 [2:09:12<7:10:14,  3.07s/it]

{'loss': 0.5336, 'grad_norm': 25.78823471069336, 'learning_rate': 3.7309194178203766e-06, 'epoch': 0.76}


 25%|██▌       | 2870/11268 [2:09:42<7:00:26,  3.00s/it]

{'loss': 0.3914, 'grad_norm': 50.200721740722656, 'learning_rate': 3.726482073127441e-06, 'epoch': 0.76}


 26%|██▌       | 2880/11268 [2:10:12<6:47:46,  2.92s/it]

{'loss': 0.5317, 'grad_norm': 35.294254302978516, 'learning_rate': 3.722044728434505e-06, 'epoch': 0.77}


 26%|██▌       | 2890/11268 [2:10:42<7:05:51,  3.05s/it]

{'loss': 0.4422, 'grad_norm': 45.99223327636719, 'learning_rate': 3.7176073837415693e-06, 'epoch': 0.77}


 26%|██▌       | 2900/11268 [2:11:13<6:55:33,  2.98s/it]

{'loss': 0.5555, 'grad_norm': 23.060827255249023, 'learning_rate': 3.713170039048634e-06, 'epoch': 0.77}


 26%|██▌       | 2910/11268 [2:11:41<6:29:18,  2.79s/it]

{'loss': 0.3346, 'grad_norm': 52.92953872680664, 'learning_rate': 3.708732694355698e-06, 'epoch': 0.77}


 26%|██▌       | 2920/11268 [2:12:10<6:50:52,  2.95s/it]

{'loss': 0.4427, 'grad_norm': 48.301513671875, 'learning_rate': 3.704295349662762e-06, 'epoch': 0.78}


 26%|██▌       | 2930/11268 [2:12:41<6:51:05,  2.96s/it]

{'loss': 0.4144, 'grad_norm': 48.90705108642578, 'learning_rate': 3.699858004969826e-06, 'epoch': 0.78}


 26%|██▌       | 2940/11268 [2:13:10<6:41:17,  2.89s/it]

{'loss': 0.4262, 'grad_norm': 50.751583099365234, 'learning_rate': 3.6954206602768906e-06, 'epoch': 0.78}


 26%|██▌       | 2950/11268 [2:13:38<6:32:07,  2.83s/it]

{'loss': 0.4766, 'grad_norm': 36.761234283447266, 'learning_rate': 3.6909833155839547e-06, 'epoch': 0.79}


 26%|██▋       | 2960/11268 [2:14:07<6:33:02,  2.84s/it]

{'loss': 0.462, 'grad_norm': 39.654090881347656, 'learning_rate': 3.686545970891019e-06, 'epoch': 0.79}


 26%|██▋       | 2970/11268 [2:14:35<6:37:30,  2.87s/it]

{'loss': 0.4302, 'grad_norm': 40.53966522216797, 'learning_rate': 3.6821086261980833e-06, 'epoch': 0.79}


 26%|██▋       | 2980/11268 [2:15:05<6:48:11,  2.96s/it]

{'loss': 0.3504, 'grad_norm': 39.34804153442383, 'learning_rate': 3.6776712815051474e-06, 'epoch': 0.79}


 27%|██▋       | 2990/11268 [2:15:34<6:43:28,  2.92s/it]

{'loss': 0.3504, 'grad_norm': 39.78946304321289, 'learning_rate': 3.6732339368122115e-06, 'epoch': 0.8}


 27%|██▋       | 3000/11268 [2:16:04<7:00:22,  3.05s/it]

{'loss': 0.4809, 'grad_norm': 53.113887786865234, 'learning_rate': 3.668796592119276e-06, 'epoch': 0.8}


 27%|██▋       | 3010/11268 [2:16:34<6:38:39,  2.90s/it]

{'loss': 0.5108, 'grad_norm': 46.49972915649414, 'learning_rate': 3.6643592474263406e-06, 'epoch': 0.8}


 27%|██▋       | 3020/11268 [2:17:03<6:35:55,  2.88s/it]

{'loss': 0.5426, 'grad_norm': 4.3874921798706055, 'learning_rate': 3.6599219027334047e-06, 'epoch': 0.8}


 27%|██▋       | 3030/11268 [2:17:32<6:50:28,  2.99s/it]

{'loss': 0.3684, 'grad_norm': 35.56306457519531, 'learning_rate': 3.655484558040469e-06, 'epoch': 0.81}


 27%|██▋       | 3040/11268 [2:18:02<6:35:20,  2.88s/it]

{'loss': 0.5309, 'grad_norm': 54.54863739013672, 'learning_rate': 3.6510472133475333e-06, 'epoch': 0.81}


 27%|██▋       | 3050/11268 [2:18:31<6:34:41,  2.88s/it]

{'loss': 0.3355, 'grad_norm': 39.69523620605469, 'learning_rate': 3.6466098686545974e-06, 'epoch': 0.81}


 27%|██▋       | 3060/11268 [2:19:00<6:30:29,  2.85s/it]

{'loss': 0.5092, 'grad_norm': 49.38870620727539, 'learning_rate': 3.642172523961662e-06, 'epoch': 0.81}


 27%|██▋       | 3070/11268 [2:19:29<6:30:18,  2.86s/it]

{'loss': 0.3772, 'grad_norm': 34.680328369140625, 'learning_rate': 3.637735179268726e-06, 'epoch': 0.82}


 27%|██▋       | 3080/11268 [2:19:58<6:38:58,  2.92s/it]

{'loss': 0.4852, 'grad_norm': 16.585277557373047, 'learning_rate': 3.63329783457579e-06, 'epoch': 0.82}


 27%|██▋       | 3090/11268 [2:20:27<6:33:49,  2.89s/it]

{'loss': 0.598, 'grad_norm': 35.107913970947266, 'learning_rate': 3.6288604898828546e-06, 'epoch': 0.82}


 28%|██▊       | 3100/11268 [2:20:56<6:38:49,  2.93s/it]

{'loss': 0.4372, 'grad_norm': 34.67172622680664, 'learning_rate': 3.6244231451899187e-06, 'epoch': 0.83}


 28%|██▊       | 3110/11268 [2:21:25<6:34:00,  2.90s/it]

{'loss': 0.5542, 'grad_norm': 69.40287017822266, 'learning_rate': 3.619985800496983e-06, 'epoch': 0.83}


 28%|██▊       | 3120/11268 [2:21:54<6:30:17,  2.87s/it]

{'loss': 0.4781, 'grad_norm': 26.95096206665039, 'learning_rate': 3.615548455804047e-06, 'epoch': 0.83}


 28%|██▊       | 3130/11268 [2:22:22<6:21:26,  2.81s/it]

{'loss': 0.5491, 'grad_norm': 28.22862434387207, 'learning_rate': 3.6111111111111115e-06, 'epoch': 0.83}


 28%|██▊       | 3140/11268 [2:22:51<6:31:13,  2.89s/it]

{'loss': 0.5018, 'grad_norm': 46.3458366394043, 'learning_rate': 3.6066737664181756e-06, 'epoch': 0.84}


 28%|██▊       | 3150/11268 [2:23:22<6:43:11,  2.98s/it]

{'loss': 0.5572, 'grad_norm': 69.2251968383789, 'learning_rate': 3.6022364217252397e-06, 'epoch': 0.84}


 28%|██▊       | 3160/11268 [2:23:51<6:26:27,  2.86s/it]

{'loss': 0.2832, 'grad_norm': 23.1965274810791, 'learning_rate': 3.597799077032304e-06, 'epoch': 0.84}


 28%|██▊       | 3170/11268 [2:24:19<6:31:42,  2.90s/it]

{'loss': 0.4113, 'grad_norm': 64.3228988647461, 'learning_rate': 3.5933617323393683e-06, 'epoch': 0.84}


 28%|██▊       | 3180/11268 [2:24:49<6:27:33,  2.88s/it]

{'loss': 0.2894, 'grad_norm': 19.056867599487305, 'learning_rate': 3.5889243876464324e-06, 'epoch': 0.85}


 28%|██▊       | 3190/11268 [2:25:17<6:25:01,  2.86s/it]

{'loss': 0.4683, 'grad_norm': 16.736753463745117, 'learning_rate': 3.584487042953497e-06, 'epoch': 0.85}


 28%|██▊       | 3200/11268 [2:25:46<6:26:51,  2.88s/it]

{'loss': 0.4359, 'grad_norm': 21.99988555908203, 'learning_rate': 3.580049698260561e-06, 'epoch': 0.85}


 28%|██▊       | 3210/11268 [2:26:14<6:23:53,  2.86s/it]

{'loss': 0.4584, 'grad_norm': 66.83743286132812, 'learning_rate': 3.575612353567625e-06, 'epoch': 0.85}


 29%|██▊       | 3220/11268 [2:26:44<6:32:19,  2.92s/it]

{'loss': 0.4841, 'grad_norm': 136.4222869873047, 'learning_rate': 3.57117500887469e-06, 'epoch': 0.86}


 29%|██▊       | 3230/11268 [2:27:12<6:22:16,  2.85s/it]

{'loss': 0.5208, 'grad_norm': 48.702186584472656, 'learning_rate': 3.566737664181754e-06, 'epoch': 0.86}


 29%|██▉       | 3240/11268 [2:27:41<6:22:15,  2.86s/it]

{'loss': 0.3549, 'grad_norm': 17.272945404052734, 'learning_rate': 3.5623003194888182e-06, 'epoch': 0.86}


 29%|██▉       | 3250/11268 [2:28:11<6:30:35,  2.92s/it]

{'loss': 0.397, 'grad_norm': 55.59714126586914, 'learning_rate': 3.5578629747958828e-06, 'epoch': 0.87}


 29%|██▉       | 3260/11268 [2:28:40<6:26:22,  2.89s/it]

{'loss': 0.5792, 'grad_norm': 24.468503952026367, 'learning_rate': 3.553425630102947e-06, 'epoch': 0.87}


 29%|██▉       | 3270/11268 [2:29:10<6:28:28,  2.91s/it]

{'loss': 0.4701, 'grad_norm': 20.15557289123535, 'learning_rate': 3.548988285410011e-06, 'epoch': 0.87}


 29%|██▉       | 3280/11268 [2:29:39<6:18:28,  2.84s/it]

{'loss': 0.4938, 'grad_norm': 3.6037347316741943, 'learning_rate': 3.5445509407170755e-06, 'epoch': 0.87}


 29%|██▉       | 3290/11268 [2:30:07<6:20:53,  2.86s/it]

{'loss': 0.4292, 'grad_norm': 36.691715240478516, 'learning_rate': 3.5401135960241396e-06, 'epoch': 0.88}


 29%|██▉       | 3300/11268 [2:30:36<6:23:00,  2.88s/it]

{'loss': 0.5849, 'grad_norm': 38.12320327758789, 'learning_rate': 3.5356762513312037e-06, 'epoch': 0.88}


 29%|██▉       | 3310/11268 [2:31:06<6:29:40,  2.94s/it]

{'loss': 0.2545, 'grad_norm': 31.022741317749023, 'learning_rate': 3.5312389066382678e-06, 'epoch': 0.88}


 29%|██▉       | 3320/11268 [2:31:35<6:24:56,  2.91s/it]

{'loss': 0.4597, 'grad_norm': 24.874244689941406, 'learning_rate': 3.5268015619453323e-06, 'epoch': 0.88}


 30%|██▉       | 3330/11268 [2:32:04<6:19:44,  2.87s/it]

{'loss': 0.4707, 'grad_norm': 33.82793426513672, 'learning_rate': 3.5223642172523964e-06, 'epoch': 0.89}


 30%|██▉       | 3340/11268 [2:32:32<6:14:41,  2.84s/it]

{'loss': 0.3651, 'grad_norm': 25.76340103149414, 'learning_rate': 3.5179268725594605e-06, 'epoch': 0.89}


 30%|██▉       | 3350/11268 [2:33:01<6:22:26,  2.90s/it]

{'loss': 0.4041, 'grad_norm': 36.18081283569336, 'learning_rate': 3.513489527866525e-06, 'epoch': 0.89}


 30%|██▉       | 3360/11268 [2:33:33<6:59:11,  3.18s/it]

{'loss': 0.4119, 'grad_norm': 43.22450256347656, 'learning_rate': 3.509052183173589e-06, 'epoch': 0.89}


 30%|██▉       | 3370/11268 [2:34:04<6:43:29,  3.07s/it]

{'loss': 0.4748, 'grad_norm': 25.002256393432617, 'learning_rate': 3.504614838480653e-06, 'epoch': 0.9}


 30%|██▉       | 3380/11268 [2:34:32<6:13:46,  2.84s/it]

{'loss': 0.352, 'grad_norm': 3.837367534637451, 'learning_rate': 3.5001774937877177e-06, 'epoch': 0.9}


 30%|███       | 3390/11268 [2:35:04<6:42:18,  3.06s/it]

{'loss': 0.2963, 'grad_norm': 33.43516159057617, 'learning_rate': 3.495740149094782e-06, 'epoch': 0.9}


 30%|███       | 3400/11268 [2:35:34<6:26:19,  2.95s/it]

{'loss': 0.4292, 'grad_norm': 46.475669860839844, 'learning_rate': 3.491302804401846e-06, 'epoch': 0.91}


 30%|███       | 3410/11268 [2:36:04<6:29:18,  2.97s/it]

{'loss': 0.527, 'grad_norm': 8.306092262268066, 'learning_rate': 3.48686545970891e-06, 'epoch': 0.91}


 30%|███       | 3420/11268 [2:36:37<7:55:04,  3.63s/it]

{'loss': 0.5575, 'grad_norm': 52.78518295288086, 'learning_rate': 3.482428115015975e-06, 'epoch': 0.91}


 30%|███       | 3430/11268 [2:37:07<6:34:52,  3.02s/it]

{'loss': 0.5689, 'grad_norm': 39.20545959472656, 'learning_rate': 3.477990770323039e-06, 'epoch': 0.91}


 31%|███       | 3440/11268 [2:37:38<6:29:16,  2.98s/it]

{'loss': 0.4101, 'grad_norm': 19.458885192871094, 'learning_rate': 3.4735534256301036e-06, 'epoch': 0.92}


 31%|███       | 3450/11268 [2:38:06<6:05:50,  2.81s/it]

{'loss': 0.4009, 'grad_norm': 55.984554290771484, 'learning_rate': 3.4691160809371677e-06, 'epoch': 0.92}


 31%|███       | 3460/11268 [2:38:34<6:05:26,  2.81s/it]

{'loss': 0.4437, 'grad_norm': 23.385419845581055, 'learning_rate': 3.4646787362442318e-06, 'epoch': 0.92}


 31%|███       | 3470/11268 [2:39:02<6:04:51,  2.81s/it]

{'loss': 0.4475, 'grad_norm': 18.056705474853516, 'learning_rate': 3.4602413915512963e-06, 'epoch': 0.92}


 31%|███       | 3480/11268 [2:39:31<6:06:24,  2.82s/it]

{'loss': 0.4416, 'grad_norm': 41.02918243408203, 'learning_rate': 3.4558040468583604e-06, 'epoch': 0.93}


 31%|███       | 3490/11268 [2:39:59<6:13:39,  2.88s/it]

{'loss': 0.4073, 'grad_norm': 13.734370231628418, 'learning_rate': 3.4513667021654245e-06, 'epoch': 0.93}


 31%|███       | 3500/11268 [2:40:28<6:12:38,  2.88s/it]

{'loss': 0.429, 'grad_norm': 40.19343948364258, 'learning_rate': 3.4469293574724886e-06, 'epoch': 0.93}


 31%|███       | 3510/11268 [2:40:56<6:12:39,  2.88s/it]

{'loss': 0.5276, 'grad_norm': 15.645760536193848, 'learning_rate': 3.442492012779553e-06, 'epoch': 0.93}


 31%|███       | 3520/11268 [2:41:25<6:13:10,  2.89s/it]

{'loss': 0.5557, 'grad_norm': 49.22931671142578, 'learning_rate': 3.4380546680866172e-06, 'epoch': 0.94}


 31%|███▏      | 3530/11268 [2:41:55<6:19:00,  2.94s/it]

{'loss': 0.4071, 'grad_norm': 20.097721099853516, 'learning_rate': 3.4336173233936813e-06, 'epoch': 0.94}


 31%|███▏      | 3540/11268 [2:42:24<6:11:23,  2.88s/it]

{'loss': 0.448, 'grad_norm': 33.681278228759766, 'learning_rate': 3.429179978700746e-06, 'epoch': 0.94}


 32%|███▏      | 3550/11268 [2:42:52<6:04:22,  2.83s/it]

{'loss': 0.5137, 'grad_norm': 19.279861450195312, 'learning_rate': 3.42474263400781e-06, 'epoch': 0.95}


 32%|███▏      | 3560/11268 [2:43:22<6:37:55,  3.10s/it]

{'loss': 0.4027, 'grad_norm': 17.787982940673828, 'learning_rate': 3.420305289314874e-06, 'epoch': 0.95}


 32%|███▏      | 3570/11268 [2:43:51<6:07:52,  2.87s/it]

{'loss': 0.6265, 'grad_norm': 46.412715911865234, 'learning_rate': 3.4158679446219386e-06, 'epoch': 0.95}


 32%|███▏      | 3580/11268 [2:44:20<6:20:31,  2.97s/it]

{'loss': 0.5602, 'grad_norm': 30.365901947021484, 'learning_rate': 3.4114305999290027e-06, 'epoch': 0.95}


 32%|███▏      | 3590/11268 [2:44:49<6:03:42,  2.84s/it]

{'loss': 0.4984, 'grad_norm': 41.739784240722656, 'learning_rate': 3.4069932552360668e-06, 'epoch': 0.96}


 32%|███▏      | 3600/11268 [2:45:17<6:02:18,  2.83s/it]

{'loss': 0.4311, 'grad_norm': 8.452544212341309, 'learning_rate': 3.402555910543131e-06, 'epoch': 0.96}


 32%|███▏      | 3610/11268 [2:45:46<6:10:06,  2.90s/it]

{'loss': 0.4545, 'grad_norm': 14.705548286437988, 'learning_rate': 3.3981185658501954e-06, 'epoch': 0.96}


 32%|███▏      | 3620/11268 [2:46:15<6:05:14,  2.87s/it]

{'loss': 0.3465, 'grad_norm': 24.49713897705078, 'learning_rate': 3.3936812211572595e-06, 'epoch': 0.96}


 32%|███▏      | 3630/11268 [2:46:44<6:06:40,  2.88s/it]

{'loss': 0.4129, 'grad_norm': 35.99775314331055, 'learning_rate': 3.3892438764643244e-06, 'epoch': 0.97}


 32%|███▏      | 3640/11268 [2:47:13<6:21:19,  3.00s/it]

{'loss': 0.4973, 'grad_norm': 56.255348205566406, 'learning_rate': 3.3848065317713885e-06, 'epoch': 0.97}


 32%|███▏      | 3650/11268 [2:47:43<6:16:45,  2.97s/it]

{'loss': 0.286, 'grad_norm': 38.99000930786133, 'learning_rate': 3.3803691870784526e-06, 'epoch': 0.97}


 32%|███▏      | 3660/11268 [2:48:12<6:07:36,  2.90s/it]

{'loss': 0.5008, 'grad_norm': 29.624326705932617, 'learning_rate': 3.375931842385517e-06, 'epoch': 0.97}


 33%|███▎      | 3670/11268 [2:48:42<6:14:19,  2.96s/it]

{'loss': 0.4797, 'grad_norm': 50.39006042480469, 'learning_rate': 3.3714944976925812e-06, 'epoch': 0.98}


 33%|███▎      | 3680/11268 [2:49:10<5:57:33,  2.83s/it]

{'loss': 0.5441, 'grad_norm': 48.023006439208984, 'learning_rate': 3.3670571529996453e-06, 'epoch': 0.98}


 33%|███▎      | 3690/11268 [2:49:39<5:57:43,  2.83s/it]

{'loss': 0.4854, 'grad_norm': 32.83159637451172, 'learning_rate': 3.36261980830671e-06, 'epoch': 0.98}


 33%|███▎      | 3700/11268 [2:50:07<5:52:57,  2.80s/it]

{'loss': 0.5452, 'grad_norm': 57.853755950927734, 'learning_rate': 3.358182463613774e-06, 'epoch': 0.99}


 33%|███▎      | 3710/11268 [2:50:35<5:56:35,  2.83s/it]

{'loss': 0.3642, 'grad_norm': 24.01287841796875, 'learning_rate': 3.353745118920838e-06, 'epoch': 0.99}


 33%|███▎      | 3720/11268 [2:51:04<5:59:52,  2.86s/it]

{'loss': 0.4246, 'grad_norm': 15.74479866027832, 'learning_rate': 3.349307774227902e-06, 'epoch': 0.99}


 33%|███▎      | 3730/11268 [2:51:32<5:57:08,  2.84s/it]

{'loss': 0.4976, 'grad_norm': 38.5820198059082, 'learning_rate': 3.3448704295349667e-06, 'epoch': 0.99}


 33%|███▎      | 3740/11268 [2:52:01<5:54:35,  2.83s/it]

{'loss': 0.4632, 'grad_norm': 23.191646575927734, 'learning_rate': 3.3404330848420308e-06, 'epoch': 1.0}


 33%|███▎      | 3750/11268 [2:52:29<5:54:35,  2.83s/it]

{'loss': 0.3077, 'grad_norm': 20.22792625427246, 'learning_rate': 3.335995740149095e-06, 'epoch': 1.0}


                                                        
 33%|███▎      | 3756/11268 [2:59:17<5:35:30,  2.68s/it]

{'eval_loss': 0.4010973274707794, 'eval_runtime': 391.327, 'eval_samples_per_second': 9.598, 'eval_steps_per_second': 1.201, 'epoch': 1.0}


 33%|███▎      | 3760/11268 [2:59:32<90:31:12, 43.40s/it]  

{'loss': 0.4895, 'grad_norm': 24.37016487121582, 'learning_rate': 3.3315583954561594e-06, 'epoch': 1.0}


 33%|███▎      | 3770/11268 [3:00:00<8:20:57,  4.01s/it] 

{'loss': 0.331, 'grad_norm': 53.734981536865234, 'learning_rate': 3.3271210507632235e-06, 'epoch': 1.0}


 34%|███▎      | 3780/11268 [3:00:27<5:49:08,  2.80s/it]

{'loss': 0.2463, 'grad_norm': 53.42403793334961, 'learning_rate': 3.3226837060702876e-06, 'epoch': 1.01}


 34%|███▎      | 3790/11268 [3:00:55<5:42:05,  2.74s/it]

{'loss': 0.4091, 'grad_norm': 51.97494125366211, 'learning_rate': 3.318246361377352e-06, 'epoch': 1.01}


 34%|███▎      | 3800/11268 [3:01:22<5:43:57,  2.76s/it]

{'loss': 0.2977, 'grad_norm': 28.96375274658203, 'learning_rate': 3.313809016684416e-06, 'epoch': 1.01}


 34%|███▍      | 3810/11268 [3:01:50<5:48:02,  2.80s/it]

{'loss': 0.4292, 'grad_norm': 48.554866790771484, 'learning_rate': 3.3093716719914803e-06, 'epoch': 1.01}


 34%|███▍      | 3820/11268 [3:02:18<5:40:27,  2.74s/it]

{'loss': 0.3101, 'grad_norm': 39.90666198730469, 'learning_rate': 3.3049343272985444e-06, 'epoch': 1.02}


 34%|███▍      | 3830/11268 [3:02:45<5:36:30,  2.71s/it]

{'loss': 0.3791, 'grad_norm': 58.565547943115234, 'learning_rate': 3.300496982605609e-06, 'epoch': 1.02}


 34%|███▍      | 3840/11268 [3:03:12<5:36:18,  2.72s/it]

{'loss': 0.4418, 'grad_norm': 53.34403991699219, 'learning_rate': 3.2960596379126734e-06, 'epoch': 1.02}


 34%|███▍      | 3850/11268 [3:03:41<5:52:09,  2.85s/it]

{'loss': 0.3763, 'grad_norm': 73.85496520996094, 'learning_rate': 3.291622293219738e-06, 'epoch': 1.03}


 34%|███▍      | 3860/11268 [3:04:09<5:45:32,  2.80s/it]

{'loss': 0.6791, 'grad_norm': 46.47140884399414, 'learning_rate': 3.287184948526802e-06, 'epoch': 1.03}


 34%|███▍      | 3870/11268 [3:04:38<5:48:13,  2.82s/it]

{'loss': 0.5107, 'grad_norm': 35.71063995361328, 'learning_rate': 3.282747603833866e-06, 'epoch': 1.03}


 34%|███▍      | 3880/11268 [3:05:09<6:15:11,  3.05s/it]

{'loss': 0.3675, 'grad_norm': 20.168060302734375, 'learning_rate': 3.2783102591409307e-06, 'epoch': 1.03}


 35%|███▍      | 3890/11268 [3:05:37<5:46:41,  2.82s/it]

{'loss': 0.3953, 'grad_norm': 27.247339248657227, 'learning_rate': 3.2738729144479948e-06, 'epoch': 1.04}


 35%|███▍      | 3900/11268 [3:06:07<6:03:20,  2.96s/it]

{'loss': 0.4062, 'grad_norm': 94.36700439453125, 'learning_rate': 3.269435569755059e-06, 'epoch': 1.04}


 35%|███▍      | 3910/11268 [3:06:36<6:03:38,  2.97s/it]

{'loss': 0.5071, 'grad_norm': 35.7130012512207, 'learning_rate': 3.264998225062123e-06, 'epoch': 1.04}


 35%|███▍      | 3920/11268 [3:07:05<5:46:01,  2.83s/it]

{'loss': 0.3961, 'grad_norm': 35.89786911010742, 'learning_rate': 3.2605608803691875e-06, 'epoch': 1.04}


 35%|███▍      | 3930/11268 [3:07:36<6:11:20,  3.04s/it]

{'loss': 0.4472, 'grad_norm': 38.03929901123047, 'learning_rate': 3.2561235356762516e-06, 'epoch': 1.05}


 35%|███▍      | 3940/11268 [3:08:04<5:41:24,  2.80s/it]

{'loss': 0.3322, 'grad_norm': 30.750192642211914, 'learning_rate': 3.2516861909833157e-06, 'epoch': 1.05}


 35%|███▌      | 3950/11268 [3:08:34<5:54:01,  2.90s/it]

{'loss': 0.4264, 'grad_norm': 19.738998413085938, 'learning_rate': 3.2472488462903802e-06, 'epoch': 1.05}


 35%|███▌      | 3960/11268 [3:09:03<5:48:18,  2.86s/it]

{'loss': 0.471, 'grad_norm': 55.47446823120117, 'learning_rate': 3.2428115015974443e-06, 'epoch': 1.05}


 35%|███▌      | 3970/11268 [3:09:32<5:56:42,  2.93s/it]

{'loss': 0.4537, 'grad_norm': 49.76363754272461, 'learning_rate': 3.2383741569045084e-06, 'epoch': 1.06}


 35%|███▌      | 3980/11268 [3:10:03<6:34:06,  3.24s/it]

{'loss': 0.3749, 'grad_norm': 80.31526947021484, 'learning_rate': 3.233936812211573e-06, 'epoch': 1.06}


 35%|███▌      | 3990/11268 [3:10:34<6:25:39,  3.18s/it]

{'loss': 0.2977, 'grad_norm': 53.54792022705078, 'learning_rate': 3.229499467518637e-06, 'epoch': 1.06}


 35%|███▌      | 4000/11268 [3:11:08<6:35:53,  3.27s/it]

{'loss': 0.4078, 'grad_norm': 82.70904541015625, 'learning_rate': 3.225062122825701e-06, 'epoch': 1.06}


 36%|███▌      | 4010/11268 [3:11:40<6:36:50,  3.28s/it]

{'loss': 0.4449, 'grad_norm': 29.12423324584961, 'learning_rate': 3.2206247781327652e-06, 'epoch': 1.07}


 36%|███▌      | 4020/11268 [3:12:10<5:58:26,  2.97s/it]

{'loss': 0.5086, 'grad_norm': 58.45504379272461, 'learning_rate': 3.2161874334398298e-06, 'epoch': 1.07}


 36%|███▌      | 4030/11268 [3:12:40<5:40:12,  2.82s/it]

{'loss': 0.245, 'grad_norm': 21.932579040527344, 'learning_rate': 3.211750088746894e-06, 'epoch': 1.07}


 36%|███▌      | 4040/11268 [3:13:11<5:51:56,  2.92s/it]

{'loss': 0.3531, 'grad_norm': 12.67576789855957, 'learning_rate': 3.207312744053959e-06, 'epoch': 1.08}


 36%|███▌      | 4050/11268 [3:13:42<6:11:45,  3.09s/it]

{'loss': 0.3535, 'grad_norm': 3.022346258163452, 'learning_rate': 3.202875399361023e-06, 'epoch': 1.08}


 36%|███▌      | 4060/11268 [3:14:14<6:18:50,  3.15s/it]

{'loss': 0.4069, 'grad_norm': 50.03273391723633, 'learning_rate': 3.198438054668087e-06, 'epoch': 1.08}


 36%|███▌      | 4070/11268 [3:14:46<6:12:04,  3.10s/it]

{'loss': 0.4915, 'grad_norm': 23.05666160583496, 'learning_rate': 3.1940007099751515e-06, 'epoch': 1.08}


 36%|███▌      | 4080/11268 [3:15:17<6:13:33,  3.12s/it]

{'loss': 0.3975, 'grad_norm': 49.80161666870117, 'learning_rate': 3.1895633652822156e-06, 'epoch': 1.09}


 36%|███▋      | 4090/11268 [3:15:48<6:07:42,  3.07s/it]

{'loss': 0.3995, 'grad_norm': 45.57204055786133, 'learning_rate': 3.1851260205892797e-06, 'epoch': 1.09}


 36%|███▋      | 4100/11268 [3:16:21<6:24:17,  3.22s/it]

{'loss': 0.4173, 'grad_norm': 29.443382263183594, 'learning_rate': 3.180688675896344e-06, 'epoch': 1.09}


 36%|███▋      | 4110/11268 [3:16:50<5:41:05,  2.86s/it]

{'loss': 0.3886, 'grad_norm': 20.2729434967041, 'learning_rate': 3.1762513312034083e-06, 'epoch': 1.09}


 37%|███▋      | 4120/11268 [3:17:20<5:50:09,  2.94s/it]

{'loss': 0.3734, 'grad_norm': 60.771549224853516, 'learning_rate': 3.1718139865104724e-06, 'epoch': 1.1}


 37%|███▋      | 4130/11268 [3:17:49<5:38:59,  2.85s/it]

{'loss': 0.4133, 'grad_norm': 44.5765495300293, 'learning_rate': 3.1673766418175365e-06, 'epoch': 1.1}


 37%|███▋      | 4140/11268 [3:18:19<6:22:24,  3.22s/it]

{'loss': 0.5197, 'grad_norm': 83.75230407714844, 'learning_rate': 3.162939297124601e-06, 'epoch': 1.1}


 37%|███▋      | 4150/11268 [3:18:54<6:30:02,  3.29s/it]

{'loss': 0.2907, 'grad_norm': 32.55630874633789, 'learning_rate': 3.158501952431665e-06, 'epoch': 1.1}


 37%|███▋      | 4160/11268 [3:19:22<5:23:53,  2.73s/it]

{'loss': 0.4218, 'grad_norm': 7.132206916809082, 'learning_rate': 3.1540646077387292e-06, 'epoch': 1.11}


 37%|███▋      | 4170/11268 [3:19:51<5:49:21,  2.95s/it]

{'loss': 0.3169, 'grad_norm': 85.82096862792969, 'learning_rate': 3.1496272630457938e-06, 'epoch': 1.11}


 37%|███▋      | 4180/11268 [3:20:19<5:31:24,  2.81s/it]

{'loss': 0.3032, 'grad_norm': 44.675106048583984, 'learning_rate': 3.145189918352858e-06, 'epoch': 1.11}


 37%|███▋      | 4190/11268 [3:20:46<5:07:20,  2.61s/it]

{'loss': 0.421, 'grad_norm': 16.726261138916016, 'learning_rate': 3.140752573659922e-06, 'epoch': 1.12}


 37%|███▋      | 4200/11268 [3:21:11<4:54:03,  2.50s/it]

{'loss': 0.3087, 'grad_norm': 25.914119720458984, 'learning_rate': 3.136315228966986e-06, 'epoch': 1.12}


 37%|███▋      | 4210/11268 [3:21:39<5:58:42,  3.05s/it]

{'loss': 0.389, 'grad_norm': 33.73661804199219, 'learning_rate': 3.1318778842740506e-06, 'epoch': 1.12}


 37%|███▋      | 4220/11268 [3:22:06<5:07:56,  2.62s/it]

{'loss': 0.3949, 'grad_norm': 70.91206359863281, 'learning_rate': 3.1274405395811147e-06, 'epoch': 1.12}


 38%|███▊      | 4230/11268 [3:22:31<4:52:12,  2.49s/it]

{'loss': 0.384, 'grad_norm': 24.275022506713867, 'learning_rate': 3.1230031948881788e-06, 'epoch': 1.13}


 38%|███▊      | 4240/11268 [3:22:56<4:55:46,  2.53s/it]

{'loss': 0.3381, 'grad_norm': 18.02881622314453, 'learning_rate': 3.1185658501952433e-06, 'epoch': 1.13}


 38%|███▊      | 4250/11268 [3:23:20<4:43:23,  2.42s/it]

{'loss': 0.3784, 'grad_norm': 17.50286865234375, 'learning_rate': 3.114128505502308e-06, 'epoch': 1.13}


 38%|███▊      | 4260/11268 [3:23:44<4:39:51,  2.40s/it]

{'loss': 0.5182, 'grad_norm': 58.238224029541016, 'learning_rate': 3.1096911608093723e-06, 'epoch': 1.13}


 38%|███▊      | 4270/11268 [3:24:09<4:46:06,  2.45s/it]

{'loss': 0.3208, 'grad_norm': 36.494564056396484, 'learning_rate': 3.1052538161164364e-06, 'epoch': 1.14}


 38%|███▊      | 4280/11268 [3:24:33<4:45:53,  2.45s/it]

{'loss': 0.386, 'grad_norm': 5.250776767730713, 'learning_rate': 3.1008164714235005e-06, 'epoch': 1.14}


 38%|███▊      | 4290/11268 [3:24:58<4:48:21,  2.48s/it]

{'loss': 0.5409, 'grad_norm': 12.714916229248047, 'learning_rate': 3.0963791267305646e-06, 'epoch': 1.14}


 38%|███▊      | 4300/11268 [3:25:23<4:43:53,  2.44s/it]

{'loss': 0.4072, 'grad_norm': 19.4248046875, 'learning_rate': 3.091941782037629e-06, 'epoch': 1.14}


 38%|███▊      | 4310/11268 [3:25:48<4:53:40,  2.53s/it]

{'loss': 0.3945, 'grad_norm': 63.79132843017578, 'learning_rate': 3.0875044373446933e-06, 'epoch': 1.15}


 38%|███▊      | 4320/11268 [3:26:13<4:47:32,  2.48s/it]

{'loss': 0.4618, 'grad_norm': 45.68960189819336, 'learning_rate': 3.0830670926517574e-06, 'epoch': 1.15}


 38%|███▊      | 4330/11268 [3:26:38<4:56:42,  2.57s/it]

{'loss': 0.3013, 'grad_norm': 32.63360595703125, 'learning_rate': 3.078629747958822e-06, 'epoch': 1.15}


 39%|███▊      | 4340/11268 [3:27:03<4:47:45,  2.49s/it]

{'loss': 0.3819, 'grad_norm': 60.17452621459961, 'learning_rate': 3.074192403265886e-06, 'epoch': 1.16}


 39%|███▊      | 4350/11268 [3:27:28<4:47:34,  2.49s/it]

{'loss': 0.3872, 'grad_norm': 45.550968170166016, 'learning_rate': 3.06975505857295e-06, 'epoch': 1.16}


 39%|███▊      | 4360/11268 [3:27:53<4:46:41,  2.49s/it]

{'loss': 0.4543, 'grad_norm': 43.52312088012695, 'learning_rate': 3.0653177138800146e-06, 'epoch': 1.16}


 39%|███▉      | 4370/11268 [3:28:20<5:08:38,  2.68s/it]

{'loss': 0.4621, 'grad_norm': 32.9071159362793, 'learning_rate': 3.0608803691870787e-06, 'epoch': 1.16}


 39%|███▉      | 4380/11268 [3:28:47<4:56:38,  2.58s/it]

{'loss': 0.2536, 'grad_norm': 18.797021865844727, 'learning_rate': 3.056443024494143e-06, 'epoch': 1.17}


 39%|███▉      | 4390/11268 [3:29:13<4:57:44,  2.60s/it]

{'loss': 0.4395, 'grad_norm': 29.822704315185547, 'learning_rate': 3.052005679801207e-06, 'epoch': 1.17}


 39%|███▉      | 4400/11268 [3:29:39<5:05:24,  2.67s/it]

{'loss': 0.3256, 'grad_norm': 42.53751754760742, 'learning_rate': 3.0475683351082714e-06, 'epoch': 1.17}


 39%|███▉      | 4410/11268 [3:30:05<4:56:16,  2.59s/it]

{'loss': 0.2939, 'grad_norm': 19.648696899414062, 'learning_rate': 3.0431309904153355e-06, 'epoch': 1.17}


 39%|███▉      | 4420/11268 [3:30:32<4:58:11,  2.61s/it]

{'loss': 0.3974, 'grad_norm': 35.35326385498047, 'learning_rate': 3.0386936457223996e-06, 'epoch': 1.18}


 39%|███▉      | 4430/11268 [3:30:58<5:06:34,  2.69s/it]

{'loss': 0.3006, 'grad_norm': 14.91307258605957, 'learning_rate': 3.034256301029464e-06, 'epoch': 1.18}


 39%|███▉      | 4440/11268 [3:31:26<5:22:03,  2.83s/it]

{'loss': 0.3188, 'grad_norm': 17.40831756591797, 'learning_rate': 3.0298189563365282e-06, 'epoch': 1.18}


 39%|███▉      | 4450/11268 [3:31:53<5:07:20,  2.70s/it]

{'loss': 0.4533, 'grad_norm': 37.886653900146484, 'learning_rate': 3.0253816116435923e-06, 'epoch': 1.18}


 40%|███▉      | 4460/11268 [3:32:19<4:57:31,  2.62s/it]

{'loss': 0.3325, 'grad_norm': 49.2622184753418, 'learning_rate': 3.0209442669506573e-06, 'epoch': 1.19}


 40%|███▉      | 4470/11268 [3:32:44<4:46:24,  2.53s/it]

{'loss': 0.3037, 'grad_norm': 34.96827697753906, 'learning_rate': 3.0165069222577214e-06, 'epoch': 1.19}


 40%|███▉      | 4480/11268 [3:33:10<4:46:51,  2.54s/it]

{'loss': 0.3211, 'grad_norm': 21.063220977783203, 'learning_rate': 3.0120695775647855e-06, 'epoch': 1.19}


 40%|███▉      | 4490/11268 [3:33:35<4:46:08,  2.53s/it]

{'loss': 0.5351, 'grad_norm': 59.07707214355469, 'learning_rate': 3.00763223287185e-06, 'epoch': 1.2}


 40%|███▉      | 4500/11268 [3:34:03<5:15:32,  2.80s/it]

{'loss': 0.2543, 'grad_norm': 108.94097900390625, 'learning_rate': 3.003194888178914e-06, 'epoch': 1.2}


 40%|████      | 4510/11268 [3:34:29<4:51:44,  2.59s/it]

{'loss': 0.5126, 'grad_norm': 72.97836303710938, 'learning_rate': 2.998757543485978e-06, 'epoch': 1.2}


 40%|████      | 4520/11268 [3:34:55<4:50:31,  2.58s/it]

{'loss': 0.2882, 'grad_norm': 38.91378402709961, 'learning_rate': 2.9943201987930427e-06, 'epoch': 1.2}


 40%|████      | 4530/11268 [3:35:21<4:46:22,  2.55s/it]

{'loss': 0.5089, 'grad_norm': 76.03022003173828, 'learning_rate': 2.989882854100107e-06, 'epoch': 1.21}


 40%|████      | 4540/11268 [3:35:48<5:19:26,  2.85s/it]

{'loss': 0.3408, 'grad_norm': 8.401103973388672, 'learning_rate': 2.985445509407171e-06, 'epoch': 1.21}


 40%|████      | 4550/11268 [3:36:15<4:58:53,  2.67s/it]

{'loss': 0.3178, 'grad_norm': 11.905583381652832, 'learning_rate': 2.9810081647142354e-06, 'epoch': 1.21}


 40%|████      | 4560/11268 [3:36:42<5:26:17,  2.92s/it]

{'loss': 0.5802, 'grad_norm': 5.930868625640869, 'learning_rate': 2.9765708200212995e-06, 'epoch': 1.21}


 41%|████      | 4570/11268 [3:37:09<5:03:07,  2.72s/it]

{'loss': 0.4175, 'grad_norm': 51.42832565307617, 'learning_rate': 2.9721334753283636e-06, 'epoch': 1.22}


 41%|████      | 4580/11268 [3:37:35<4:37:09,  2.49s/it]

{'loss': 0.4067, 'grad_norm': 71.15629577636719, 'learning_rate': 2.967696130635428e-06, 'epoch': 1.22}


 41%|████      | 4590/11268 [3:37:58<4:06:38,  2.22s/it]

{'loss': 0.4262, 'grad_norm': 31.330825805664062, 'learning_rate': 2.9632587859424922e-06, 'epoch': 1.22}


 41%|████      | 4600/11268 [3:38:19<3:57:02,  2.13s/it]

{'loss': 0.3853, 'grad_norm': 48.36967086791992, 'learning_rate': 2.9588214412495563e-06, 'epoch': 1.22}


 41%|████      | 4610/11268 [3:38:40<3:49:07,  2.06s/it]

{'loss': 0.3924, 'grad_norm': 23.711320877075195, 'learning_rate': 2.9543840965566204e-06, 'epoch': 1.23}


 41%|████      | 4620/11268 [3:39:01<3:55:54,  2.13s/it]

{'loss': 0.363, 'grad_norm': 45.54877471923828, 'learning_rate': 2.949946751863685e-06, 'epoch': 1.23}


 41%|████      | 4630/11268 [3:39:23<4:01:28,  2.18s/it]

{'loss': 0.2775, 'grad_norm': 34.75027084350586, 'learning_rate': 2.945509407170749e-06, 'epoch': 1.23}


 41%|████      | 4640/11268 [3:39:44<3:59:36,  2.17s/it]

{'loss': 0.4013, 'grad_norm': 92.7309341430664, 'learning_rate': 2.941072062477813e-06, 'epoch': 1.24}


 41%|████▏     | 4650/11268 [3:40:06<3:56:35,  2.15s/it]

{'loss': 0.3302, 'grad_norm': 6.587616920471191, 'learning_rate': 2.9366347177848777e-06, 'epoch': 1.24}


 41%|████▏     | 4660/11268 [3:40:27<3:56:47,  2.15s/it]

{'loss': 0.5266, 'grad_norm': 55.009342193603516, 'learning_rate': 2.9321973730919418e-06, 'epoch': 1.24}


 41%|████▏     | 4670/11268 [3:40:49<4:02:10,  2.20s/it]

{'loss': 0.2891, 'grad_norm': 19.210548400878906, 'learning_rate': 2.9277600283990067e-06, 'epoch': 1.24}


 42%|████▏     | 4680/11268 [3:41:10<3:55:57,  2.15s/it]

{'loss': 0.3926, 'grad_norm': 71.92720794677734, 'learning_rate': 2.923322683706071e-06, 'epoch': 1.25}


 42%|████▏     | 4690/11268 [3:41:32<3:56:44,  2.16s/it]

{'loss': 0.3793, 'grad_norm': 29.53013038635254, 'learning_rate': 2.918885339013135e-06, 'epoch': 1.25}


 42%|████▏     | 4700/11268 [3:41:53<3:55:42,  2.15s/it]

{'loss': 0.4606, 'grad_norm': 66.53036499023438, 'learning_rate': 2.914447994320199e-06, 'epoch': 1.25}


 42%|████▏     | 4710/11268 [3:42:15<3:56:30,  2.16s/it]

{'loss': 0.4766, 'grad_norm': 62.39105987548828, 'learning_rate': 2.9100106496272635e-06, 'epoch': 1.25}


 42%|████▏     | 4720/11268 [3:42:37<3:54:12,  2.15s/it]

{'loss': 0.4049, 'grad_norm': 1.6803525686264038, 'learning_rate': 2.9055733049343276e-06, 'epoch': 1.26}


 42%|████▏     | 4730/11268 [3:42:58<3:56:00,  2.17s/it]

{'loss': 0.5353, 'grad_norm': 24.308237075805664, 'learning_rate': 2.9011359602413917e-06, 'epoch': 1.26}


 42%|████▏     | 4740/11268 [3:43:20<3:55:26,  2.16s/it]

{'loss': 0.4769, 'grad_norm': 61.788597106933594, 'learning_rate': 2.8966986155484563e-06, 'epoch': 1.26}


 42%|████▏     | 4750/11268 [3:43:42<3:54:33,  2.16s/it]

{'loss': 0.4012, 'grad_norm': 60.700103759765625, 'learning_rate': 2.8922612708555204e-06, 'epoch': 1.26}


 42%|████▏     | 4760/11268 [3:44:03<3:55:26,  2.17s/it]

{'loss': 0.2974, 'grad_norm': 50.121177673339844, 'learning_rate': 2.8878239261625845e-06, 'epoch': 1.27}


 42%|████▏     | 4770/11268 [3:44:25<3:54:28,  2.17s/it]

{'loss': 0.5166, 'grad_norm': 56.65994644165039, 'learning_rate': 2.883386581469649e-06, 'epoch': 1.27}


 42%|████▏     | 4780/11268 [3:44:47<3:55:14,  2.18s/it]

{'loss': 0.3719, 'grad_norm': 47.86198425292969, 'learning_rate': 2.878949236776713e-06, 'epoch': 1.27}


 43%|████▎     | 4790/11268 [3:45:09<3:55:58,  2.19s/it]

{'loss': 0.2031, 'grad_norm': 17.415645599365234, 'learning_rate': 2.874511892083777e-06, 'epoch': 1.28}


 43%|████▎     | 4800/11268 [3:45:31<3:58:29,  2.21s/it]

{'loss': 0.3492, 'grad_norm': 24.741268157958984, 'learning_rate': 2.8700745473908413e-06, 'epoch': 1.28}


 43%|████▎     | 4810/11268 [3:45:53<3:56:31,  2.20s/it]

{'loss': 0.3182, 'grad_norm': 41.743743896484375, 'learning_rate': 2.865637202697906e-06, 'epoch': 1.28}


 43%|████▎     | 4820/11268 [3:46:15<3:56:00,  2.20s/it]

{'loss': 0.3966, 'grad_norm': 39.02400207519531, 'learning_rate': 2.86119985800497e-06, 'epoch': 1.28}


 43%|████▎     | 4830/11268 [3:46:37<3:56:10,  2.20s/it]

{'loss': 0.6672, 'grad_norm': 26.04469871520996, 'learning_rate': 2.856762513312034e-06, 'epoch': 1.29}


 43%|████▎     | 4840/11268 [3:46:59<3:53:28,  2.18s/it]

{'loss': 0.4201, 'grad_norm': 88.40325927734375, 'learning_rate': 2.8523251686190985e-06, 'epoch': 1.29}


 43%|████▎     | 4850/11268 [3:47:21<3:59:34,  2.24s/it]

{'loss': 0.5281, 'grad_norm': 21.478443145751953, 'learning_rate': 2.8478878239261626e-06, 'epoch': 1.29}


 43%|████▎     | 4860/11268 [3:47:43<3:52:05,  2.17s/it]

{'loss': 0.4107, 'grad_norm': 66.9277114868164, 'learning_rate': 2.8434504792332267e-06, 'epoch': 1.29}


 43%|████▎     | 4870/11268 [3:48:05<3:54:21,  2.20s/it]

{'loss': 0.3604, 'grad_norm': 15.34659481048584, 'learning_rate': 2.8390131345402917e-06, 'epoch': 1.3}


 43%|████▎     | 4880/11268 [3:48:27<3:57:05,  2.23s/it]

{'loss': 0.2633, 'grad_norm': 37.452537536621094, 'learning_rate': 2.8345757898473558e-06, 'epoch': 1.3}


 43%|████▎     | 4890/11268 [3:48:49<3:53:25,  2.20s/it]

{'loss': 0.4409, 'grad_norm': 33.15935134887695, 'learning_rate': 2.83013844515442e-06, 'epoch': 1.3}


 43%|████▎     | 4900/11268 [3:49:11<3:53:08,  2.20s/it]

{'loss': 0.4413, 'grad_norm': 49.37697982788086, 'learning_rate': 2.8257011004614844e-06, 'epoch': 1.3}


 44%|████▎     | 4910/11268 [3:49:34<3:59:51,  2.26s/it]

{'loss': 0.304, 'grad_norm': 3.118769884109497, 'learning_rate': 2.8212637557685485e-06, 'epoch': 1.31}


 44%|████▎     | 4920/11268 [3:49:56<4:02:09,  2.29s/it]

{'loss': 0.4082, 'grad_norm': 20.165117263793945, 'learning_rate': 2.8168264110756126e-06, 'epoch': 1.31}


 44%|████▍     | 4930/11268 [3:50:19<3:58:05,  2.25s/it]

{'loss': 0.2349, 'grad_norm': 52.98624038696289, 'learning_rate': 2.812389066382677e-06, 'epoch': 1.31}


 44%|████▍     | 4940/11268 [3:50:41<3:55:29,  2.23s/it]

{'loss': 0.3856, 'grad_norm': 20.26984977722168, 'learning_rate': 2.807951721689741e-06, 'epoch': 1.32}


 44%|████▍     | 4950/11268 [3:51:04<3:56:03,  2.24s/it]

{'loss': 0.4064, 'grad_norm': 17.35883903503418, 'learning_rate': 2.8035143769968053e-06, 'epoch': 1.32}


 44%|████▍     | 4960/11268 [3:51:27<4:04:35,  2.33s/it]

{'loss': 0.4243, 'grad_norm': 55.4739990234375, 'learning_rate': 2.79907703230387e-06, 'epoch': 1.32}


 44%|████▍     | 4970/11268 [3:51:51<4:12:36,  2.41s/it]

{'loss': 0.4577, 'grad_norm': 57.40946578979492, 'learning_rate': 2.794639687610934e-06, 'epoch': 1.32}


 44%|████▍     | 4980/11268 [3:52:15<4:16:14,  2.45s/it]

{'loss': 0.3182, 'grad_norm': 53.673954010009766, 'learning_rate': 2.790202342917998e-06, 'epoch': 1.33}


 44%|████▍     | 4990/11268 [3:52:40<4:09:47,  2.39s/it]

{'loss': 0.2015, 'grad_norm': 37.89504623413086, 'learning_rate': 2.785764998225062e-06, 'epoch': 1.33}


 44%|████▍     | 5000/11268 [3:53:03<3:55:03,  2.25s/it]

{'loss': 0.3429, 'grad_norm': 58.81607437133789, 'learning_rate': 2.7813276535321266e-06, 'epoch': 1.33}


 44%|████▍     | 5010/11268 [3:53:25<3:52:37,  2.23s/it]

{'loss': 0.4401, 'grad_norm': 61.05684280395508, 'learning_rate': 2.7768903088391907e-06, 'epoch': 1.33}


 45%|████▍     | 5020/11268 [3:53:48<4:03:31,  2.34s/it]

{'loss': 0.3691, 'grad_norm': 38.03738784790039, 'learning_rate': 2.772452964146255e-06, 'epoch': 1.34}


 45%|████▍     | 5030/11268 [3:54:11<3:56:23,  2.27s/it]

{'loss': 0.4365, 'grad_norm': 23.519515991210938, 'learning_rate': 2.7680156194533194e-06, 'epoch': 1.34}


 45%|████▍     | 5040/11268 [3:54:33<3:55:56,  2.27s/it]

{'loss': 0.3614, 'grad_norm': 5.398426532745361, 'learning_rate': 2.7635782747603834e-06, 'epoch': 1.34}


 45%|████▍     | 5050/11268 [3:54:56<3:54:05,  2.26s/it]

{'loss': 0.3434, 'grad_norm': 94.55555725097656, 'learning_rate': 2.7591409300674475e-06, 'epoch': 1.34}


 45%|████▍     | 5060/11268 [3:55:20<4:03:10,  2.35s/it]

{'loss': 0.3295, 'grad_norm': 13.663138389587402, 'learning_rate': 2.754703585374512e-06, 'epoch': 1.35}


 45%|████▍     | 5070/11268 [3:55:43<4:00:48,  2.33s/it]

{'loss': 0.3202, 'grad_norm': 18.806346893310547, 'learning_rate': 2.750266240681576e-06, 'epoch': 1.35}


 45%|████▌     | 5080/11268 [3:56:08<4:02:44,  2.35s/it]

{'loss': 0.4979, 'grad_norm': 50.802101135253906, 'learning_rate': 2.7458288959886407e-06, 'epoch': 1.35}


 45%|████▌     | 5090/11268 [3:56:31<4:00:16,  2.33s/it]

{'loss': 0.4015, 'grad_norm': 42.24506378173828, 'learning_rate': 2.741391551295705e-06, 'epoch': 1.36}


 45%|████▌     | 5100/11268 [3:56:54<3:54:30,  2.28s/it]

{'loss': 0.2794, 'grad_norm': 9.65519905090332, 'learning_rate': 2.7369542066027693e-06, 'epoch': 1.36}


 45%|████▌     | 5110/11268 [3:57:17<4:05:55,  2.40s/it]

{'loss': 0.3299, 'grad_norm': 23.062803268432617, 'learning_rate': 2.7325168619098334e-06, 'epoch': 1.36}


 45%|████▌     | 5120/11268 [3:57:40<3:49:51,  2.24s/it]

{'loss': 0.3362, 'grad_norm': 42.74171447753906, 'learning_rate': 2.728079517216898e-06, 'epoch': 1.36}


 46%|████▌     | 5130/11268 [3:58:02<3:48:23,  2.23s/it]

{'loss': 0.416, 'grad_norm': 1.1335097551345825, 'learning_rate': 2.723642172523962e-06, 'epoch': 1.37}


 46%|████▌     | 5140/11268 [3:58:25<3:50:46,  2.26s/it]

{'loss': 0.4816, 'grad_norm': 44.670101165771484, 'learning_rate': 2.719204827831026e-06, 'epoch': 1.37}


 46%|████▌     | 5150/11268 [3:58:48<3:51:37,  2.27s/it]

{'loss': 0.3793, 'grad_norm': 18.282821655273438, 'learning_rate': 2.7147674831380906e-06, 'epoch': 1.37}


 46%|████▌     | 5160/11268 [3:59:11<3:54:37,  2.30s/it]

{'loss': 0.4056, 'grad_norm': 38.96583557128906, 'learning_rate': 2.7103301384451547e-06, 'epoch': 1.37}


 46%|████▌     | 5170/11268 [3:59:34<4:01:55,  2.38s/it]

{'loss': 0.4056, 'grad_norm': 73.3528823852539, 'learning_rate': 2.705892793752219e-06, 'epoch': 1.38}


 46%|████▌     | 5180/11268 [3:59:58<4:01:08,  2.38s/it]

{'loss': 0.3328, 'grad_norm': 12.709755897521973, 'learning_rate': 2.701455449059283e-06, 'epoch': 1.38}


 46%|████▌     | 5190/11268 [4:00:22<3:57:34,  2.35s/it]

{'loss': 0.45, 'grad_norm': 42.40044021606445, 'learning_rate': 2.6970181043663475e-06, 'epoch': 1.38}


 46%|████▌     | 5200/11268 [4:00:45<3:52:38,  2.30s/it]

{'loss': 0.3236, 'grad_norm': 30.970508575439453, 'learning_rate': 2.6925807596734116e-06, 'epoch': 1.38}


 46%|████▌     | 5210/11268 [4:01:08<3:49:10,  2.27s/it]

{'loss': 0.3149, 'grad_norm': 46.15846633911133, 'learning_rate': 2.6881434149804757e-06, 'epoch': 1.39}


 46%|████▋     | 5220/11268 [4:01:31<3:57:47,  2.36s/it]

{'loss': 0.4892, 'grad_norm': 26.226940155029297, 'learning_rate': 2.68370607028754e-06, 'epoch': 1.39}


 46%|████▋     | 5230/11268 [4:01:55<3:54:33,  2.33s/it]

{'loss': 0.2178, 'grad_norm': 19.278440475463867, 'learning_rate': 2.6792687255946043e-06, 'epoch': 1.39}


 47%|████▋     | 5240/11268 [4:02:18<3:52:21,  2.31s/it]

{'loss': 0.3859, 'grad_norm': 44.71614456176758, 'learning_rate': 2.6748313809016684e-06, 'epoch': 1.4}


 47%|████▋     | 5250/11268 [4:02:42<3:53:05,  2.32s/it]

{'loss': 0.3681, 'grad_norm': 24.44068717956543, 'learning_rate': 2.670394036208733e-06, 'epoch': 1.4}


 47%|████▋     | 5260/11268 [4:03:05<3:52:43,  2.32s/it]

{'loss': 0.3337, 'grad_norm': 53.29160690307617, 'learning_rate': 2.665956691515797e-06, 'epoch': 1.4}


 47%|████▋     | 5270/11268 [4:03:28<3:50:44,  2.31s/it]

{'loss': 0.3785, 'grad_norm': 48.54678726196289, 'learning_rate': 2.661519346822861e-06, 'epoch': 1.4}


 47%|████▋     | 5280/11268 [4:03:52<3:52:23,  2.33s/it]

{'loss': 0.2488, 'grad_norm': 26.18408966064453, 'learning_rate': 2.657082002129925e-06, 'epoch': 1.41}


 47%|████▋     | 5290/11268 [4:04:16<4:00:24,  2.41s/it]

{'loss': 0.2565, 'grad_norm': 64.42898559570312, 'learning_rate': 2.65264465743699e-06, 'epoch': 1.41}


 47%|████▋     | 5300/11268 [4:04:40<3:59:15,  2.41s/it]

{'loss': 0.495, 'grad_norm': 74.4629898071289, 'learning_rate': 2.6482073127440542e-06, 'epoch': 1.41}


 47%|████▋     | 5310/11268 [4:05:04<3:54:45,  2.36s/it]

{'loss': 0.5005, 'grad_norm': 62.5654411315918, 'learning_rate': 2.6437699680511188e-06, 'epoch': 1.41}


 47%|████▋     | 5320/11268 [4:05:29<4:11:44,  2.54s/it]

{'loss': 0.4535, 'grad_norm': 1.9694045782089233, 'learning_rate': 2.639332623358183e-06, 'epoch': 1.42}


 47%|████▋     | 5330/11268 [4:05:54<4:03:23,  2.46s/it]

{'loss': 0.2567, 'grad_norm': 55.817508697509766, 'learning_rate': 2.634895278665247e-06, 'epoch': 1.42}


 47%|████▋     | 5340/11268 [4:06:18<3:53:14,  2.36s/it]

{'loss': 0.2602, 'grad_norm': 28.193262100219727, 'learning_rate': 2.6304579339723115e-06, 'epoch': 1.42}


 47%|████▋     | 5350/11268 [4:06:42<3:56:17,  2.40s/it]

{'loss': 0.2867, 'grad_norm': 21.148727416992188, 'learning_rate': 2.6260205892793756e-06, 'epoch': 1.42}


 48%|████▊     | 5360/11268 [4:07:06<3:49:34,  2.33s/it]

{'loss': 0.2097, 'grad_norm': 1.262163519859314, 'learning_rate': 2.6215832445864397e-06, 'epoch': 1.43}


 48%|████▊     | 5370/11268 [4:07:30<3:53:16,  2.37s/it]

{'loss': 0.2318, 'grad_norm': 8.631014823913574, 'learning_rate': 2.6171458998935038e-06, 'epoch': 1.43}


 48%|████▊     | 5380/11268 [4:07:53<3:52:59,  2.37s/it]

{'loss': 0.3175, 'grad_norm': 72.81652069091797, 'learning_rate': 2.6127085552005683e-06, 'epoch': 1.43}


 48%|████▊     | 5390/11268 [4:08:18<4:01:37,  2.47s/it]

{'loss': 0.302, 'grad_norm': 38.914085388183594, 'learning_rate': 2.6082712105076324e-06, 'epoch': 1.44}


 48%|████▊     | 5400/11268 [4:08:42<3:53:08,  2.38s/it]

{'loss': 0.5973, 'grad_norm': 67.58287048339844, 'learning_rate': 2.6038338658146965e-06, 'epoch': 1.44}


 48%|████▊     | 5410/11268 [4:09:06<3:53:22,  2.39s/it]

{'loss': 0.4405, 'grad_norm': 148.8922576904297, 'learning_rate': 2.599396521121761e-06, 'epoch': 1.44}


 48%|████▊     | 5420/11268 [4:09:30<3:53:31,  2.40s/it]

{'loss': 0.3936, 'grad_norm': 34.74929428100586, 'learning_rate': 2.594959176428825e-06, 'epoch': 1.44}


 48%|████▊     | 5430/11268 [4:09:55<3:56:10,  2.43s/it]

{'loss': 0.2189, 'grad_norm': 14.037424087524414, 'learning_rate': 2.590521831735889e-06, 'epoch': 1.45}


 48%|████▊     | 5440/11268 [4:10:18<3:50:46,  2.38s/it]

{'loss': 0.3137, 'grad_norm': 5.392988681793213, 'learning_rate': 2.5860844870429537e-06, 'epoch': 1.45}


 48%|████▊     | 5450/11268 [4:10:42<3:50:54,  2.38s/it]

{'loss': 0.5976, 'grad_norm': 54.35594177246094, 'learning_rate': 2.581647142350018e-06, 'epoch': 1.45}


 48%|████▊     | 5460/11268 [4:11:06<3:49:22,  2.37s/it]

{'loss': 0.4212, 'grad_norm': 10.110573768615723, 'learning_rate': 2.577209797657082e-06, 'epoch': 1.45}


 49%|████▊     | 5470/11268 [4:11:30<4:03:33,  2.52s/it]

{'loss': 0.5271, 'grad_norm': 97.60591125488281, 'learning_rate': 2.572772452964146e-06, 'epoch': 1.46}


 49%|████▊     | 5480/11268 [4:11:54<3:48:01,  2.36s/it]

{'loss': 0.3833, 'grad_norm': 78.51748657226562, 'learning_rate': 2.5683351082712105e-06, 'epoch': 1.46}


 49%|████▊     | 5490/11268 [4:12:18<3:53:37,  2.43s/it]

{'loss': 0.4865, 'grad_norm': 18.49489974975586, 'learning_rate': 2.563897763578275e-06, 'epoch': 1.46}


 49%|████▉     | 5500/11268 [4:12:43<4:02:00,  2.52s/it]

{'loss': 0.1895, 'grad_norm': 65.34386444091797, 'learning_rate': 2.5594604188853396e-06, 'epoch': 1.46}


 49%|████▉     | 5510/11268 [4:13:07<3:47:59,  2.38s/it]

{'loss': 0.3546, 'grad_norm': 14.89499568939209, 'learning_rate': 2.5550230741924037e-06, 'epoch': 1.47}


 49%|████▉     | 5520/11268 [4:13:30<3:42:32,  2.32s/it]

{'loss': 0.2224, 'grad_norm': 67.33487701416016, 'learning_rate': 2.550585729499468e-06, 'epoch': 1.47}


 49%|████▉     | 5530/11268 [4:13:53<3:43:37,  2.34s/it]

{'loss': 0.333, 'grad_norm': 59.60548400878906, 'learning_rate': 2.5461483848065323e-06, 'epoch': 1.47}


 49%|████▉     | 5540/11268 [4:14:17<3:46:34,  2.37s/it]

{'loss': 0.5795, 'grad_norm': 60.33387756347656, 'learning_rate': 2.5417110401135964e-06, 'epoch': 1.47}


 49%|████▉     | 5550/11268 [4:14:42<3:46:49,  2.38s/it]

{'loss': 0.4306, 'grad_norm': 22.29596710205078, 'learning_rate': 2.5372736954206605e-06, 'epoch': 1.48}


 49%|████▉     | 5560/11268 [4:15:05<3:42:24,  2.34s/it]

{'loss': 0.4097, 'grad_norm': 29.321380615234375, 'learning_rate': 2.532836350727725e-06, 'epoch': 1.48}


 49%|████▉     | 5570/11268 [4:15:28<3:40:35,  2.32s/it]

{'loss': 0.2729, 'grad_norm': 0.48686346411705017, 'learning_rate': 2.528399006034789e-06, 'epoch': 1.48}


 50%|████▉     | 5580/11268 [4:15:51<3:39:36,  2.32s/it]

{'loss': 0.5437, 'grad_norm': 57.16348648071289, 'learning_rate': 2.5239616613418532e-06, 'epoch': 1.49}


 50%|████▉     | 5590/11268 [4:16:14<3:37:15,  2.30s/it]

{'loss': 0.3588, 'grad_norm': 67.26814270019531, 'learning_rate': 2.5195243166489173e-06, 'epoch': 1.49}


 50%|████▉     | 5600/11268 [4:16:38<3:46:03,  2.39s/it]

{'loss': 0.3797, 'grad_norm': 26.68613052368164, 'learning_rate': 2.515086971955982e-06, 'epoch': 1.49}


 50%|████▉     | 5610/11268 [4:17:01<3:36:49,  2.30s/it]

{'loss': 0.3654, 'grad_norm': 15.83231258392334, 'learning_rate': 2.510649627263046e-06, 'epoch': 1.49}


 50%|████▉     | 5620/11268 [4:17:24<3:37:21,  2.31s/it]

{'loss': 0.3257, 'grad_norm': 91.63920593261719, 'learning_rate': 2.50621228257011e-06, 'epoch': 1.5}


 50%|████▉     | 5630/11268 [4:17:48<3:39:06,  2.33s/it]

{'loss': 0.3135, 'grad_norm': 38.38882827758789, 'learning_rate': 2.5017749378771746e-06, 'epoch': 1.5}


 50%|█████     | 5640/11268 [4:18:11<3:37:37,  2.32s/it]

{'loss': 0.3344, 'grad_norm': 6.710525989532471, 'learning_rate': 2.4973375931842387e-06, 'epoch': 1.5}


 50%|█████     | 5650/11268 [4:18:34<3:37:11,  2.32s/it]

{'loss': 0.4441, 'grad_norm': 31.802005767822266, 'learning_rate': 2.492900248491303e-06, 'epoch': 1.5}


 50%|█████     | 5660/11268 [4:18:57<3:34:03,  2.29s/it]

{'loss': 0.7027, 'grad_norm': 62.77358627319336, 'learning_rate': 2.4884629037983673e-06, 'epoch': 1.51}


 50%|█████     | 5670/11268 [4:19:20<3:34:09,  2.30s/it]

{'loss': 0.4285, 'grad_norm': 3.811572551727295, 'learning_rate': 2.4840255591054314e-06, 'epoch': 1.51}


 50%|█████     | 5680/11268 [4:19:43<3:35:38,  2.32s/it]

{'loss': 0.4392, 'grad_norm': 47.65203857421875, 'learning_rate': 2.479588214412496e-06, 'epoch': 1.51}


 50%|█████     | 5690/11268 [4:20:06<3:35:48,  2.32s/it]

{'loss': 0.5708, 'grad_norm': 13.71064567565918, 'learning_rate': 2.47515086971956e-06, 'epoch': 1.51}


 51%|█████     | 5700/11268 [4:20:30<3:35:11,  2.32s/it]

{'loss': 0.7059, 'grad_norm': 116.10169982910156, 'learning_rate': 2.470713525026624e-06, 'epoch': 1.52}


 51%|█████     | 5710/11268 [4:20:53<3:36:15,  2.33s/it]

{'loss': 0.4132, 'grad_norm': 24.96187973022461, 'learning_rate': 2.4662761803336886e-06, 'epoch': 1.52}


 51%|█████     | 5720/11268 [4:21:16<3:31:34,  2.29s/it]

{'loss': 0.3916, 'grad_norm': 6.301574230194092, 'learning_rate': 2.4618388356407527e-06, 'epoch': 1.52}


 51%|█████     | 5730/11268 [4:21:40<3:36:13,  2.34s/it]

{'loss': 0.1712, 'grad_norm': 66.5113754272461, 'learning_rate': 2.457401490947817e-06, 'epoch': 1.53}


 51%|█████     | 5740/11268 [4:22:03<3:35:48,  2.34s/it]

{'loss': 0.4512, 'grad_norm': 1.1590064764022827, 'learning_rate': 2.4529641462548813e-06, 'epoch': 1.53}


 51%|█████     | 5750/11268 [4:22:28<3:51:27,  2.52s/it]

{'loss': 0.3576, 'grad_norm': 84.05433654785156, 'learning_rate': 2.448526801561946e-06, 'epoch': 1.53}


 51%|█████     | 5760/11268 [4:22:52<3:37:07,  2.37s/it]

{'loss': 0.4766, 'grad_norm': 12.33926773071289, 'learning_rate': 2.44408945686901e-06, 'epoch': 1.53}


 51%|█████     | 5770/11268 [4:23:15<3:33:07,  2.33s/it]

{'loss': 0.5107, 'grad_norm': 10.728660583496094, 'learning_rate': 2.439652112176074e-06, 'epoch': 1.54}


 51%|█████▏    | 5780/11268 [4:23:38<3:32:39,  2.33s/it]

{'loss': 0.4151, 'grad_norm': 79.01044464111328, 'learning_rate': 2.435214767483138e-06, 'epoch': 1.54}


 51%|█████▏    | 5790/11268 [4:24:02<3:35:48,  2.36s/it]

{'loss': 0.2835, 'grad_norm': 43.35115432739258, 'learning_rate': 2.4307774227902027e-06, 'epoch': 1.54}


 51%|█████▏    | 5800/11268 [4:24:25<3:34:42,  2.36s/it]

{'loss': 0.2557, 'grad_norm': 47.672882080078125, 'learning_rate': 2.4263400780972668e-06, 'epoch': 1.54}


 52%|█████▏    | 5810/11268 [4:24:49<3:31:16,  2.32s/it]

{'loss': 0.3627, 'grad_norm': 47.75846862792969, 'learning_rate': 2.421902733404331e-06, 'epoch': 1.55}


 52%|█████▏    | 5820/11268 [4:25:12<3:30:33,  2.32s/it]

{'loss': 0.6354, 'grad_norm': 71.76559448242188, 'learning_rate': 2.4174653887113954e-06, 'epoch': 1.55}


 52%|█████▏    | 5830/11268 [4:25:36<3:36:37,  2.39s/it]

{'loss': 0.4252, 'grad_norm': 45.219818115234375, 'learning_rate': 2.4130280440184595e-06, 'epoch': 1.55}


 52%|█████▏    | 5840/11268 [4:26:00<3:32:53,  2.35s/it]

{'loss': 0.4574, 'grad_norm': 60.581390380859375, 'learning_rate': 2.4085906993255236e-06, 'epoch': 1.55}


 52%|█████▏    | 5850/11268 [4:26:23<3:36:01,  2.39s/it]

{'loss': 0.3814, 'grad_norm': 50.423282623291016, 'learning_rate': 2.404153354632588e-06, 'epoch': 1.56}


 52%|█████▏    | 5860/11268 [4:26:47<3:31:46,  2.35s/it]

{'loss': 0.4046, 'grad_norm': 48.1116828918457, 'learning_rate': 2.3997160099396522e-06, 'epoch': 1.56}


 52%|█████▏    | 5870/11268 [4:27:10<3:30:45,  2.34s/it]

{'loss': 0.3961, 'grad_norm': 60.15108108520508, 'learning_rate': 2.3952786652467167e-06, 'epoch': 1.56}


 52%|█████▏    | 5880/11268 [4:27:34<3:30:39,  2.35s/it]

{'loss': 0.3043, 'grad_norm': 57.97117233276367, 'learning_rate': 2.390841320553781e-06, 'epoch': 1.57}


 52%|█████▏    | 5890/11268 [4:27:57<3:31:52,  2.36s/it]

{'loss': 0.4291, 'grad_norm': 67.44349670410156, 'learning_rate': 2.386403975860845e-06, 'epoch': 1.57}


 52%|█████▏    | 5900/11268 [4:28:21<3:28:58,  2.34s/it]

{'loss': 0.586, 'grad_norm': 16.61199951171875, 'learning_rate': 2.3819666311679095e-06, 'epoch': 1.57}


 52%|█████▏    | 5910/11268 [4:28:44<3:28:18,  2.33s/it]

{'loss': 0.3939, 'grad_norm': 72.90435791015625, 'learning_rate': 2.3775292864749736e-06, 'epoch': 1.57}


 53%|█████▎    | 5920/11268 [4:29:07<3:28:55,  2.34s/it]

{'loss': 0.4731, 'grad_norm': 13.542293548583984, 'learning_rate': 2.3730919417820377e-06, 'epoch': 1.58}


 53%|█████▎    | 5930/11268 [4:29:31<3:29:03,  2.35s/it]

{'loss': 0.3008, 'grad_norm': 79.02220153808594, 'learning_rate': 2.368654597089102e-06, 'epoch': 1.58}


 53%|█████▎    | 5940/11268 [4:29:56<3:56:46,  2.67s/it]

{'loss': 0.3233, 'grad_norm': 8.137725830078125, 'learning_rate': 2.3642172523961663e-06, 'epoch': 1.58}


 53%|█████▎    | 5950/11268 [4:30:21<3:33:23,  2.41s/it]

{'loss': 0.3562, 'grad_norm': 44.669395446777344, 'learning_rate': 2.3597799077032304e-06, 'epoch': 1.58}


 53%|█████▎    | 5960/11268 [4:30:45<3:34:48,  2.43s/it]

{'loss': 0.3489, 'grad_norm': 2.6869125366210938, 'learning_rate': 2.355342563010295e-06, 'epoch': 1.59}


 53%|█████▎    | 5970/11268 [4:31:09<3:30:34,  2.38s/it]

{'loss': 0.5328, 'grad_norm': 41.042076110839844, 'learning_rate': 2.350905218317359e-06, 'epoch': 1.59}


 53%|█████▎    | 5980/11268 [4:31:33<3:29:14,  2.37s/it]

{'loss': 0.4808, 'grad_norm': 1.4201816320419312, 'learning_rate': 2.3464678736244235e-06, 'epoch': 1.59}


 53%|█████▎    | 5990/11268 [4:31:56<3:27:00,  2.35s/it]

{'loss': 0.4753, 'grad_norm': 89.9078598022461, 'learning_rate': 2.3420305289314876e-06, 'epoch': 1.59}


 53%|█████▎    | 6000/11268 [4:32:20<3:33:30,  2.43s/it]

{'loss': 0.5337, 'grad_norm': 63.90070343017578, 'learning_rate': 2.3375931842385517e-06, 'epoch': 1.6}


 53%|█████▎    | 6010/11268 [4:32:44<3:29:58,  2.40s/it]

{'loss': 0.4904, 'grad_norm': 26.923553466796875, 'learning_rate': 2.3331558395456162e-06, 'epoch': 1.6}


 53%|█████▎    | 6020/11268 [4:33:09<3:41:49,  2.54s/it]

{'loss': 0.4771, 'grad_norm': 55.201473236083984, 'learning_rate': 2.3287184948526803e-06, 'epoch': 1.6}


 54%|█████▎    | 6030/11268 [4:33:34<3:34:32,  2.46s/it]

{'loss': 0.3157, 'grad_norm': 28.148975372314453, 'learning_rate': 2.3242811501597444e-06, 'epoch': 1.61}


 54%|█████▎    | 6040/11268 [4:33:58<3:33:02,  2.44s/it]

{'loss': 0.3538, 'grad_norm': 54.23725509643555, 'learning_rate': 2.319843805466809e-06, 'epoch': 1.61}


 54%|█████▎    | 6050/11268 [4:34:23<3:32:10,  2.44s/it]

{'loss': 0.4193, 'grad_norm': 35.689388275146484, 'learning_rate': 2.315406460773873e-06, 'epoch': 1.61}


 54%|█████▍    | 6060/11268 [4:34:48<3:33:33,  2.46s/it]

{'loss': 0.4713, 'grad_norm': 93.01923370361328, 'learning_rate': 2.3109691160809376e-06, 'epoch': 1.61}


 54%|█████▍    | 6070/11268 [4:35:12<3:31:29,  2.44s/it]

{'loss': 0.3949, 'grad_norm': 71.91993713378906, 'learning_rate': 2.3065317713880017e-06, 'epoch': 1.62}


 54%|█████▍    | 6080/11268 [4:35:36<3:26:32,  2.39s/it]

{'loss': 0.3017, 'grad_norm': 134.8788299560547, 'learning_rate': 2.3020944266950658e-06, 'epoch': 1.62}


 54%|█████▍    | 6090/11268 [4:36:00<3:25:09,  2.38s/it]

{'loss': 0.3954, 'grad_norm': 18.717525482177734, 'learning_rate': 2.2976570820021303e-06, 'epoch': 1.62}


 54%|█████▍    | 6100/11268 [4:36:24<3:31:57,  2.46s/it]

{'loss': 0.397, 'grad_norm': 58.103179931640625, 'learning_rate': 2.2932197373091944e-06, 'epoch': 1.62}


 54%|█████▍    | 6110/11268 [4:36:48<3:26:26,  2.40s/it]

{'loss': 0.29, 'grad_norm': 37.77821350097656, 'learning_rate': 2.2887823926162585e-06, 'epoch': 1.63}


 54%|█████▍    | 6120/11268 [4:37:13<3:29:46,  2.44s/it]

{'loss': 0.33, 'grad_norm': 46.78099822998047, 'learning_rate': 2.284345047923323e-06, 'epoch': 1.63}


 54%|█████▍    | 6130/11268 [4:37:36<3:22:24,  2.36s/it]

{'loss': 0.3925, 'grad_norm': 40.923824310302734, 'learning_rate': 2.279907703230387e-06, 'epoch': 1.63}


 54%|█████▍    | 6140/11268 [4:38:00<3:22:26,  2.37s/it]

{'loss': 0.3028, 'grad_norm': 65.53398895263672, 'learning_rate': 2.275470358537451e-06, 'epoch': 1.63}


 55%|█████▍    | 6150/11268 [4:38:24<3:29:31,  2.46s/it]

{'loss': 0.4098, 'grad_norm': 7.815479278564453, 'learning_rate': 2.2710330138445157e-06, 'epoch': 1.64}


 55%|█████▍    | 6160/11268 [4:38:49<3:28:29,  2.45s/it]

{'loss': 0.1592, 'grad_norm': 32.87179946899414, 'learning_rate': 2.26659566915158e-06, 'epoch': 1.64}


 55%|█████▍    | 6170/11268 [4:39:13<3:21:49,  2.38s/it]

{'loss': 0.4901, 'grad_norm': 41.67914962768555, 'learning_rate': 2.2621583244586443e-06, 'epoch': 1.64}


 55%|█████▍    | 6180/11268 [4:39:37<3:24:42,  2.41s/it]

{'loss': 0.3775, 'grad_norm': 131.9647674560547, 'learning_rate': 2.2577209797657084e-06, 'epoch': 1.65}


 55%|█████▍    | 6190/11268 [4:40:01<3:23:52,  2.41s/it]

{'loss': 0.3421, 'grad_norm': 5.2909159660339355, 'learning_rate': 2.2532836350727725e-06, 'epoch': 1.65}


 55%|█████▌    | 6200/11268 [4:40:26<3:30:07,  2.49s/it]

{'loss': 0.5184, 'grad_norm': 2.528217077255249, 'learning_rate': 2.248846290379837e-06, 'epoch': 1.65}


 55%|█████▌    | 6210/11268 [4:40:50<3:28:57,  2.48s/it]

{'loss': 0.4295, 'grad_norm': 200.53085327148438, 'learning_rate': 2.244408945686901e-06, 'epoch': 1.65}


 55%|█████▌    | 6220/11268 [4:41:16<3:32:53,  2.53s/it]

{'loss': 0.4585, 'grad_norm': 87.9400405883789, 'learning_rate': 2.2399716009939653e-06, 'epoch': 1.66}


 55%|█████▌    | 6230/11268 [4:41:41<3:29:39,  2.50s/it]

{'loss': 0.3509, 'grad_norm': 67.28507995605469, 'learning_rate': 2.2355342563010298e-06, 'epoch': 1.66}


 55%|█████▌    | 6240/11268 [4:42:07<3:26:13,  2.46s/it]

{'loss': 0.5828, 'grad_norm': 44.125972747802734, 'learning_rate': 2.231096911608094e-06, 'epoch': 1.66}


 55%|█████▌    | 6250/11268 [4:42:31<3:23:56,  2.44s/it]

{'loss': 0.2661, 'grad_norm': 8.366595268249512, 'learning_rate': 2.226659566915158e-06, 'epoch': 1.66}


 56%|█████▌    | 6260/11268 [4:42:55<3:19:21,  2.39s/it]

{'loss': 0.2754, 'grad_norm': 17.8718318939209, 'learning_rate': 2.222222222222222e-06, 'epoch': 1.67}


 56%|█████▌    | 6270/11268 [4:43:19<3:20:07,  2.40s/it]

{'loss': 0.3202, 'grad_norm': 25.51759147644043, 'learning_rate': 2.2177848775292866e-06, 'epoch': 1.67}


 56%|█████▌    | 6280/11268 [4:43:43<3:18:27,  2.39s/it]

{'loss': 0.3134, 'grad_norm': 116.83739471435547, 'learning_rate': 2.213347532836351e-06, 'epoch': 1.67}


 56%|█████▌    | 6290/11268 [4:44:08<3:16:07,  2.36s/it]

{'loss': 0.3301, 'grad_norm': 55.138797760009766, 'learning_rate': 2.2089101881434152e-06, 'epoch': 1.67}


 56%|█████▌    | 6300/11268 [4:44:31<3:13:10,  2.33s/it]

{'loss': 0.4146, 'grad_norm': 62.49728775024414, 'learning_rate': 2.2044728434504793e-06, 'epoch': 1.68}


 56%|█████▌    | 6310/11268 [4:44:55<3:12:29,  2.33s/it]

{'loss': 0.3644, 'grad_norm': 39.887271881103516, 'learning_rate': 2.200035498757544e-06, 'epoch': 1.68}


 56%|█████▌    | 6320/11268 [4:45:18<3:13:13,  2.34s/it]

{'loss': 0.2316, 'grad_norm': 51.77403259277344, 'learning_rate': 2.195598154064608e-06, 'epoch': 1.68}


 56%|█████▌    | 6330/11268 [4:45:42<3:13:29,  2.35s/it]

{'loss': 0.2236, 'grad_norm': 29.805233001708984, 'learning_rate': 2.191160809371672e-06, 'epoch': 1.69}


 56%|█████▋    | 6340/11268 [4:46:05<3:13:22,  2.35s/it]

{'loss': 0.3946, 'grad_norm': 110.7286376953125, 'learning_rate': 2.1867234646787366e-06, 'epoch': 1.69}


 56%|█████▋    | 6350/11268 [4:46:31<3:25:49,  2.51s/it]

{'loss': 0.397, 'grad_norm': 7.167260646820068, 'learning_rate': 2.1822861199858007e-06, 'epoch': 1.69}


 56%|█████▋    | 6360/11268 [4:46:55<3:18:48,  2.43s/it]

{'loss': 0.2771, 'grad_norm': 35.95744705200195, 'learning_rate': 2.1778487752928648e-06, 'epoch': 1.69}


 57%|█████▋    | 6370/11268 [4:47:22<3:47:22,  2.79s/it]

{'loss': 0.5002, 'grad_norm': 79.4239273071289, 'learning_rate': 2.1734114305999293e-06, 'epoch': 1.7}


 57%|█████▋    | 6380/11268 [4:47:49<3:38:27,  2.68s/it]

{'loss': 0.438, 'grad_norm': 25.363061904907227, 'learning_rate': 2.1689740859069934e-06, 'epoch': 1.7}


 57%|█████▋    | 6390/11268 [4:48:14<3:21:35,  2.48s/it]

{'loss': 0.3968, 'grad_norm': 40.0933837890625, 'learning_rate': 2.164536741214058e-06, 'epoch': 1.7}


 57%|█████▋    | 6400/11268 [4:48:39<3:30:40,  2.60s/it]

{'loss': 0.408, 'grad_norm': 54.05282974243164, 'learning_rate': 2.160099396521122e-06, 'epoch': 1.7}


 57%|█████▋    | 6410/11268 [4:49:05<3:28:45,  2.58s/it]

{'loss': 0.4197, 'grad_norm': 38.28696823120117, 'learning_rate': 2.155662051828186e-06, 'epoch': 1.71}


 57%|█████▋    | 6420/11268 [4:49:28<3:07:34,  2.32s/it]

{'loss': 0.3579, 'grad_norm': 21.76958465576172, 'learning_rate': 2.1512247071352506e-06, 'epoch': 1.71}


 57%|█████▋    | 6430/11268 [4:49:51<3:10:04,  2.36s/it]

{'loss': 0.4702, 'grad_norm': 47.16743087768555, 'learning_rate': 2.1467873624423147e-06, 'epoch': 1.71}


 57%|█████▋    | 6440/11268 [4:50:16<3:14:07,  2.41s/it]

{'loss': 0.377, 'grad_norm': 49.87248229980469, 'learning_rate': 2.142350017749379e-06, 'epoch': 1.71}


 57%|█████▋    | 6450/11268 [4:50:43<3:38:59,  2.73s/it]

{'loss': 0.4398, 'grad_norm': 6.606771469116211, 'learning_rate': 2.1379126730564433e-06, 'epoch': 1.72}


 57%|█████▋    | 6460/11268 [4:51:13<4:20:57,  3.26s/it]

{'loss': 0.5081, 'grad_norm': 47.57036209106445, 'learning_rate': 2.1334753283635074e-06, 'epoch': 1.72}


 57%|█████▋    | 6470/11268 [4:51:44<3:50:10,  2.88s/it]

{'loss': 0.2404, 'grad_norm': 43.59548568725586, 'learning_rate': 2.1290379836705715e-06, 'epoch': 1.72}


 58%|█████▊    | 6480/11268 [4:52:11<3:33:35,  2.68s/it]

{'loss': 0.349, 'grad_norm': 76.43495178222656, 'learning_rate': 2.124600638977636e-06, 'epoch': 1.73}


 58%|█████▊    | 6490/11268 [4:52:38<3:32:29,  2.67s/it]

{'loss': 0.3139, 'grad_norm': 9.13902473449707, 'learning_rate': 2.1201632942847e-06, 'epoch': 1.73}


 58%|█████▊    | 6500/11268 [4:53:04<3:27:35,  2.61s/it]

{'loss': 0.3026, 'grad_norm': 6.037651538848877, 'learning_rate': 2.1157259495917647e-06, 'epoch': 1.73}


 58%|█████▊    | 6510/11268 [4:53:31<3:29:45,  2.65s/it]

{'loss': 0.3791, 'grad_norm': 25.021121978759766, 'learning_rate': 2.1112886048988288e-06, 'epoch': 1.73}


 58%|█████▊    | 6520/11268 [4:53:59<3:58:08,  3.01s/it]

{'loss': 0.3124, 'grad_norm': 7.0810065269470215, 'learning_rate': 2.106851260205893e-06, 'epoch': 1.74}


 58%|█████▊    | 6530/11268 [4:54:28<3:47:41,  2.88s/it]

{'loss': 0.3059, 'grad_norm': 71.46830749511719, 'learning_rate': 2.1024139155129574e-06, 'epoch': 1.74}


 58%|█████▊    | 6540/11268 [4:54:55<3:32:41,  2.70s/it]

{'loss': 0.2401, 'grad_norm': 35.76477813720703, 'learning_rate': 2.0979765708200215e-06, 'epoch': 1.74}


 58%|█████▊    | 6550/11268 [4:55:22<3:27:34,  2.64s/it]

{'loss': 0.4158, 'grad_norm': 64.08877563476562, 'learning_rate': 2.0935392261270856e-06, 'epoch': 1.74}


 58%|█████▊    | 6560/11268 [4:55:49<3:36:30,  2.76s/it]

{'loss': 0.2728, 'grad_norm': 0.7380656599998474, 'learning_rate': 2.0891018814341497e-06, 'epoch': 1.75}


 58%|█████▊    | 6570/11268 [4:56:16<3:29:21,  2.67s/it]

{'loss': 0.3754, 'grad_norm': 20.05269432067871, 'learning_rate': 2.084664536741214e-06, 'epoch': 1.75}


 58%|█████▊    | 6580/11268 [4:56:43<3:30:24,  2.69s/it]

{'loss': 0.2884, 'grad_norm': 8.07951831817627, 'learning_rate': 2.0802271920482787e-06, 'epoch': 1.75}


 58%|█████▊    | 6590/11268 [4:57:10<3:30:15,  2.70s/it]

{'loss': 0.5066, 'grad_norm': 33.27769470214844, 'learning_rate': 2.075789847355343e-06, 'epoch': 1.75}


 59%|█████▊    | 6600/11268 [4:57:37<3:28:47,  2.68s/it]

{'loss': 0.2625, 'grad_norm': 54.88376998901367, 'learning_rate': 2.071352502662407e-06, 'epoch': 1.76}


 59%|█████▊    | 6610/11268 [4:58:05<3:36:26,  2.79s/it]

{'loss': 0.2864, 'grad_norm': 83.93997192382812, 'learning_rate': 2.0669151579694714e-06, 'epoch': 1.76}


 59%|█████▉    | 6620/11268 [4:58:33<3:38:13,  2.82s/it]

{'loss': 0.243, 'grad_norm': 28.251733779907227, 'learning_rate': 2.0624778132765355e-06, 'epoch': 1.76}


 59%|█████▉    | 6630/11268 [4:59:00<3:32:24,  2.75s/it]

{'loss': 0.3056, 'grad_norm': 11.35056209564209, 'learning_rate': 2.0580404685835996e-06, 'epoch': 1.77}


 59%|█████▉    | 6640/11268 [4:59:28<3:34:57,  2.79s/it]

{'loss': 0.3891, 'grad_norm': 39.22128677368164, 'learning_rate': 2.053603123890664e-06, 'epoch': 1.77}


 59%|█████▉    | 6650/11268 [4:59:56<3:40:22,  2.86s/it]

{'loss': 0.4854, 'grad_norm': 24.096765518188477, 'learning_rate': 2.0491657791977283e-06, 'epoch': 1.77}


 59%|█████▉    | 6660/11268 [5:00:23<3:32:54,  2.77s/it]

{'loss': 0.3963, 'grad_norm': 42.6016731262207, 'learning_rate': 2.0447284345047924e-06, 'epoch': 1.77}


 59%|█████▉    | 6670/11268 [5:00:51<3:37:04,  2.83s/it]

{'loss': 0.3167, 'grad_norm': 30.725244522094727, 'learning_rate': 2.0402910898118565e-06, 'epoch': 1.78}


 59%|█████▉    | 6680/11268 [5:01:19<3:26:46,  2.70s/it]

{'loss': 0.325, 'grad_norm': 34.02895736694336, 'learning_rate': 2.035853745118921e-06, 'epoch': 1.78}


 59%|█████▉    | 6690/11268 [5:01:46<3:24:55,  2.69s/it]

{'loss': 0.3661, 'grad_norm': 19.365659713745117, 'learning_rate': 2.0314164004259855e-06, 'epoch': 1.78}


 59%|█████▉    | 6700/11268 [5:02:13<3:24:05,  2.68s/it]

{'loss': 0.515, 'grad_norm': 61.94493865966797, 'learning_rate': 2.0269790557330496e-06, 'epoch': 1.78}


 60%|█████▉    | 6710/11268 [5:02:40<3:22:31,  2.67s/it]

{'loss': 0.4211, 'grad_norm': 85.5712890625, 'learning_rate': 2.0225417110401137e-06, 'epoch': 1.79}


 60%|█████▉    | 6720/11268 [5:03:07<3:35:30,  2.84s/it]

{'loss': 0.3279, 'grad_norm': 43.722721099853516, 'learning_rate': 2.0181043663471782e-06, 'epoch': 1.79}


 60%|█████▉    | 6730/11268 [5:03:34<3:26:31,  2.73s/it]

{'loss': 0.2575, 'grad_norm': 90.56026458740234, 'learning_rate': 2.0136670216542423e-06, 'epoch': 1.79}


 60%|█████▉    | 6740/11268 [5:04:02<3:31:05,  2.80s/it]

{'loss': 0.4455, 'grad_norm': 8.048587799072266, 'learning_rate': 2.0092296769613064e-06, 'epoch': 1.79}


 60%|█████▉    | 6750/11268 [5:04:31<3:32:09,  2.82s/it]

{'loss': 0.3261, 'grad_norm': 41.22207260131836, 'learning_rate': 2.0047923322683705e-06, 'epoch': 1.8}


 60%|█████▉    | 6760/11268 [5:04:59<3:25:05,  2.73s/it]

{'loss': 0.4426, 'grad_norm': 29.55959701538086, 'learning_rate': 2.000354987575435e-06, 'epoch': 1.8}


 60%|██████    | 6770/11268 [5:05:26<3:18:33,  2.65s/it]

{'loss': 0.4013, 'grad_norm': 45.04960632324219, 'learning_rate': 1.995917642882499e-06, 'epoch': 1.8}


 60%|██████    | 6780/11268 [5:05:52<3:21:14,  2.69s/it]

{'loss': 0.4687, 'grad_norm': 63.23904037475586, 'learning_rate': 1.9914802981895632e-06, 'epoch': 1.81}


 60%|██████    | 6790/11268 [5:06:20<3:31:03,  2.83s/it]

{'loss': 0.465, 'grad_norm': 49.02014923095703, 'learning_rate': 1.9870429534966278e-06, 'epoch': 1.81}


 60%|██████    | 6800/11268 [5:06:48<3:30:33,  2.83s/it]

{'loss': 0.2825, 'grad_norm': 29.04024314880371, 'learning_rate': 1.9826056088036923e-06, 'epoch': 1.81}


 60%|██████    | 6810/11268 [5:07:16<3:24:11,  2.75s/it]

{'loss': 0.4703, 'grad_norm': 97.53395080566406, 'learning_rate': 1.9781682641107564e-06, 'epoch': 1.81}


 61%|██████    | 6820/11268 [5:07:44<3:24:06,  2.75s/it]

{'loss': 0.3025, 'grad_norm': 39.52357482910156, 'learning_rate': 1.9737309194178205e-06, 'epoch': 1.82}


 61%|██████    | 6830/11268 [5:08:11<3:14:48,  2.63s/it]

{'loss': 0.4479, 'grad_norm': 43.74717712402344, 'learning_rate': 1.969293574724885e-06, 'epoch': 1.82}


 61%|██████    | 6840/11268 [5:08:38<3:28:50,  2.83s/it]

{'loss': 0.2219, 'grad_norm': 51.20553970336914, 'learning_rate': 1.964856230031949e-06, 'epoch': 1.82}


 61%|██████    | 6850/11268 [5:09:06<3:20:42,  2.73s/it]

{'loss': 0.4179, 'grad_norm': 43.11589050292969, 'learning_rate': 1.960418885339013e-06, 'epoch': 1.82}


 61%|██████    | 6860/11268 [5:09:33<3:18:06,  2.70s/it]

{'loss': 0.4402, 'grad_norm': 15.990487098693848, 'learning_rate': 1.9559815406460773e-06, 'epoch': 1.83}


 61%|██████    | 6870/11268 [5:10:00<3:17:30,  2.69s/it]

{'loss': 0.2956, 'grad_norm': 79.59407806396484, 'learning_rate': 1.951544195953142e-06, 'epoch': 1.83}


 61%|██████    | 6880/11268 [5:10:27<3:15:07,  2.67s/it]

{'loss': 0.2556, 'grad_norm': 79.91435241699219, 'learning_rate': 1.947106851260206e-06, 'epoch': 1.83}


 61%|██████    | 6890/11268 [5:10:54<3:17:01,  2.70s/it]

{'loss': 0.3791, 'grad_norm': 36.0528678894043, 'learning_rate': 1.9426695065672704e-06, 'epoch': 1.83}


 61%|██████    | 6900/11268 [5:11:21<3:15:53,  2.69s/it]

{'loss': 0.5045, 'grad_norm': 53.18592834472656, 'learning_rate': 1.9382321618743345e-06, 'epoch': 1.84}


 61%|██████▏   | 6910/11268 [5:11:48<3:16:35,  2.71s/it]

{'loss': 0.251, 'grad_norm': 28.174659729003906, 'learning_rate': 1.933794817181399e-06, 'epoch': 1.84}


 61%|██████▏   | 6920/11268 [5:12:15<3:12:49,  2.66s/it]

{'loss': 0.3903, 'grad_norm': 91.26044464111328, 'learning_rate': 1.929357472488463e-06, 'epoch': 1.84}


 62%|██████▏   | 6930/11268 [5:12:42<3:12:35,  2.66s/it]

{'loss': 0.2075, 'grad_norm': 52.09424591064453, 'learning_rate': 1.9249201277955272e-06, 'epoch': 1.85}


 62%|██████▏   | 6940/11268 [5:13:09<3:11:40,  2.66s/it]

{'loss': 0.3913, 'grad_norm': 46.893714904785156, 'learning_rate': 1.9204827831025918e-06, 'epoch': 1.85}


 62%|██████▏   | 6950/11268 [5:13:36<3:11:01,  2.65s/it]

{'loss': 0.1606, 'grad_norm': 85.90911102294922, 'learning_rate': 1.916045438409656e-06, 'epoch': 1.85}


 62%|██████▏   | 6960/11268 [5:14:03<3:13:14,  2.69s/it]

{'loss': 0.2813, 'grad_norm': 2.6444146633148193, 'learning_rate': 1.91160809371672e-06, 'epoch': 1.85}


 62%|██████▏   | 6970/11268 [5:14:30<3:13:25,  2.70s/it]

{'loss': 0.3612, 'grad_norm': 0.4849480092525482, 'learning_rate': 1.9071707490237843e-06, 'epoch': 1.86}


 62%|██████▏   | 6980/11268 [5:14:58<3:18:36,  2.78s/it]

{'loss': 0.3383, 'grad_norm': 81.86653137207031, 'learning_rate': 1.9027334043308484e-06, 'epoch': 1.86}


 62%|██████▏   | 6990/11268 [5:15:25<3:12:34,  2.70s/it]

{'loss': 0.5331, 'grad_norm': 70.43437194824219, 'learning_rate': 1.8982960596379127e-06, 'epoch': 1.86}


 62%|██████▏   | 7000/11268 [5:15:52<3:15:06,  2.74s/it]

{'loss': 0.3872, 'grad_norm': 93.09194946289062, 'learning_rate': 1.8938587149449772e-06, 'epoch': 1.86}


 62%|██████▏   | 7010/11268 [5:16:19<3:10:30,  2.68s/it]

{'loss': 0.4318, 'grad_norm': 1.219547152519226, 'learning_rate': 1.8894213702520415e-06, 'epoch': 1.87}


 62%|██████▏   | 7020/11268 [5:16:47<3:11:09,  2.70s/it]

{'loss': 0.4622, 'grad_norm': 77.21363067626953, 'learning_rate': 1.8849840255591056e-06, 'epoch': 1.87}


 62%|██████▏   | 7030/11268 [5:17:14<3:12:30,  2.73s/it]

{'loss': 0.2822, 'grad_norm': 14.271868705749512, 'learning_rate': 1.88054668086617e-06, 'epoch': 1.87}


 62%|██████▏   | 7040/11268 [5:17:42<3:12:58,  2.74s/it]

{'loss': 0.4507, 'grad_norm': 44.3095703125, 'learning_rate': 1.876109336173234e-06, 'epoch': 1.87}


 63%|██████▎   | 7050/11268 [5:18:09<3:13:29,  2.75s/it]

{'loss': 0.3678, 'grad_norm': 59.71763610839844, 'learning_rate': 1.8716719914802983e-06, 'epoch': 1.88}


 63%|██████▎   | 7060/11268 [5:18:36<3:07:03,  2.67s/it]

{'loss': 0.531, 'grad_norm': 84.87511444091797, 'learning_rate': 1.8672346467873626e-06, 'epoch': 1.88}


 63%|██████▎   | 7070/11268 [5:19:04<3:16:08,  2.80s/it]

{'loss': 0.2502, 'grad_norm': 29.216161727905273, 'learning_rate': 1.8627973020944267e-06, 'epoch': 1.88}


 63%|██████▎   | 7080/11268 [5:19:32<3:16:36,  2.82s/it]

{'loss': 0.34, 'grad_norm': 25.902713775634766, 'learning_rate': 1.858359957401491e-06, 'epoch': 1.88}


 63%|██████▎   | 7090/11268 [5:19:59<3:06:16,  2.67s/it]

{'loss': 0.3078, 'grad_norm': 76.64691162109375, 'learning_rate': 1.8539226127085551e-06, 'epoch': 1.89}


 63%|██████▎   | 7100/11268 [5:20:26<3:07:17,  2.70s/it]

{'loss': 0.2755, 'grad_norm': 50.414451599121094, 'learning_rate': 1.8494852680156197e-06, 'epoch': 1.89}


 63%|██████▎   | 7110/11268 [5:20:54<3:14:51,  2.81s/it]

{'loss': 0.3238, 'grad_norm': 41.31875228881836, 'learning_rate': 1.845047923322684e-06, 'epoch': 1.89}


 63%|██████▎   | 7120/11268 [5:21:23<3:12:15,  2.78s/it]

{'loss': 0.3134, 'grad_norm': 4.913923263549805, 'learning_rate': 1.840610578629748e-06, 'epoch': 1.9}


 63%|██████▎   | 7130/11268 [5:21:50<3:05:38,  2.69s/it]

{'loss': 0.362, 'grad_norm': 64.60235595703125, 'learning_rate': 1.8361732339368124e-06, 'epoch': 1.9}


 63%|██████▎   | 7140/11268 [5:22:16<3:02:14,  2.65s/it]

{'loss': 0.2105, 'grad_norm': 32.756526947021484, 'learning_rate': 1.8317358892438767e-06, 'epoch': 1.9}


 63%|██████▎   | 7150/11268 [5:22:44<3:05:35,  2.70s/it]

{'loss': 0.5158, 'grad_norm': 42.97222900390625, 'learning_rate': 1.8272985445509408e-06, 'epoch': 1.9}


 64%|██████▎   | 7160/11268 [5:23:11<3:02:49,  2.67s/it]

{'loss': 0.4249, 'grad_norm': 66.98497009277344, 'learning_rate': 1.8228611998580051e-06, 'epoch': 1.91}


 64%|██████▎   | 7170/11268 [5:23:38<3:07:47,  2.75s/it]

{'loss': 0.3424, 'grad_norm': 52.125022888183594, 'learning_rate': 1.8184238551650692e-06, 'epoch': 1.91}


 64%|██████▎   | 7180/11268 [5:24:07<3:11:39,  2.81s/it]

{'loss': 0.2219, 'grad_norm': 40.963932037353516, 'learning_rate': 1.8139865104721335e-06, 'epoch': 1.91}


 64%|██████▍   | 7190/11268 [5:24:35<3:15:18,  2.87s/it]

{'loss': 0.4898, 'grad_norm': 26.110610961914062, 'learning_rate': 1.8095491657791978e-06, 'epoch': 1.91}


 64%|██████▍   | 7200/11268 [5:25:03<3:07:52,  2.77s/it]

{'loss': 0.4794, 'grad_norm': 62.95829391479492, 'learning_rate': 1.8051118210862623e-06, 'epoch': 1.92}


 64%|██████▍   | 7210/11268 [5:25:31<3:09:30,  2.80s/it]

{'loss': 0.4656, 'grad_norm': 69.13304901123047, 'learning_rate': 1.8006744763933264e-06, 'epoch': 1.92}


 64%|██████▍   | 7220/11268 [5:25:59<3:07:08,  2.77s/it]

{'loss': 0.5496, 'grad_norm': 82.6933364868164, 'learning_rate': 1.7962371317003908e-06, 'epoch': 1.92}


 64%|██████▍   | 7230/11268 [5:26:27<3:15:47,  2.91s/it]

{'loss': 0.2939, 'grad_norm': 37.5575065612793, 'learning_rate': 1.7917997870074549e-06, 'epoch': 1.92}


 64%|██████▍   | 7240/11268 [5:26:54<3:05:11,  2.76s/it]

{'loss': 0.3707, 'grad_norm': 49.96902847290039, 'learning_rate': 1.7873624423145192e-06, 'epoch': 1.93}


 64%|██████▍   | 7250/11268 [5:27:22<3:08:16,  2.81s/it]

{'loss': 0.2701, 'grad_norm': 79.97505950927734, 'learning_rate': 1.7829250976215835e-06, 'epoch': 1.93}


 64%|██████▍   | 7260/11268 [5:27:50<3:10:06,  2.85s/it]

{'loss': 0.4132, 'grad_norm': 57.69758605957031, 'learning_rate': 1.7784877529286476e-06, 'epoch': 1.93}


 65%|██████▍   | 7270/11268 [5:28:22<3:17:42,  2.97s/it]

{'loss': 0.2273, 'grad_norm': 0.8374597430229187, 'learning_rate': 1.7740504082357119e-06, 'epoch': 1.94}


 65%|██████▍   | 7280/11268 [5:28:51<3:07:17,  2.82s/it]

{'loss': 0.329, 'grad_norm': 13.859219551086426, 'learning_rate': 1.769613063542776e-06, 'epoch': 1.94}


 65%|██████▍   | 7290/11268 [5:29:19<3:04:28,  2.78s/it]

{'loss': 0.3534, 'grad_norm': 56.059635162353516, 'learning_rate': 1.7651757188498403e-06, 'epoch': 1.94}


 65%|██████▍   | 7300/11268 [5:29:47<3:03:21,  2.77s/it]

{'loss': 0.461, 'grad_norm': 48.91998291015625, 'learning_rate': 1.7607383741569046e-06, 'epoch': 1.94}


 65%|██████▍   | 7310/11268 [5:30:15<2:58:47,  2.71s/it]

{'loss': 0.5919, 'grad_norm': 128.54881286621094, 'learning_rate': 1.7563010294639691e-06, 'epoch': 1.95}


 65%|██████▍   | 7320/11268 [5:30:43<3:01:09,  2.75s/it]

{'loss': 0.3616, 'grad_norm': 59.10694122314453, 'learning_rate': 1.7518636847710332e-06, 'epoch': 1.95}


 65%|██████▌   | 7330/11268 [5:31:10<2:56:20,  2.69s/it]

{'loss': 0.4074, 'grad_norm': 49.39072036743164, 'learning_rate': 1.7474263400780975e-06, 'epoch': 1.95}


 65%|██████▌   | 7340/11268 [5:31:37<2:56:45,  2.70s/it]

{'loss': 0.3047, 'grad_norm': 21.689308166503906, 'learning_rate': 1.7429889953851616e-06, 'epoch': 1.95}


 65%|██████▌   | 7350/11268 [5:32:04<2:56:26,  2.70s/it]

{'loss': 0.2323, 'grad_norm': 97.16775512695312, 'learning_rate': 1.738551650692226e-06, 'epoch': 1.96}


 65%|██████▌   | 7360/11268 [5:32:31<2:53:32,  2.66s/it]

{'loss': 0.3336, 'grad_norm': 39.6245231628418, 'learning_rate': 1.7341143059992903e-06, 'epoch': 1.96}


 65%|██████▌   | 7370/11268 [5:32:58<2:56:22,  2.71s/it]

{'loss': 0.3158, 'grad_norm': 81.37751770019531, 'learning_rate': 1.7296769613063543e-06, 'epoch': 1.96}


 65%|██████▌   | 7380/11268 [5:33:25<2:53:44,  2.68s/it]

{'loss': 0.3438, 'grad_norm': 8.210901260375977, 'learning_rate': 1.7252396166134187e-06, 'epoch': 1.96}


 66%|██████▌   | 7390/11268 [5:33:53<3:01:35,  2.81s/it]

{'loss': 0.4774, 'grad_norm': 60.9276123046875, 'learning_rate': 1.7208022719204828e-06, 'epoch': 1.97}


 66%|██████▌   | 7400/11268 [5:34:21<3:02:23,  2.83s/it]

{'loss': 0.2424, 'grad_norm': 36.85103225708008, 'learning_rate': 1.716364927227547e-06, 'epoch': 1.97}


 66%|██████▌   | 7410/11268 [5:34:49<2:54:40,  2.72s/it]

{'loss': 0.2775, 'grad_norm': 1.969171404838562, 'learning_rate': 1.7119275825346116e-06, 'epoch': 1.97}


 66%|██████▌   | 7420/11268 [5:35:16<2:51:59,  2.68s/it]

{'loss': 0.271, 'grad_norm': 20.650897979736328, 'learning_rate': 1.7074902378416757e-06, 'epoch': 1.98}


 66%|██████▌   | 7430/11268 [5:35:44<2:59:11,  2.80s/it]

{'loss': 0.4138, 'grad_norm': 45.896629333496094, 'learning_rate': 1.70305289314874e-06, 'epoch': 1.98}


 66%|██████▌   | 7440/11268 [5:36:13<3:04:33,  2.89s/it]

{'loss': 0.3364, 'grad_norm': 32.696041107177734, 'learning_rate': 1.6986155484558043e-06, 'epoch': 1.98}


 66%|██████▌   | 7450/11268 [5:36:41<3:00:30,  2.84s/it]

{'loss': 0.5304, 'grad_norm': 22.776897430419922, 'learning_rate': 1.6941782037628684e-06, 'epoch': 1.98}


 66%|██████▌   | 7460/11268 [5:37:09<2:57:34,  2.80s/it]

{'loss': 0.2505, 'grad_norm': 1.038737416267395, 'learning_rate': 1.6897408590699327e-06, 'epoch': 1.99}


 66%|██████▋   | 7470/11268 [5:37:37<2:54:42,  2.76s/it]

{'loss': 0.544, 'grad_norm': 58.575130462646484, 'learning_rate': 1.6853035143769968e-06, 'epoch': 1.99}


 66%|██████▋   | 7480/11268 [5:38:05<2:53:14,  2.74s/it]

{'loss': 0.3231, 'grad_norm': 73.77245330810547, 'learning_rate': 1.6808661696840611e-06, 'epoch': 1.99}


 66%|██████▋   | 7490/11268 [5:38:34<3:00:19,  2.86s/it]

{'loss': 0.31, 'grad_norm': 45.046348571777344, 'learning_rate': 1.6764288249911254e-06, 'epoch': 1.99}


 67%|██████▋   | 7500/11268 [5:39:01<2:50:33,  2.72s/it]

{'loss': 0.3738, 'grad_norm': 28.5229549407959, 'learning_rate': 1.6719914802981895e-06, 'epoch': 2.0}


 67%|██████▋   | 7510/11268 [5:39:29<2:52:51,  2.76s/it]

{'loss': 0.3686, 'grad_norm': 65.60608673095703, 'learning_rate': 1.667554135605254e-06, 'epoch': 2.0}


                                                        
 67%|██████▋   | 7512/11268 [5:46:16<2:30:20,  2.40s/it]

{'eval_loss': 0.47438761591911316, 'eval_runtime': 403.5495, 'eval_samples_per_second': 9.307, 'eval_steps_per_second': 1.165, 'epoch': 2.0}


 67%|██████▋   | 7520/11268 [5:46:42<13:24:34, 12.88s/it]  

{'loss': 0.3166, 'grad_norm': 26.378429412841797, 'learning_rate': 1.6631167909123184e-06, 'epoch': 2.0}


 67%|██████▋   | 7530/11268 [5:47:11<3:13:09,  3.10s/it] 

{'loss': 0.4263, 'grad_norm': 91.15678405761719, 'learning_rate': 1.6586794462193825e-06, 'epoch': 2.0}


 67%|██████▋   | 7540/11268 [5:47:42<3:24:57,  3.30s/it]

{'loss': 0.4199, 'grad_norm': 21.11326026916504, 'learning_rate': 1.6542421015264468e-06, 'epoch': 2.01}


 67%|██████▋   | 7550/11268 [5:48:14<3:10:07,  3.07s/it]

{'loss': 0.5184, 'grad_norm': 41.017635345458984, 'learning_rate': 1.649804756833511e-06, 'epoch': 2.01}


 67%|██████▋   | 7560/11268 [5:48:42<2:52:49,  2.80s/it]

{'loss': 0.3005, 'grad_norm': 114.3299560546875, 'learning_rate': 1.6453674121405752e-06, 'epoch': 2.01}


 67%|██████▋   | 7570/11268 [5:49:11<3:00:08,  2.92s/it]

{'loss': 0.2585, 'grad_norm': 37.46210861206055, 'learning_rate': 1.6409300674476395e-06, 'epoch': 2.02}


 67%|██████▋   | 7580/11268 [5:49:40<3:00:27,  2.94s/it]

{'loss': 0.2475, 'grad_norm': 39.61613082885742, 'learning_rate': 1.6364927227547036e-06, 'epoch': 2.02}


 67%|██████▋   | 7590/11268 [5:50:08<2:50:17,  2.78s/it]

{'loss': 0.1778, 'grad_norm': 6.157140254974365, 'learning_rate': 1.632055378061768e-06, 'epoch': 2.02}


 67%|██████▋   | 7600/11268 [5:50:36<2:48:46,  2.76s/it]

{'loss': 0.3512, 'grad_norm': 2.3043344020843506, 'learning_rate': 1.6276180333688322e-06, 'epoch': 2.02}


 68%|██████▊   | 7610/11268 [5:51:04<2:51:43,  2.82s/it]

{'loss': 0.4343, 'grad_norm': 30.168964385986328, 'learning_rate': 1.6231806886758963e-06, 'epoch': 2.03}


 68%|██████▊   | 7620/11268 [5:51:34<2:57:33,  2.92s/it]

{'loss': 0.4734, 'grad_norm': 11.638833999633789, 'learning_rate': 1.6187433439829608e-06, 'epoch': 2.03}


 68%|██████▊   | 7630/11268 [5:52:02<2:49:43,  2.80s/it]

{'loss': 0.5308, 'grad_norm': 71.37164306640625, 'learning_rate': 1.6143059992900251e-06, 'epoch': 2.03}


 68%|██████▊   | 7640/11268 [5:52:30<2:45:12,  2.73s/it]

{'loss': 0.1341, 'grad_norm': 26.738872528076172, 'learning_rate': 1.6098686545970892e-06, 'epoch': 2.03}


 68%|██████▊   | 7650/11268 [5:52:58<2:47:41,  2.78s/it]

{'loss': 0.2951, 'grad_norm': 40.717315673828125, 'learning_rate': 1.6054313099041535e-06, 'epoch': 2.04}


 68%|██████▊   | 7660/11268 [5:53:27<2:57:39,  2.95s/it]

{'loss': 0.3577, 'grad_norm': 14.68603515625, 'learning_rate': 1.6009939652112176e-06, 'epoch': 2.04}


 68%|██████▊   | 7670/11268 [5:53:55<2:50:03,  2.84s/it]

{'loss': 0.4244, 'grad_norm': 22.423070907592773, 'learning_rate': 1.596556620518282e-06, 'epoch': 2.04}


 68%|██████▊   | 7680/11268 [5:54:23<2:46:43,  2.79s/it]

{'loss': 0.4115, 'grad_norm': 62.12194061279297, 'learning_rate': 1.5921192758253463e-06, 'epoch': 2.04}


 68%|██████▊   | 7690/11268 [5:54:51<2:46:15,  2.79s/it]

{'loss': 0.3723, 'grad_norm': 29.944477081298828, 'learning_rate': 1.5876819311324104e-06, 'epoch': 2.05}


 68%|██████▊   | 7700/11268 [5:55:20<2:46:06,  2.79s/it]

{'loss': 0.2605, 'grad_norm': 3.1298699378967285, 'learning_rate': 1.5832445864394747e-06, 'epoch': 2.05}


 68%|██████▊   | 7710/11268 [5:55:48<2:52:00,  2.90s/it]

{'loss': 0.3164, 'grad_norm': 65.9732666015625, 'learning_rate': 1.5788072417465388e-06, 'epoch': 2.05}


 69%|██████▊   | 7720/11268 [5:56:17<2:48:08,  2.84s/it]

{'loss': 0.4169, 'grad_norm': 19.84172821044922, 'learning_rate': 1.5743698970536033e-06, 'epoch': 2.06}


 69%|██████▊   | 7730/11268 [5:56:45<2:46:24,  2.82s/it]

{'loss': 0.3071, 'grad_norm': 14.790493965148926, 'learning_rate': 1.5699325523606676e-06, 'epoch': 2.06}


 69%|██████▊   | 7740/11268 [5:57:13<2:44:02,  2.79s/it]

{'loss': 0.2838, 'grad_norm': 9.732852935791016, 'learning_rate': 1.565495207667732e-06, 'epoch': 2.06}


 69%|██████▉   | 7750/11268 [5:57:41<2:42:40,  2.77s/it]

{'loss': 0.2729, 'grad_norm': 50.517601013183594, 'learning_rate': 1.561057862974796e-06, 'epoch': 2.06}


 69%|██████▉   | 7760/11268 [5:58:09<2:39:25,  2.73s/it]

{'loss': 0.2735, 'grad_norm': 7.943413257598877, 'learning_rate': 1.5566205182818603e-06, 'epoch': 2.07}


 69%|██████▉   | 7770/11268 [5:58:36<2:37:58,  2.71s/it]

{'loss': 0.4377, 'grad_norm': 33.786155700683594, 'learning_rate': 1.5521831735889244e-06, 'epoch': 2.07}


 69%|██████▉   | 7780/11268 [5:59:03<2:36:26,  2.69s/it]

{'loss': 0.5391, 'grad_norm': 30.29157257080078, 'learning_rate': 1.5477458288959887e-06, 'epoch': 2.07}


 69%|██████▉   | 7790/11268 [5:59:32<2:43:36,  2.82s/it]

{'loss': 0.3222, 'grad_norm': 133.77589416503906, 'learning_rate': 1.543308484203053e-06, 'epoch': 2.07}


 69%|██████▉   | 7800/11268 [5:59:59<2:36:49,  2.71s/it]

{'loss': 0.198, 'grad_norm': 93.00948333740234, 'learning_rate': 1.5388711395101171e-06, 'epoch': 2.08}


 69%|██████▉   | 7810/11268 [6:00:27<2:40:28,  2.78s/it]

{'loss': 0.2997, 'grad_norm': 93.88084411621094, 'learning_rate': 1.5344337948171814e-06, 'epoch': 2.08}


 69%|██████▉   | 7820/11268 [6:00:55<2:41:29,  2.81s/it]

{'loss': 0.4361, 'grad_norm': 52.75319290161133, 'learning_rate': 1.529996450124246e-06, 'epoch': 2.08}


 69%|██████▉   | 7830/11268 [6:01:22<2:38:04,  2.76s/it]

{'loss': 0.2733, 'grad_norm': 0.6808093190193176, 'learning_rate': 1.52555910543131e-06, 'epoch': 2.08}


 70%|██████▉   | 7840/11268 [6:01:51<2:40:53,  2.82s/it]

{'loss': 0.2644, 'grad_norm': 73.07148742675781, 'learning_rate': 1.5211217607383744e-06, 'epoch': 2.09}


 70%|██████▉   | 7850/11268 [6:02:20<2:42:00,  2.84s/it]

{'loss': 0.2334, 'grad_norm': 79.00353240966797, 'learning_rate': 1.5166844160454387e-06, 'epoch': 2.09}


 70%|██████▉   | 7860/11268 [6:02:47<2:31:54,  2.67s/it]

{'loss': 0.3736, 'grad_norm': 43.11014175415039, 'learning_rate': 1.5122470713525028e-06, 'epoch': 2.09}


 70%|██████▉   | 7870/11268 [6:03:14<2:33:31,  2.71s/it]

{'loss': 0.2614, 'grad_norm': 39.60625076293945, 'learning_rate': 1.507809726659567e-06, 'epoch': 2.1}


 70%|██████▉   | 7880/11268 [6:03:41<2:31:37,  2.69s/it]

{'loss': 0.4369, 'grad_norm': 89.19490814208984, 'learning_rate': 1.5033723819666312e-06, 'epoch': 2.1}


 70%|███████   | 7890/11268 [6:04:08<2:28:06,  2.63s/it]

{'loss': 0.1336, 'grad_norm': 2.3707337379455566, 'learning_rate': 1.4989350372736955e-06, 'epoch': 2.1}


 70%|███████   | 7900/11268 [6:04:35<2:34:05,  2.75s/it]

{'loss': 0.1827, 'grad_norm': 14.61378002166748, 'learning_rate': 1.4944976925807598e-06, 'epoch': 2.1}


 70%|███████   | 7910/11268 [6:05:03<2:33:26,  2.74s/it]

{'loss': 0.405, 'grad_norm': 38.94618225097656, 'learning_rate': 1.490060347887824e-06, 'epoch': 2.11}


 70%|███████   | 7920/11268 [6:05:29<2:27:05,  2.64s/it]

{'loss': 0.2406, 'grad_norm': 44.02756118774414, 'learning_rate': 1.4856230031948882e-06, 'epoch': 2.11}


 70%|███████   | 7930/11268 [6:05:56<2:27:21,  2.65s/it]

{'loss': 0.4707, 'grad_norm': 50.54716491699219, 'learning_rate': 1.4811856585019527e-06, 'epoch': 2.11}


 70%|███████   | 7940/11268 [6:06:24<2:32:00,  2.74s/it]

{'loss': 0.2586, 'grad_norm': 172.08480834960938, 'learning_rate': 1.4767483138090168e-06, 'epoch': 2.11}


 71%|███████   | 7950/11268 [6:06:51<2:32:33,  2.76s/it]

{'loss': 0.5491, 'grad_norm': 60.15922927856445, 'learning_rate': 1.4723109691160812e-06, 'epoch': 2.12}


 71%|███████   | 7960/11268 [6:07:22<2:41:22,  2.93s/it]

{'loss': 0.2265, 'grad_norm': 62.24797058105469, 'learning_rate': 1.4678736244231453e-06, 'epoch': 2.12}


 71%|███████   | 7970/11268 [6:07:50<2:36:22,  2.85s/it]

{'loss': 0.4568, 'grad_norm': 14.371028900146484, 'learning_rate': 1.4634362797302096e-06, 'epoch': 2.12}


 71%|███████   | 7980/11268 [6:08:19<2:35:10,  2.83s/it]

{'loss': 0.2947, 'grad_norm': 27.533344268798828, 'learning_rate': 1.4589989350372739e-06, 'epoch': 2.12}


 71%|███████   | 7990/11268 [6:08:49<2:38:58,  2.91s/it]

{'loss': 0.3385, 'grad_norm': 73.42343139648438, 'learning_rate': 1.454561590344338e-06, 'epoch': 2.13}


 71%|███████   | 8000/11268 [6:09:19<2:36:09,  2.87s/it]

{'loss': 0.4446, 'grad_norm': 19.754472732543945, 'learning_rate': 1.4501242456514023e-06, 'epoch': 2.13}


 71%|███████   | 8010/11268 [6:09:46<2:28:14,  2.73s/it]

{'loss': 0.4168, 'grad_norm': 64.18115997314453, 'learning_rate': 1.4456869009584664e-06, 'epoch': 2.13}


 71%|███████   | 8020/11268 [6:10:14<2:25:01,  2.68s/it]

{'loss': 0.3474, 'grad_norm': 21.499650955200195, 'learning_rate': 1.4412495562655307e-06, 'epoch': 2.14}


 71%|███████▏  | 8030/11268 [6:10:41<2:27:02,  2.72s/it]

{'loss': 0.1895, 'grad_norm': 32.37458419799805, 'learning_rate': 1.4368122115725952e-06, 'epoch': 2.14}


 71%|███████▏  | 8040/11268 [6:11:09<2:29:46,  2.78s/it]

{'loss': 0.2208, 'grad_norm': 32.375526428222656, 'learning_rate': 1.4323748668796595e-06, 'epoch': 2.14}


 71%|███████▏  | 8050/11268 [6:11:37<2:24:41,  2.70s/it]

{'loss': 0.4518, 'grad_norm': 33.109500885009766, 'learning_rate': 1.4279375221867236e-06, 'epoch': 2.14}


 72%|███████▏  | 8060/11268 [6:12:04<2:24:41,  2.71s/it]

{'loss': 0.1687, 'grad_norm': 1.6406779289245605, 'learning_rate': 1.423500177493788e-06, 'epoch': 2.15}


 72%|███████▏  | 8070/11268 [6:12:31<2:28:30,  2.79s/it]

{'loss': 0.2228, 'grad_norm': 63.71483612060547, 'learning_rate': 1.419062832800852e-06, 'epoch': 2.15}


 72%|███████▏  | 8080/11268 [6:12:59<2:33:58,  2.90s/it]

{'loss': 0.437, 'grad_norm': 9.45456314086914, 'learning_rate': 1.4146254881079163e-06, 'epoch': 2.15}


 72%|███████▏  | 8090/11268 [6:13:26<2:22:53,  2.70s/it]

{'loss': 0.2974, 'grad_norm': 30.26764488220215, 'learning_rate': 1.4101881434149806e-06, 'epoch': 2.15}


 72%|███████▏  | 8100/11268 [6:13:55<2:26:48,  2.78s/it]

{'loss': 0.3065, 'grad_norm': 1.237131118774414, 'learning_rate': 1.4057507987220447e-06, 'epoch': 2.16}


 72%|███████▏  | 8110/11268 [6:14:22<2:24:41,  2.75s/it]

{'loss': 0.2004, 'grad_norm': 42.559425354003906, 'learning_rate': 1.401313454029109e-06, 'epoch': 2.16}


 72%|███████▏  | 8120/11268 [6:14:50<2:25:46,  2.78s/it]

{'loss': 0.5586, 'grad_norm': 80.3755111694336, 'learning_rate': 1.3968761093361732e-06, 'epoch': 2.16}


 72%|███████▏  | 8130/11268 [6:15:19<2:29:21,  2.86s/it]

{'loss': 0.3076, 'grad_norm': 57.830501556396484, 'learning_rate': 1.3924387646432377e-06, 'epoch': 2.16}


 72%|███████▏  | 8140/11268 [6:15:47<2:29:58,  2.88s/it]

{'loss': 0.4747, 'grad_norm': 58.93600845336914, 'learning_rate': 1.388001419950302e-06, 'epoch': 2.17}


 72%|███████▏  | 8150/11268 [6:16:14<2:18:25,  2.66s/it]

{'loss': 0.4512, 'grad_norm': 81.73784637451172, 'learning_rate': 1.383564075257366e-06, 'epoch': 2.17}


 72%|███████▏  | 8160/11268 [6:16:42<2:25:46,  2.81s/it]

{'loss': 0.2375, 'grad_norm': 29.510807037353516, 'learning_rate': 1.3791267305644304e-06, 'epoch': 2.17}


 73%|███████▎  | 8170/11268 [6:17:12<2:29:51,  2.90s/it]

{'loss': 0.2681, 'grad_norm': 60.39370346069336, 'learning_rate': 1.3746893858714947e-06, 'epoch': 2.18}


 73%|███████▎  | 8180/11268 [6:17:39<2:20:30,  2.73s/it]

{'loss': 0.3148, 'grad_norm': 44.198333740234375, 'learning_rate': 1.3702520411785588e-06, 'epoch': 2.18}


 73%|███████▎  | 8190/11268 [6:18:07<2:18:57,  2.71s/it]

{'loss': 0.3967, 'grad_norm': 57.9879264831543, 'learning_rate': 1.3658146964856231e-06, 'epoch': 2.18}


 73%|███████▎  | 8200/11268 [6:18:34<2:16:47,  2.68s/it]

{'loss': 0.2486, 'grad_norm': 51.035762786865234, 'learning_rate': 1.3613773517926874e-06, 'epoch': 2.18}


 73%|███████▎  | 8210/11268 [6:19:01<2:25:25,  2.85s/it]

{'loss': 0.429, 'grad_norm': 135.99371337890625, 'learning_rate': 1.3569400070997515e-06, 'epoch': 2.19}


 73%|███████▎  | 8220/11268 [6:19:29<2:20:26,  2.76s/it]

{'loss': 0.2672, 'grad_norm': 23.029678344726562, 'learning_rate': 1.3525026624068158e-06, 'epoch': 2.19}


 73%|███████▎  | 8230/11268 [6:19:57<2:26:31,  2.89s/it]

{'loss': 0.4306, 'grad_norm': 61.28646469116211, 'learning_rate': 1.34806531771388e-06, 'epoch': 2.19}


 73%|███████▎  | 8240/11268 [6:20:26<2:27:28,  2.92s/it]

{'loss': 0.5059, 'grad_norm': 46.95869445800781, 'learning_rate': 1.3436279730209445e-06, 'epoch': 2.19}


 73%|███████▎  | 8250/11268 [6:20:54<2:23:17,  2.85s/it]

{'loss': 0.4466, 'grad_norm': 76.78046417236328, 'learning_rate': 1.3391906283280088e-06, 'epoch': 2.2}


 73%|███████▎  | 8260/11268 [6:21:22<2:15:34,  2.70s/it]

{'loss': 0.3715, 'grad_norm': 22.82947540283203, 'learning_rate': 1.3347532836350729e-06, 'epoch': 2.2}


 73%|███████▎  | 8270/11268 [6:21:49<2:14:47,  2.70s/it]

{'loss': 0.4125, 'grad_norm': 5.087149620056152, 'learning_rate': 1.3303159389421372e-06, 'epoch': 2.2}


 73%|███████▎  | 8280/11268 [6:22:17<2:17:47,  2.77s/it]

{'loss': 0.3961, 'grad_norm': 13.68127727508545, 'learning_rate': 1.3258785942492015e-06, 'epoch': 2.2}


 74%|███████▎  | 8290/11268 [6:22:44<2:11:17,  2.65s/it]

{'loss': 0.3209, 'grad_norm': 54.94505310058594, 'learning_rate': 1.3214412495562656e-06, 'epoch': 2.21}


 74%|███████▎  | 8300/11268 [6:23:12<2:16:03,  2.75s/it]

{'loss': 0.5213, 'grad_norm': 20.320938110351562, 'learning_rate': 1.3170039048633299e-06, 'epoch': 2.21}


 74%|███████▎  | 8310/11268 [6:23:42<2:25:09,  2.94s/it]

{'loss': 0.261, 'grad_norm': 70.37028503417969, 'learning_rate': 1.312566560170394e-06, 'epoch': 2.21}


 74%|███████▍  | 8320/11268 [6:24:10<2:22:03,  2.89s/it]

{'loss': 0.4304, 'grad_norm': 60.987300872802734, 'learning_rate': 1.3081292154774583e-06, 'epoch': 2.22}


 74%|███████▍  | 8330/11268 [6:24:39<2:17:56,  2.82s/it]

{'loss': 0.4544, 'grad_norm': 54.544456481933594, 'learning_rate': 1.3036918707845226e-06, 'epoch': 2.22}


 74%|███████▍  | 8340/11268 [6:25:06<2:13:00,  2.73s/it]

{'loss': 0.5723, 'grad_norm': 41.2563362121582, 'learning_rate': 1.2992545260915871e-06, 'epoch': 2.22}


 74%|███████▍  | 8350/11268 [6:25:33<2:10:20,  2.68s/it]

{'loss': 0.3617, 'grad_norm': 95.9351806640625, 'learning_rate': 1.2948171813986512e-06, 'epoch': 2.22}


 74%|███████▍  | 8360/11268 [6:26:00<2:13:26,  2.75s/it]

{'loss': 0.2618, 'grad_norm': 50.362369537353516, 'learning_rate': 1.2903798367057155e-06, 'epoch': 2.23}


 74%|███████▍  | 8370/11268 [6:26:28<2:09:58,  2.69s/it]

{'loss': 0.3082, 'grad_norm': 40.291202545166016, 'learning_rate': 1.2859424920127796e-06, 'epoch': 2.23}


 74%|███████▍  | 8380/11268 [6:26:55<2:08:48,  2.68s/it]

{'loss': 0.2968, 'grad_norm': 21.96148109436035, 'learning_rate': 1.281505147319844e-06, 'epoch': 2.23}


 74%|███████▍  | 8390/11268 [6:27:23<2:18:37,  2.89s/it]

{'loss': 0.3989, 'grad_norm': 10.188529014587402, 'learning_rate': 1.2770678026269083e-06, 'epoch': 2.23}


 75%|███████▍  | 8400/11268 [6:27:52<2:13:49,  2.80s/it]

{'loss': 0.4309, 'grad_norm': 45.57920455932617, 'learning_rate': 1.2726304579339724e-06, 'epoch': 2.24}


 75%|███████▍  | 8410/11268 [6:28:19<2:10:31,  2.74s/it]

{'loss': 0.2811, 'grad_norm': 4.269969940185547, 'learning_rate': 1.2681931132410367e-06, 'epoch': 2.24}


 75%|███████▍  | 8420/11268 [6:28:47<2:11:05,  2.76s/it]

{'loss': 0.2658, 'grad_norm': 70.66199493408203, 'learning_rate': 1.2637557685481008e-06, 'epoch': 2.24}


 75%|███████▍  | 8430/11268 [6:29:14<2:09:55,  2.75s/it]

{'loss': 0.3379, 'grad_norm': 52.74956130981445, 'learning_rate': 1.259318423855165e-06, 'epoch': 2.24}


 75%|███████▍  | 8440/11268 [6:29:43<2:09:19,  2.74s/it]

{'loss': 0.3653, 'grad_norm': 5.084788799285889, 'learning_rate': 1.2548810791622294e-06, 'epoch': 2.25}


 75%|███████▍  | 8450/11268 [6:30:12<2:19:48,  2.98s/it]

{'loss': 0.2167, 'grad_norm': 121.50921630859375, 'learning_rate': 1.2504437344692937e-06, 'epoch': 2.25}


 75%|███████▌  | 8460/11268 [6:30:40<2:09:11,  2.76s/it]

{'loss': 0.3323, 'grad_norm': 60.10844039916992, 'learning_rate': 1.2460063897763578e-06, 'epoch': 2.25}


 75%|███████▌  | 8470/11268 [6:31:07<2:07:19,  2.73s/it]

{'loss': 0.2778, 'grad_norm': 6.163202285766602, 'learning_rate': 1.2415690450834223e-06, 'epoch': 2.26}


 75%|███████▌  | 8480/11268 [6:31:35<2:11:48,  2.84s/it]

{'loss': 0.4383, 'grad_norm': 112.34961700439453, 'learning_rate': 1.2371317003904864e-06, 'epoch': 2.26}


 75%|███████▌  | 8490/11268 [6:32:04<2:08:10,  2.77s/it]

{'loss': 0.418, 'grad_norm': 12.736183166503906, 'learning_rate': 1.2326943556975507e-06, 'epoch': 2.26}


 75%|███████▌  | 8500/11268 [6:32:31<2:02:25,  2.65s/it]

{'loss': 0.1804, 'grad_norm': 8.21741771697998, 'learning_rate': 1.2282570110046148e-06, 'epoch': 2.26}


 76%|███████▌  | 8510/11268 [6:32:57<2:02:43,  2.67s/it]

{'loss': 0.4049, 'grad_norm': 41.37981033325195, 'learning_rate': 1.2238196663116791e-06, 'epoch': 2.27}


 76%|███████▌  | 8520/11268 [6:33:24<2:03:33,  2.70s/it]

{'loss': 0.2615, 'grad_norm': 64.41077423095703, 'learning_rate': 1.2193823216187434e-06, 'epoch': 2.27}


 76%|███████▌  | 8530/11268 [6:33:51<2:02:34,  2.69s/it]

{'loss': 0.3994, 'grad_norm': 0.2976609170436859, 'learning_rate': 1.2149449769258077e-06, 'epoch': 2.27}


 76%|███████▌  | 8540/11268 [6:34:18<2:02:31,  2.69s/it]

{'loss': 0.4818, 'grad_norm': 27.00863265991211, 'learning_rate': 1.210507632232872e-06, 'epoch': 2.27}


 76%|███████▌  | 8550/11268 [6:34:44<1:55:23,  2.55s/it]

{'loss': 0.2352, 'grad_norm': 1.7631244659423828, 'learning_rate': 1.2060702875399362e-06, 'epoch': 2.28}


 76%|███████▌  | 8560/11268 [6:35:11<1:58:49,  2.63s/it]

{'loss': 0.4692, 'grad_norm': 76.33423614501953, 'learning_rate': 1.2016329428470005e-06, 'epoch': 2.28}


 76%|███████▌  | 8570/11268 [6:35:38<2:01:47,  2.71s/it]

{'loss': 0.1781, 'grad_norm': 38.88909912109375, 'learning_rate': 1.1971955981540646e-06, 'epoch': 2.28}


 76%|███████▌  | 8580/11268 [6:36:05<2:01:49,  2.72s/it]

{'loss': 0.3678, 'grad_norm': 39.53713607788086, 'learning_rate': 1.192758253461129e-06, 'epoch': 2.28}


 76%|███████▌  | 8590/11268 [6:36:33<1:59:48,  2.68s/it]

{'loss': 0.3092, 'grad_norm': 81.74105834960938, 'learning_rate': 1.1883209087681932e-06, 'epoch': 2.29}


 76%|███████▋  | 8600/11268 [6:36:59<1:58:55,  2.67s/it]

{'loss': 0.2128, 'grad_norm': 1.307457447052002, 'learning_rate': 1.1838835640752575e-06, 'epoch': 2.29}


 76%|███████▋  | 8610/11268 [6:37:26<1:56:50,  2.64s/it]

{'loss': 0.3453, 'grad_norm': 54.59809494018555, 'learning_rate': 1.1794462193823216e-06, 'epoch': 2.29}


 76%|███████▋  | 8620/11268 [6:37:52<1:54:36,  2.60s/it]

{'loss': 0.3488, 'grad_norm': 65.46979522705078, 'learning_rate': 1.175008874689386e-06, 'epoch': 2.29}


 77%|███████▋  | 8630/11268 [6:38:19<1:57:11,  2.67s/it]

{'loss': 0.3413, 'grad_norm': 166.42062377929688, 'learning_rate': 1.1705715299964502e-06, 'epoch': 2.3}


 77%|███████▋  | 8640/11268 [6:38:45<1:56:02,  2.65s/it]

{'loss': 0.4634, 'grad_norm': 25.346879959106445, 'learning_rate': 1.1661341853035145e-06, 'epoch': 2.3}


 77%|███████▋  | 8650/11268 [6:39:12<1:57:34,  2.69s/it]

{'loss': 0.2361, 'grad_norm': 6.274576187133789, 'learning_rate': 1.1616968406105786e-06, 'epoch': 2.3}


 77%|███████▋  | 8660/11268 [6:39:39<1:56:22,  2.68s/it]

{'loss': 0.1937, 'grad_norm': 37.43172836303711, 'learning_rate': 1.157259495917643e-06, 'epoch': 2.31}


 77%|███████▋  | 8670/11268 [6:40:06<1:55:38,  2.67s/it]

{'loss': 0.4039, 'grad_norm': 36.06031036376953, 'learning_rate': 1.1528221512247072e-06, 'epoch': 2.31}


 77%|███████▋  | 8680/11268 [6:40:33<1:54:38,  2.66s/it]

{'loss': 0.1055, 'grad_norm': 22.266021728515625, 'learning_rate': 1.1483848065317716e-06, 'epoch': 2.31}


 77%|███████▋  | 8690/11268 [6:41:00<1:55:39,  2.69s/it]

{'loss': 0.2927, 'grad_norm': 16.590906143188477, 'learning_rate': 1.1439474618388359e-06, 'epoch': 2.31}


 77%|███████▋  | 8700/11268 [6:41:28<1:55:53,  2.71s/it]

{'loss': 0.3186, 'grad_norm': 59.96759796142578, 'learning_rate': 1.1395101171459e-06, 'epoch': 2.32}


 77%|███████▋  | 8710/11268 [6:41:54<1:53:36,  2.66s/it]

{'loss': 0.3535, 'grad_norm': 41.1589241027832, 'learning_rate': 1.1350727724529643e-06, 'epoch': 2.32}


 77%|███████▋  | 8720/11268 [6:42:21<1:54:02,  2.69s/it]

{'loss': 0.3277, 'grad_norm': 73.5610122680664, 'learning_rate': 1.1306354277600284e-06, 'epoch': 2.32}


 77%|███████▋  | 8730/11268 [6:42:48<1:53:08,  2.67s/it]

{'loss': 0.1857, 'grad_norm': 171.79421997070312, 'learning_rate': 1.1261980830670929e-06, 'epoch': 2.32}


 78%|███████▊  | 8740/11268 [6:43:14<1:51:52,  2.66s/it]

{'loss': 0.1792, 'grad_norm': 0.38770583271980286, 'learning_rate': 1.121760738374157e-06, 'epoch': 2.33}


 78%|███████▊  | 8750/11268 [6:43:41<1:50:24,  2.63s/it]

{'loss': 0.4184, 'grad_norm': 0.4586333930492401, 'learning_rate': 1.1173233936812213e-06, 'epoch': 2.33}


 78%|███████▊  | 8760/11268 [6:44:07<1:51:49,  2.68s/it]

{'loss': 0.3396, 'grad_norm': 26.98359489440918, 'learning_rate': 1.1128860489882854e-06, 'epoch': 2.33}


 78%|███████▊  | 8770/11268 [6:44:34<1:49:53,  2.64s/it]

{'loss': 0.3565, 'grad_norm': 150.01803588867188, 'learning_rate': 1.1084487042953497e-06, 'epoch': 2.33}


 78%|███████▊  | 8780/11268 [6:45:01<1:50:26,  2.66s/it]

{'loss': 0.3788, 'grad_norm': 48.533321380615234, 'learning_rate': 1.104011359602414e-06, 'epoch': 2.34}


 78%|███████▊  | 8790/11268 [6:45:28<1:50:37,  2.68s/it]

{'loss': 0.1593, 'grad_norm': 15.197497367858887, 'learning_rate': 1.0995740149094783e-06, 'epoch': 2.34}


 78%|███████▊  | 8800/11268 [6:45:55<1:50:03,  2.68s/it]

{'loss': 0.339, 'grad_norm': 49.83061218261719, 'learning_rate': 1.0951366702165424e-06, 'epoch': 2.34}


 78%|███████▊  | 8810/11268 [6:46:22<1:50:08,  2.69s/it]

{'loss': 0.2012, 'grad_norm': 4.998590469360352, 'learning_rate': 1.0906993255236067e-06, 'epoch': 2.35}


 78%|███████▊  | 8820/11268 [6:46:49<1:50:46,  2.72s/it]

{'loss': 0.4908, 'grad_norm': 59.11191940307617, 'learning_rate': 1.086261980830671e-06, 'epoch': 2.35}


 78%|███████▊  | 8830/11268 [6:47:16<1:48:28,  2.67s/it]

{'loss': 0.4135, 'grad_norm': 97.33016204833984, 'learning_rate': 1.0818246361377354e-06, 'epoch': 2.35}


 78%|███████▊  | 8840/11268 [6:47:43<1:48:11,  2.67s/it]

{'loss': 0.2891, 'grad_norm': 16.779008865356445, 'learning_rate': 1.0773872914447995e-06, 'epoch': 2.35}


 79%|███████▊  | 8850/11268 [6:48:10<1:47:05,  2.66s/it]

{'loss': 0.4171, 'grad_norm': 16.85049819946289, 'learning_rate': 1.0729499467518638e-06, 'epoch': 2.36}


 79%|███████▊  | 8860/11268 [6:48:38<1:51:13,  2.77s/it]

{'loss': 0.0782, 'grad_norm': 12.636563301086426, 'learning_rate': 1.068512602058928e-06, 'epoch': 2.36}


 79%|███████▊  | 8870/11268 [6:49:04<1:46:15,  2.66s/it]

{'loss': 0.526, 'grad_norm': 66.75408172607422, 'learning_rate': 1.0640752573659922e-06, 'epoch': 2.36}


 79%|███████▉  | 8880/11268 [6:49:31<1:45:08,  2.64s/it]

{'loss': 0.3373, 'grad_norm': 119.18498992919922, 'learning_rate': 1.0596379126730565e-06, 'epoch': 2.36}


 79%|███████▉  | 8890/11268 [6:49:57<1:45:27,  2.66s/it]

{'loss': 0.4328, 'grad_norm': 57.00706100463867, 'learning_rate': 1.0552005679801208e-06, 'epoch': 2.37}


 79%|███████▉  | 8900/11268 [6:50:24<1:45:08,  2.66s/it]

{'loss': 0.3077, 'grad_norm': 10.878923416137695, 'learning_rate': 1.050763223287185e-06, 'epoch': 2.37}


 79%|███████▉  | 8910/11268 [6:50:52<1:46:44,  2.72s/it]

{'loss': 0.2148, 'grad_norm': 35.98958206176758, 'learning_rate': 1.0463258785942492e-06, 'epoch': 2.37}


 79%|███████▉  | 8920/11268 [6:51:19<1:43:40,  2.65s/it]

{'loss': 0.3278, 'grad_norm': 36.568267822265625, 'learning_rate': 1.0418885339013135e-06, 'epoch': 2.37}


 79%|███████▉  | 8930/11268 [6:51:45<1:44:31,  2.68s/it]

{'loss': 0.1513, 'grad_norm': 1.899151086807251, 'learning_rate': 1.0374511892083778e-06, 'epoch': 2.38}


 79%|███████▉  | 8940/11268 [6:52:12<1:45:21,  2.72s/it]

{'loss': 0.3151, 'grad_norm': 6.037137985229492, 'learning_rate': 1.0330138445154421e-06, 'epoch': 2.38}


 79%|███████▉  | 8950/11268 [6:52:40<1:44:03,  2.69s/it]

{'loss': 0.2968, 'grad_norm': 90.53223419189453, 'learning_rate': 1.0285764998225062e-06, 'epoch': 2.38}


 80%|███████▉  | 8960/11268 [6:53:07<1:46:44,  2.77s/it]

{'loss': 0.4029, 'grad_norm': 40.93388748168945, 'learning_rate': 1.0241391551295705e-06, 'epoch': 2.39}


 80%|███████▉  | 8970/11268 [6:53:34<1:44:43,  2.73s/it]

{'loss': 0.2149, 'grad_norm': 7.770800590515137, 'learning_rate': 1.0197018104366349e-06, 'epoch': 2.39}


 80%|███████▉  | 8980/11268 [6:54:02<1:44:17,  2.74s/it]

{'loss': 0.3083, 'grad_norm': 65.45437622070312, 'learning_rate': 1.015264465743699e-06, 'epoch': 2.39}


 80%|███████▉  | 8990/11268 [6:54:29<1:45:26,  2.78s/it]

{'loss': 0.33, 'grad_norm': 6.447240829467773, 'learning_rate': 1.0108271210507633e-06, 'epoch': 2.39}


 80%|███████▉  | 9000/11268 [6:54:57<1:42:18,  2.71s/it]

{'loss': 0.2873, 'grad_norm': 7.17038106918335, 'learning_rate': 1.0063897763578276e-06, 'epoch': 2.4}


 80%|███████▉  | 9010/11268 [6:55:24<1:42:51,  2.73s/it]

{'loss': 0.435, 'grad_norm': 79.7839584350586, 'learning_rate': 1.0019524316648919e-06, 'epoch': 2.4}


 80%|████████  | 9020/11268 [6:55:53<1:47:06,  2.86s/it]

{'loss': 0.3136, 'grad_norm': 37.716487884521484, 'learning_rate': 9.97515086971956e-07, 'epoch': 2.4}


 80%|████████  | 9030/11268 [6:56:22<1:46:29,  2.86s/it]

{'loss': 0.4376, 'grad_norm': 6.039687633514404, 'learning_rate': 9.930777422790203e-07, 'epoch': 2.4}


 80%|████████  | 9040/11268 [6:56:50<1:43:53,  2.80s/it]

{'loss': 0.4761, 'grad_norm': 25.577791213989258, 'learning_rate': 9.886403975860846e-07, 'epoch': 2.41}


 80%|████████  | 9050/11268 [6:57:17<1:40:04,  2.71s/it]

{'loss': 0.2236, 'grad_norm': 52.851806640625, 'learning_rate': 9.84203052893149e-07, 'epoch': 2.41}


 80%|████████  | 9060/11268 [6:57:44<1:39:46,  2.71s/it]

{'loss': 0.2659, 'grad_norm': 80.6332778930664, 'learning_rate': 9.79765708200213e-07, 'epoch': 2.41}


 80%|████████  | 9070/11268 [6:58:11<1:36:55,  2.65s/it]

{'loss': 0.3181, 'grad_norm': 38.83652114868164, 'learning_rate': 9.753283635072773e-07, 'epoch': 2.41}


 81%|████████  | 9080/11268 [6:58:37<1:35:35,  2.62s/it]

{'loss': 0.2794, 'grad_norm': 0.8906875252723694, 'learning_rate': 9.708910188143416e-07, 'epoch': 2.42}


 81%|████████  | 9090/11268 [6:59:04<1:35:28,  2.63s/it]

{'loss': 0.2209, 'grad_norm': 116.91139221191406, 'learning_rate': 9.66453674121406e-07, 'epoch': 2.42}


 81%|████████  | 9100/11268 [6:59:31<1:37:18,  2.69s/it]

{'loss': 0.2559, 'grad_norm': 14.292043685913086, 'learning_rate': 9.6201632942847e-07, 'epoch': 2.42}


 81%|████████  | 9110/11268 [6:59:58<1:35:17,  2.65s/it]

{'loss': 0.2744, 'grad_norm': 67.68082427978516, 'learning_rate': 9.575789847355343e-07, 'epoch': 2.43}


 81%|████████  | 9120/11268 [7:00:24<1:35:41,  2.67s/it]

{'loss': 0.2239, 'grad_norm': 12.612003326416016, 'learning_rate': 9.531416400425985e-07, 'epoch': 2.43}


 81%|████████  | 9130/11268 [7:00:52<1:40:46,  2.83s/it]

{'loss': 0.3452, 'grad_norm': 1.7973004579544067, 'learning_rate': 9.487042953496628e-07, 'epoch': 2.43}


 81%|████████  | 9140/11268 [7:01:19<1:35:23,  2.69s/it]

{'loss': 0.4021, 'grad_norm': 0.9656375646591187, 'learning_rate': 9.442669506567272e-07, 'epoch': 2.43}


 81%|████████  | 9150/11268 [7:01:46<1:32:56,  2.63s/it]

{'loss': 0.3442, 'grad_norm': 3.358820915222168, 'learning_rate': 9.398296059637914e-07, 'epoch': 2.44}


 81%|████████▏ | 9160/11268 [7:02:14<1:37:13,  2.77s/it]

{'loss': 0.2391, 'grad_norm': 100.93452453613281, 'learning_rate': 9.353922612708556e-07, 'epoch': 2.44}


 81%|████████▏ | 9170/11268 [7:02:42<1:33:51,  2.68s/it]

{'loss': 0.291, 'grad_norm': 117.56649017333984, 'learning_rate': 9.309549165779199e-07, 'epoch': 2.44}


 81%|████████▏ | 9180/11268 [7:03:09<1:35:15,  2.74s/it]

{'loss': 0.6248, 'grad_norm': 56.34371566772461, 'learning_rate': 9.265175718849841e-07, 'epoch': 2.44}


 82%|████████▏ | 9190/11268 [7:03:36<1:34:14,  2.72s/it]

{'loss': 0.2366, 'grad_norm': 64.37759399414062, 'learning_rate': 9.220802271920483e-07, 'epoch': 2.45}


 82%|████████▏ | 9200/11268 [7:04:03<1:31:32,  2.66s/it]

{'loss': 0.4483, 'grad_norm': 94.62747955322266, 'learning_rate': 9.176428824991126e-07, 'epoch': 2.45}


 82%|████████▏ | 9210/11268 [7:04:29<1:30:51,  2.65s/it]

{'loss': 0.2189, 'grad_norm': 99.13091278076172, 'learning_rate': 9.132055378061769e-07, 'epoch': 2.45}


 82%|████████▏ | 9220/11268 [7:04:55<1:27:04,  2.55s/it]

{'loss': 0.1891, 'grad_norm': 68.71235656738281, 'learning_rate': 9.087681931132411e-07, 'epoch': 2.45}


 82%|████████▏ | 9230/11268 [7:05:20<1:25:49,  2.53s/it]

{'loss': 0.3748, 'grad_norm': 6.734892845153809, 'learning_rate': 9.043308484203053e-07, 'epoch': 2.46}


 82%|████████▏ | 9240/11268 [7:05:46<1:28:34,  2.62s/it]

{'loss': 0.5959, 'grad_norm': 119.26941680908203, 'learning_rate': 8.998935037273695e-07, 'epoch': 2.46}


 82%|████████▏ | 9250/11268 [7:06:12<1:27:41,  2.61s/it]

{'loss': 0.3791, 'grad_norm': 49.0540657043457, 'learning_rate': 8.954561590344339e-07, 'epoch': 2.46}


 82%|████████▏ | 9260/11268 [7:06:38<1:29:43,  2.68s/it]

{'loss': 0.2666, 'grad_norm': 17.25588035583496, 'learning_rate': 8.910188143414981e-07, 'epoch': 2.47}


 82%|████████▏ | 9270/11268 [7:07:06<1:30:06,  2.71s/it]

{'loss': 0.3117, 'grad_norm': 194.16539001464844, 'learning_rate': 8.865814696485624e-07, 'epoch': 2.47}


 82%|████████▏ | 9280/11268 [7:07:33<1:29:43,  2.71s/it]

{'loss': 0.3272, 'grad_norm': 51.76094436645508, 'learning_rate': 8.821441249556266e-07, 'epoch': 2.47}


 82%|████████▏ | 9290/11268 [7:07:59<1:27:38,  2.66s/it]

{'loss': 0.279, 'grad_norm': 56.34033203125, 'learning_rate': 8.777067802626909e-07, 'epoch': 2.47}


 83%|████████▎ | 9300/11268 [7:08:26<1:25:54,  2.62s/it]

{'loss': 0.2763, 'grad_norm': 2.9528307914733887, 'learning_rate': 8.732694355697552e-07, 'epoch': 2.48}


 83%|████████▎ | 9310/11268 [7:08:52<1:24:21,  2.59s/it]

{'loss': 0.4771, 'grad_norm': 46.18729782104492, 'learning_rate': 8.688320908768194e-07, 'epoch': 2.48}


 83%|████████▎ | 9320/11268 [7:09:18<1:23:31,  2.57s/it]

{'loss': 0.4361, 'grad_norm': 31.736501693725586, 'learning_rate': 8.643947461838837e-07, 'epoch': 2.48}


 83%|████████▎ | 9330/11268 [7:09:43<1:22:13,  2.55s/it]

{'loss': 0.3094, 'grad_norm': 73.28038787841797, 'learning_rate': 8.599574014909479e-07, 'epoch': 2.48}


 83%|████████▎ | 9340/11268 [7:10:09<1:23:17,  2.59s/it]

{'loss': 0.275, 'grad_norm': 2.6036853790283203, 'learning_rate': 8.555200567980121e-07, 'epoch': 2.49}


 83%|████████▎ | 9350/11268 [7:10:35<1:23:46,  2.62s/it]

{'loss': 0.2206, 'grad_norm': 93.02667999267578, 'learning_rate': 8.510827121050764e-07, 'epoch': 2.49}


 83%|████████▎ | 9360/11268 [7:11:03<1:28:16,  2.78s/it]

{'loss': 0.2628, 'grad_norm': 87.68504333496094, 'learning_rate': 8.466453674121407e-07, 'epoch': 2.49}


 83%|████████▎ | 9370/11268 [7:11:30<1:24:16,  2.66s/it]

{'loss': 0.368, 'grad_norm': 93.88429260253906, 'learning_rate': 8.422080227192049e-07, 'epoch': 2.49}


 83%|████████▎ | 9380/11268 [7:11:56<1:22:25,  2.62s/it]

{'loss': 0.33, 'grad_norm': 3.50577449798584, 'learning_rate': 8.377706780262691e-07, 'epoch': 2.5}


 83%|████████▎ | 9390/11268 [7:12:22<1:20:56,  2.59s/it]

{'loss': 0.5373, 'grad_norm': 68.62669372558594, 'learning_rate': 8.333333333333333e-07, 'epoch': 2.5}


 83%|████████▎ | 9400/11268 [7:12:49<1:24:37,  2.72s/it]

{'loss': 0.3451, 'grad_norm': 20.60932159423828, 'learning_rate': 8.288959886403977e-07, 'epoch': 2.5}


 84%|████████▎ | 9410/11268 [7:13:15<1:20:30,  2.60s/it]

{'loss': 0.2337, 'grad_norm': 82.46580505371094, 'learning_rate': 8.24458643947462e-07, 'epoch': 2.51}


 84%|████████▎ | 9420/11268 [7:13:42<1:22:25,  2.68s/it]

{'loss': 0.4516, 'grad_norm': 61.24949645996094, 'learning_rate': 8.200212992545262e-07, 'epoch': 2.51}


 84%|████████▎ | 9430/11268 [7:14:09<1:21:06,  2.65s/it]

{'loss': 0.2128, 'grad_norm': 12.209386825561523, 'learning_rate': 8.155839545615904e-07, 'epoch': 2.51}


 84%|████████▍ | 9440/11268 [7:14:34<1:17:44,  2.55s/it]

{'loss': 0.5264, 'grad_norm': 84.36724090576172, 'learning_rate': 8.111466098686547e-07, 'epoch': 2.51}


 84%|████████▍ | 9450/11268 [7:15:00<1:18:00,  2.57s/it]

{'loss': 0.492, 'grad_norm': 62.201927185058594, 'learning_rate': 8.067092651757189e-07, 'epoch': 2.52}


 84%|████████▍ | 9460/11268 [7:15:28<1:23:10,  2.76s/it]

{'loss': 0.3594, 'grad_norm': 6.080994129180908, 'learning_rate': 8.022719204827832e-07, 'epoch': 2.52}


 84%|████████▍ | 9470/11268 [7:15:55<1:25:51,  2.86s/it]

{'loss': 0.3427, 'grad_norm': 35.73909378051758, 'learning_rate': 7.978345757898474e-07, 'epoch': 2.52}


 84%|████████▍ | 9480/11268 [7:16:24<1:22:01,  2.75s/it]

{'loss': 0.3352, 'grad_norm': 22.564987182617188, 'learning_rate': 7.933972310969117e-07, 'epoch': 2.52}


 84%|████████▍ | 9490/11268 [7:16:51<1:21:58,  2.77s/it]

{'loss': 0.2974, 'grad_norm': 68.36481475830078, 'learning_rate': 7.889598864039759e-07, 'epoch': 2.53}


 84%|████████▍ | 9500/11268 [7:17:18<1:16:38,  2.60s/it]

{'loss': 0.3419, 'grad_norm': 60.874752044677734, 'learning_rate': 7.845225417110401e-07, 'epoch': 2.53}


 84%|████████▍ | 9510/11268 [7:17:44<1:17:23,  2.64s/it]

{'loss': 0.2034, 'grad_norm': 140.6918182373047, 'learning_rate': 7.800851970181045e-07, 'epoch': 2.53}


 84%|████████▍ | 9520/11268 [7:18:11<1:17:12,  2.65s/it]

{'loss': 0.3991, 'grad_norm': 150.61904907226562, 'learning_rate': 7.756478523251687e-07, 'epoch': 2.53}


 85%|████████▍ | 9530/11268 [7:18:37<1:16:07,  2.63s/it]

{'loss': 0.2804, 'grad_norm': 0.20538708567619324, 'learning_rate': 7.712105076322329e-07, 'epoch': 2.54}


 85%|████████▍ | 9540/11268 [7:19:04<1:16:01,  2.64s/it]

{'loss': 0.3249, 'grad_norm': 40.67469787597656, 'learning_rate': 7.667731629392971e-07, 'epoch': 2.54}


 85%|████████▍ | 9550/11268 [7:19:30<1:16:11,  2.66s/it]

{'loss': 0.3817, 'grad_norm': 14.013001441955566, 'learning_rate': 7.623358182463613e-07, 'epoch': 2.54}


 85%|████████▍ | 9560/11268 [7:19:57<1:15:40,  2.66s/it]

{'loss': 0.4018, 'grad_norm': 21.24835777282715, 'learning_rate': 7.578984735534258e-07, 'epoch': 2.55}


 85%|████████▍ | 9570/11268 [7:20:23<1:14:10,  2.62s/it]

{'loss': 0.4157, 'grad_norm': 41.788124084472656, 'learning_rate': 7.5346112886049e-07, 'epoch': 2.55}


 85%|████████▌ | 9580/11268 [7:20:50<1:15:43,  2.69s/it]

{'loss': 0.1778, 'grad_norm': 45.847564697265625, 'learning_rate': 7.490237841675542e-07, 'epoch': 2.55}


 85%|████████▌ | 9590/11268 [7:21:18<1:19:07,  2.83s/it]

{'loss': 0.2743, 'grad_norm': 74.53057098388672, 'learning_rate': 7.445864394746185e-07, 'epoch': 2.55}


 85%|████████▌ | 9600/11268 [7:21:44<1:13:50,  2.66s/it]

{'loss': 0.3841, 'grad_norm': 7.452534198760986, 'learning_rate': 7.401490947816827e-07, 'epoch': 2.56}


 85%|████████▌ | 9610/11268 [7:22:12<1:13:59,  2.68s/it]

{'loss': 0.2768, 'grad_norm': 60.342926025390625, 'learning_rate': 7.35711750088747e-07, 'epoch': 2.56}


 85%|████████▌ | 9620/11268 [7:22:38<1:12:16,  2.63s/it]

{'loss': 0.2609, 'grad_norm': 10.100431442260742, 'learning_rate': 7.312744053958112e-07, 'epoch': 2.56}


 85%|████████▌ | 9630/11268 [7:29:45<39:22:56, 86.55s/it] 

{'loss': 0.3244, 'grad_norm': 53.0055046081543, 'learning_rate': 7.268370607028755e-07, 'epoch': 2.56}


 86%|████████▌ | 9640/11268 [7:30:02<1:49:43,  4.04s/it] 

{'loss': 0.1972, 'grad_norm': 85.27549743652344, 'learning_rate': 7.223997160099397e-07, 'epoch': 2.57}


 86%|████████▌ | 9650/11268 [7:30:23<1:08:27,  2.54s/it]

{'loss': 0.2893, 'grad_norm': 41.10486602783203, 'learning_rate': 7.179623713170039e-07, 'epoch': 2.57}


 86%|████████▌ | 9660/11268 [7:46:00<7:51:53, 17.61s/it]   

{'loss': 0.3454, 'grad_norm': 6.552296161651611, 'learning_rate': 7.135250266240683e-07, 'epoch': 2.57}


 86%|████████▌ | 9670/11268 [7:46:16<56:08,  2.11s/it]  

{'loss': 0.2313, 'grad_norm': 4.15765905380249, 'learning_rate': 7.090876819311325e-07, 'epoch': 2.57}


 86%|████████▌ | 9680/11268 [7:56:38<28:11:20, 63.90s/it] 

{'loss': 0.2318, 'grad_norm': 58.251922607421875, 'learning_rate': 7.046503372381967e-07, 'epoch': 2.58}


 86%|████████▌ | 9690/11268 [7:56:55<1:30:59,  3.46s/it] 

{'loss': 0.3679, 'grad_norm': 17.74720001220703, 'learning_rate': 7.002129925452609e-07, 'epoch': 2.58}


 86%|████████▌ | 9700/11268 [7:57:12<45:35,  1.74s/it]  

{'loss': 0.3076, 'grad_norm': 79.9828872680664, 'learning_rate': 6.957756478523251e-07, 'epoch': 2.58}


 86%|████████▌ | 9710/11268 [7:57:29<43:51,  1.69s/it]

{'loss': 0.3656, 'grad_norm': 63.478145599365234, 'learning_rate': 6.913383031593896e-07, 'epoch': 2.59}


 86%|████████▋ | 9720/11268 [7:57:46<43:22,  1.68s/it]

{'loss': 0.3312, 'grad_norm': 71.57747650146484, 'learning_rate': 6.869009584664538e-07, 'epoch': 2.59}


 86%|████████▋ | 9730/11268 [7:58:03<43:40,  1.70s/it]

{'loss': 0.1449, 'grad_norm': 23.598224639892578, 'learning_rate': 6.82463613773518e-07, 'epoch': 2.59}


 86%|████████▋ | 9740/11268 [7:58:20<44:16,  1.74s/it]

{'loss': 0.1938, 'grad_norm': 2.3239331245422363, 'learning_rate': 6.780262690805822e-07, 'epoch': 2.59}


 87%|████████▋ | 9750/11268 [7:58:38<44:36,  1.76s/it]

{'loss': 0.3242, 'grad_norm': 42.511444091796875, 'learning_rate': 6.735889243876465e-07, 'epoch': 2.6}


 87%|████████▋ | 9760/11268 [7:58:56<44:51,  1.78s/it]

{'loss': 0.3028, 'grad_norm': 105.04347229003906, 'learning_rate': 6.691515796947107e-07, 'epoch': 2.6}


 87%|████████▋ | 9770/11268 [7:59:14<44:55,  1.80s/it]

{'loss': 0.2319, 'grad_norm': 3.3141932487487793, 'learning_rate': 6.64714235001775e-07, 'epoch': 2.6}


 87%|████████▋ | 9780/11268 [7:59:32<45:08,  1.82s/it]

{'loss': 0.235, 'grad_norm': 158.89149475097656, 'learning_rate': 6.602768903088393e-07, 'epoch': 2.6}


 87%|████████▋ | 9790/11268 [7:59:51<49:54,  2.03s/it]

{'loss': 0.4767, 'grad_norm': 52.233211517333984, 'learning_rate': 6.558395456159035e-07, 'epoch': 2.61}


 87%|████████▋ | 9800/11268 [8:00:14<54:46,  2.24s/it]

{'loss': 0.2198, 'grad_norm': 70.42778015136719, 'learning_rate': 6.514022009229677e-07, 'epoch': 2.61}


 87%|████████▋ | 9810/11268 [8:00:37<56:09,  2.31s/it]

{'loss': 0.3508, 'grad_norm': 6.668198585510254, 'learning_rate': 6.469648562300319e-07, 'epoch': 2.61}


 87%|████████▋ | 9820/11268 [8:00:59<53:40,  2.22s/it]

{'loss': 0.2984, 'grad_norm': 62.85617446899414, 'learning_rate': 6.425275115370963e-07, 'epoch': 2.61}


 87%|████████▋ | 9830/11268 [8:01:21<52:05,  2.17s/it]

{'loss': 0.2377, 'grad_norm': 6.584071159362793, 'learning_rate': 6.380901668441605e-07, 'epoch': 2.62}


 87%|████████▋ | 9840/11268 [8:01:43<53:06,  2.23s/it]

{'loss': 0.3247, 'grad_norm': 101.82833099365234, 'learning_rate': 6.336528221512247e-07, 'epoch': 2.62}


 87%|████████▋ | 9850/11268 [8:02:06<52:44,  2.23s/it]

{'loss': 0.3492, 'grad_norm': 60.63193130493164, 'learning_rate': 6.292154774582889e-07, 'epoch': 2.62}


 88%|████████▊ | 9860/11268 [8:02:30<56:54,  2.42s/it]

{'loss': 0.4204, 'grad_norm': 51.09575653076172, 'learning_rate': 6.247781327653533e-07, 'epoch': 2.63}


 88%|████████▊ | 9870/11268 [8:02:53<52:34,  2.26s/it]

{'loss': 0.5067, 'grad_norm': 59.40119934082031, 'learning_rate': 6.203407880724175e-07, 'epoch': 2.63}


 88%|████████▊ | 9880/11268 [8:03:15<52:12,  2.26s/it]

{'loss': 0.4713, 'grad_norm': 16.005334854125977, 'learning_rate': 6.159034433794818e-07, 'epoch': 2.63}


 88%|████████▊ | 9890/11268 [8:03:38<51:55,  2.26s/it]

{'loss': 0.2721, 'grad_norm': 77.22711944580078, 'learning_rate': 6.11466098686546e-07, 'epoch': 2.63}


 88%|████████▊ | 9900/11268 [8:04:01<52:17,  2.29s/it]

{'loss': 0.3598, 'grad_norm': 42.7409782409668, 'learning_rate': 6.070287539936103e-07, 'epoch': 2.64}


 88%|████████▊ | 9910/11268 [8:04:24<52:23,  2.32s/it]

{'loss': 0.2663, 'grad_norm': 51.81867599487305, 'learning_rate': 6.025914093006746e-07, 'epoch': 2.64}


 88%|████████▊ | 9920/11268 [8:04:47<50:37,  2.25s/it]

{'loss': 0.3383, 'grad_norm': 61.61949920654297, 'learning_rate': 5.981540646077388e-07, 'epoch': 2.64}


 88%|████████▊ | 9930/11268 [8:05:09<50:43,  2.27s/it]

{'loss': 0.2935, 'grad_norm': 30.91911506652832, 'learning_rate': 5.937167199148031e-07, 'epoch': 2.64}


 88%|████████▊ | 9940/11268 [8:05:33<52:21,  2.37s/it]

{'loss': 0.4891, 'grad_norm': 8.226977348327637, 'learning_rate': 5.892793752218673e-07, 'epoch': 2.65}


 88%|████████▊ | 9950/11268 [8:05:56<51:38,  2.35s/it]

{'loss': 0.2669, 'grad_norm': 1.3014225959777832, 'learning_rate': 5.848420305289315e-07, 'epoch': 2.65}


 88%|████████▊ | 9960/11268 [8:06:20<51:15,  2.35s/it]

{'loss': 0.328, 'grad_norm': 4.458194255828857, 'learning_rate': 5.804046858359958e-07, 'epoch': 2.65}


 88%|████████▊ | 9970/11268 [8:06:44<52:37,  2.43s/it]

{'loss': 0.1581, 'grad_norm': 31.905170440673828, 'learning_rate': 5.7596734114306e-07, 'epoch': 2.65}


 89%|████████▊ | 9980/11268 [8:07:08<53:03,  2.47s/it]

{'loss': 0.2108, 'grad_norm': 70.21726989746094, 'learning_rate': 5.715299964501242e-07, 'epoch': 2.66}


 89%|████████▊ | 9990/11268 [8:07:32<50:35,  2.38s/it]

{'loss': 0.321, 'grad_norm': 4.236892223358154, 'learning_rate': 5.670926517571885e-07, 'epoch': 2.66}


 89%|████████▊ | 10000/11268 [8:07:56<49:28,  2.34s/it]

{'loss': 0.4995, 'grad_norm': 25.685033798217773, 'learning_rate': 5.626553070642527e-07, 'epoch': 2.66}


 89%|████████▉ | 10010/11268 [8:08:21<52:21,  2.50s/it]

{'loss': 0.3757, 'grad_norm': 105.27903747558594, 'learning_rate': 5.582179623713171e-07, 'epoch': 2.67}


 89%|████████▉ | 10020/11268 [8:08:45<49:31,  2.38s/it]

{'loss': 0.2937, 'grad_norm': 72.91624450683594, 'learning_rate': 5.537806176783813e-07, 'epoch': 2.67}


 89%|████████▉ | 10030/11268 [8:09:10<52:12,  2.53s/it]

{'loss': 0.2111, 'grad_norm': 18.722057342529297, 'learning_rate': 5.493432729854456e-07, 'epoch': 2.67}


 89%|████████▉ | 10040/11268 [8:09:34<49:00,  2.39s/it]

{'loss': 0.3016, 'grad_norm': 22.521028518676758, 'learning_rate': 5.449059282925098e-07, 'epoch': 2.67}


 89%|████████▉ | 10050/11268 [8:09:58<48:52,  2.41s/it]

{'loss': 0.3113, 'grad_norm': 150.63192749023438, 'learning_rate': 5.404685835995741e-07, 'epoch': 2.68}


 89%|████████▉ | 10060/11268 [8:10:23<49:09,  2.44s/it]

{'loss': 0.1662, 'grad_norm': 5.0425848960876465, 'learning_rate': 5.360312389066383e-07, 'epoch': 2.68}


 89%|████████▉ | 10070/11268 [8:10:47<48:10,  2.41s/it]

{'loss': 0.2912, 'grad_norm': 28.275833129882812, 'learning_rate': 5.315938942137026e-07, 'epoch': 2.68}


 89%|████████▉ | 10080/11268 [8:11:10<46:53,  2.37s/it]

{'loss': 0.2945, 'grad_norm': 50.8092041015625, 'learning_rate': 5.271565495207668e-07, 'epoch': 2.68}


 90%|████████▉ | 10090/11268 [8:11:34<47:07,  2.40s/it]

{'loss': 0.3889, 'grad_norm': 67.05181121826172, 'learning_rate': 5.227192048278311e-07, 'epoch': 2.69}


 90%|████████▉ | 10100/11268 [8:11:59<47:40,  2.45s/it]

{'loss': 0.4159, 'grad_norm': 39.60372543334961, 'learning_rate': 5.182818601348953e-07, 'epoch': 2.69}


 90%|████████▉ | 10110/11268 [8:12:23<47:41,  2.47s/it]

{'loss': 0.5376, 'grad_norm': 52.968597412109375, 'learning_rate': 5.138445154419595e-07, 'epoch': 2.69}


 90%|████████▉ | 10120/11268 [8:12:48<47:13,  2.47s/it]

{'loss': 0.3229, 'grad_norm': 100.39524841308594, 'learning_rate': 5.094071707490238e-07, 'epoch': 2.69}


 90%|████████▉ | 10130/11268 [8:13:13<47:19,  2.50s/it]

{'loss': 0.2619, 'grad_norm': 19.945972442626953, 'learning_rate': 5.04969826056088e-07, 'epoch': 2.7}


 90%|████████▉ | 10140/11268 [8:13:38<45:51,  2.44s/it]

{'loss': 0.2484, 'grad_norm': 27.328872680664062, 'learning_rate': 5.005324813631523e-07, 'epoch': 2.7}


 90%|█████████ | 10150/11268 [8:14:02<45:32,  2.44s/it]

{'loss': 0.5062, 'grad_norm': 11.822389602661133, 'learning_rate': 4.960951366702166e-07, 'epoch': 2.7}


 90%|█████████ | 10160/11268 [8:14:27<44:59,  2.44s/it]

{'loss': 0.1489, 'grad_norm': 30.832340240478516, 'learning_rate': 4.916577919772808e-07, 'epoch': 2.71}


 90%|█████████ | 10170/11268 [8:14:51<44:37,  2.44s/it]

{'loss': 0.3503, 'grad_norm': 103.85608673095703, 'learning_rate': 4.872204472843451e-07, 'epoch': 2.71}


 90%|█████████ | 10180/11268 [8:15:16<45:46,  2.52s/it]

{'loss': 0.2455, 'grad_norm': 9.48681354522705, 'learning_rate': 4.827831025914094e-07, 'epoch': 2.71}


 90%|█████████ | 10190/11268 [8:15:41<44:31,  2.48s/it]

{'loss': 0.3234, 'grad_norm': 43.2510986328125, 'learning_rate': 4.783457578984736e-07, 'epoch': 2.71}


 91%|█████████ | 10200/11268 [8:16:07<44:45,  2.51s/it]

{'loss': 0.3603, 'grad_norm': 16.59434700012207, 'learning_rate': 4.7390841320553784e-07, 'epoch': 2.72}


 91%|█████████ | 10210/11268 [8:16:32<43:21,  2.46s/it]

{'loss': 0.0944, 'grad_norm': 13.888628959655762, 'learning_rate': 4.694710685126021e-07, 'epoch': 2.72}


 91%|█████████ | 10220/11268 [8:16:57<44:21,  2.54s/it]

{'loss': 0.1858, 'grad_norm': 91.68234252929688, 'learning_rate': 4.6503372381966635e-07, 'epoch': 2.72}


 91%|█████████ | 10230/11268 [8:17:22<43:06,  2.49s/it]

{'loss': 0.2515, 'grad_norm': 11.337655067443848, 'learning_rate': 4.605963791267306e-07, 'epoch': 2.72}


 91%|█████████ | 10240/11268 [8:17:48<45:21,  2.65s/it]

{'loss': 0.3363, 'grad_norm': 34.34951400756836, 'learning_rate': 4.561590344337948e-07, 'epoch': 2.73}


 91%|█████████ | 10250/11268 [8:18:14<44:01,  2.60s/it]

{'loss': 0.4597, 'grad_norm': 85.99002075195312, 'learning_rate': 4.517216897408591e-07, 'epoch': 2.73}


 91%|█████████ | 10260/11268 [8:18:40<42:27,  2.53s/it]

{'loss': 0.4207, 'grad_norm': 24.125316619873047, 'learning_rate': 4.4728434504792333e-07, 'epoch': 2.73}


 91%|█████████ | 10270/11268 [8:19:05<42:12,  2.54s/it]

{'loss': 0.2906, 'grad_norm': 39.75052261352539, 'learning_rate': 4.4284700035498764e-07, 'epoch': 2.73}


 91%|█████████ | 10280/11268 [8:19:31<42:19,  2.57s/it]

{'loss': 0.2741, 'grad_norm': 4.5171613693237305, 'learning_rate': 4.3840965566205184e-07, 'epoch': 2.74}


 91%|█████████▏| 10290/11268 [8:19:57<42:48,  2.63s/it]

{'loss': 0.413, 'grad_norm': 6.434981822967529, 'learning_rate': 4.339723109691161e-07, 'epoch': 2.74}


 91%|█████████▏| 10300/11268 [8:20:24<41:37,  2.58s/it]

{'loss': 0.3456, 'grad_norm': 28.698942184448242, 'learning_rate': 4.2953496627618036e-07, 'epoch': 2.74}


 91%|█████████▏| 10310/11268 [8:20:50<41:21,  2.59s/it]

{'loss': 0.1689, 'grad_norm': 0.9151111841201782, 'learning_rate': 4.250976215832446e-07, 'epoch': 2.74}


 92%|█████████▏| 10320/11268 [8:21:16<41:48,  2.65s/it]

{'loss': 0.3111, 'grad_norm': 0.8015732169151306, 'learning_rate': 4.206602768903089e-07, 'epoch': 2.75}


 92%|█████████▏| 10330/11268 [8:21:42<40:37,  2.60s/it]

{'loss': 0.3018, 'grad_norm': 94.2939224243164, 'learning_rate': 4.1622293219737313e-07, 'epoch': 2.75}


 92%|█████████▏| 10340/11268 [8:22:07<38:36,  2.50s/it]

{'loss': 0.3507, 'grad_norm': 81.34716033935547, 'learning_rate': 4.1178558750443733e-07, 'epoch': 2.75}


 92%|█████████▏| 10350/11268 [8:22:32<37:16,  2.44s/it]

{'loss': 0.3204, 'grad_norm': 34.192466735839844, 'learning_rate': 4.0734824281150164e-07, 'epoch': 2.76}


 92%|█████████▏| 10360/11268 [8:22:57<38:01,  2.51s/it]

{'loss': 0.4342, 'grad_norm': 89.90076446533203, 'learning_rate': 4.029108981185659e-07, 'epoch': 2.76}


 92%|█████████▏| 10370/11268 [8:23:23<39:51,  2.66s/it]

{'loss': 0.6276, 'grad_norm': 134.81710815429688, 'learning_rate': 3.9847355342563016e-07, 'epoch': 2.76}


 92%|█████████▏| 10380/11268 [8:23:49<38:52,  2.63s/it]

{'loss': 0.2983, 'grad_norm': 78.62545013427734, 'learning_rate': 3.940362087326944e-07, 'epoch': 2.76}


 92%|█████████▏| 10390/11268 [8:24:16<39:14,  2.68s/it]

{'loss': 0.3205, 'grad_norm': 42.85738754272461, 'learning_rate': 3.895988640397586e-07, 'epoch': 2.77}


 92%|█████████▏| 10400/11268 [8:24:41<36:35,  2.53s/it]

{'loss': 0.1765, 'grad_norm': 14.836753845214844, 'learning_rate': 3.8516151934682293e-07, 'epoch': 2.77}


 92%|█████████▏| 10410/11268 [8:25:06<34:35,  2.42s/it]

{'loss': 0.4492, 'grad_norm': 23.406118392944336, 'learning_rate': 3.8072417465388713e-07, 'epoch': 2.77}


 92%|█████████▏| 10420/11268 [8:25:32<36:18,  2.57s/it]

{'loss': 0.2428, 'grad_norm': 65.14427185058594, 'learning_rate': 3.762868299609514e-07, 'epoch': 2.77}


 93%|█████████▎| 10430/11268 [8:25:58<37:53,  2.71s/it]

{'loss': 0.2344, 'grad_norm': 102.985107421875, 'learning_rate': 3.7184948526801565e-07, 'epoch': 2.78}


 93%|█████████▎| 10440/11268 [8:26:25<36:53,  2.67s/it]

{'loss': 0.4103, 'grad_norm': 21.253517150878906, 'learning_rate': 3.674121405750799e-07, 'epoch': 2.78}


 93%|█████████▎| 10450/11268 [8:26:53<38:10,  2.80s/it]

{'loss': 0.4604, 'grad_norm': 74.63506317138672, 'learning_rate': 3.6297479588214416e-07, 'epoch': 2.78}


 93%|█████████▎| 10460/11268 [8:27:21<36:57,  2.74s/it]

{'loss': 0.1568, 'grad_norm': 5.663853645324707, 'learning_rate': 3.585374511892084e-07, 'epoch': 2.78}


 93%|█████████▎| 10470/11268 [8:27:49<36:29,  2.74s/it]

{'loss': 0.2586, 'grad_norm': 22.88048553466797, 'learning_rate': 3.541001064962726e-07, 'epoch': 2.79}


 93%|█████████▎| 10480/11268 [8:28:16<34:09,  2.60s/it]

{'loss': 0.4632, 'grad_norm': 16.873645782470703, 'learning_rate': 3.4966276180333693e-07, 'epoch': 2.79}


 93%|█████████▎| 10490/11268 [8:28:42<34:00,  2.62s/it]

{'loss': 0.2101, 'grad_norm': 31.31005096435547, 'learning_rate': 3.4522541711040114e-07, 'epoch': 2.79}


 93%|█████████▎| 10500/11268 [8:29:08<32:49,  2.56s/it]

{'loss': 0.322, 'grad_norm': 60.22578430175781, 'learning_rate': 3.4078807241746545e-07, 'epoch': 2.8}


 93%|█████████▎| 10510/11268 [8:29:33<32:45,  2.59s/it]

{'loss': 0.3191, 'grad_norm': 67.45529174804688, 'learning_rate': 3.3635072772452965e-07, 'epoch': 2.8}


 93%|█████████▎| 10520/11268 [8:29:59<32:13,  2.58s/it]

{'loss': 0.3703, 'grad_norm': 47.997745513916016, 'learning_rate': 3.319133830315939e-07, 'epoch': 2.8}


 93%|█████████▎| 10530/11268 [8:30:26<32:08,  2.61s/it]

{'loss': 0.4238, 'grad_norm': 0.9193398356437683, 'learning_rate': 3.274760383386582e-07, 'epoch': 2.8}


 94%|█████████▎| 10540/11268 [8:30:54<33:00,  2.72s/it]

{'loss': 0.1474, 'grad_norm': 5.201143741607666, 'learning_rate': 3.230386936457224e-07, 'epoch': 2.81}


 94%|█████████▎| 10550/11268 [8:31:20<31:27,  2.63s/it]

{'loss': 0.3411, 'grad_norm': 191.88259887695312, 'learning_rate': 3.186013489527866e-07, 'epoch': 2.81}


 94%|█████████▎| 10560/11268 [8:31:47<31:23,  2.66s/it]

{'loss': 0.2608, 'grad_norm': 113.4052963256836, 'learning_rate': 3.1416400425985094e-07, 'epoch': 2.81}


 94%|█████████▍| 10570/11268 [8:32:12<29:51,  2.57s/it]

{'loss': 0.5207, 'grad_norm': 15.42180061340332, 'learning_rate': 3.097266595669152e-07, 'epoch': 2.81}


 94%|█████████▍| 10580/11268 [8:32:39<29:42,  2.59s/it]

{'loss': 0.1727, 'grad_norm': 17.21611213684082, 'learning_rate': 3.0528931487397945e-07, 'epoch': 2.82}


 94%|█████████▍| 10590/11268 [8:33:05<30:08,  2.67s/it]

{'loss': 0.3201, 'grad_norm': 6.222742080688477, 'learning_rate': 3.008519701810437e-07, 'epoch': 2.82}


 94%|█████████▍| 10600/11268 [8:33:32<30:04,  2.70s/it]

{'loss': 0.3538, 'grad_norm': 38.24384689331055, 'learning_rate': 2.9641462548810796e-07, 'epoch': 2.82}


 94%|█████████▍| 10610/11268 [8:33:58<28:42,  2.62s/it]

{'loss': 0.5641, 'grad_norm': 52.899391174316406, 'learning_rate': 2.919772807951722e-07, 'epoch': 2.82}


 94%|█████████▍| 10620/11268 [8:34:25<28:11,  2.61s/it]

{'loss': 0.5429, 'grad_norm': 12.201167106628418, 'learning_rate': 2.875399361022364e-07, 'epoch': 2.83}


 94%|█████████▍| 10630/11268 [8:34:51<27:31,  2.59s/it]

{'loss': 0.2881, 'grad_norm': 80.1072998046875, 'learning_rate': 2.831025914093007e-07, 'epoch': 2.83}


 94%|█████████▍| 10640/11268 [8:35:17<27:10,  2.60s/it]

{'loss': 0.4063, 'grad_norm': 33.6482048034668, 'learning_rate': 2.7866524671636494e-07, 'epoch': 2.83}


 95%|█████████▍| 10650/11268 [8:35:43<26:58,  2.62s/it]

{'loss': 0.3953, 'grad_norm': 4.446513652801514, 'learning_rate': 2.742279020234292e-07, 'epoch': 2.84}


 95%|█████████▍| 10660/11268 [8:36:09<26:26,  2.61s/it]

{'loss': 0.3866, 'grad_norm': 8.394654273986816, 'learning_rate': 2.6979055733049345e-07, 'epoch': 2.84}


 95%|█████████▍| 10670/11268 [8:36:35<25:56,  2.60s/it]

{'loss': 0.4736, 'grad_norm': 78.55708312988281, 'learning_rate': 2.653532126375577e-07, 'epoch': 2.84}


 95%|█████████▍| 10680/11268 [8:37:02<25:49,  2.64s/it]

{'loss': 0.2772, 'grad_norm': 70.56546020507812, 'learning_rate': 2.6091586794462197e-07, 'epoch': 2.84}


 95%|█████████▍| 10690/11268 [8:37:28<25:04,  2.60s/it]

{'loss': 0.1488, 'grad_norm': 43.750553131103516, 'learning_rate': 2.564785232516862e-07, 'epoch': 2.85}


 95%|█████████▍| 10700/11268 [8:37:54<24:37,  2.60s/it]

{'loss': 0.2672, 'grad_norm': 12.761569023132324, 'learning_rate': 2.520411785587505e-07, 'epoch': 2.85}


 95%|█████████▌| 10710/11268 [8:38:20<24:36,  2.65s/it]

{'loss': 0.4332, 'grad_norm': 3.4964170455932617, 'learning_rate': 2.476038338658147e-07, 'epoch': 2.85}


 95%|█████████▌| 10720/11268 [8:38:46<23:51,  2.61s/it]

{'loss': 0.2518, 'grad_norm': 1.204679250717163, 'learning_rate': 2.4316648917287894e-07, 'epoch': 2.85}


 95%|█████████▌| 10730/11268 [8:39:13<23:38,  2.64s/it]

{'loss': 0.3102, 'grad_norm': 106.91858673095703, 'learning_rate': 2.387291444799432e-07, 'epoch': 2.86}


 95%|█████████▌| 10740/11268 [8:39:39<22:37,  2.57s/it]

{'loss': 0.2781, 'grad_norm': 24.199724197387695, 'learning_rate': 2.3429179978700748e-07, 'epoch': 2.86}


 95%|█████████▌| 10750/11268 [8:40:05<23:06,  2.68s/it]

{'loss': 0.3028, 'grad_norm': 54.24142074584961, 'learning_rate': 2.2985445509407171e-07, 'epoch': 2.86}


 95%|█████████▌| 10760/11268 [8:40:32<22:25,  2.65s/it]

{'loss': 0.3798, 'grad_norm': 32.54857635498047, 'learning_rate': 2.2541711040113597e-07, 'epoch': 2.86}


 96%|█████████▌| 10770/11268 [8:41:00<22:30,  2.71s/it]

{'loss': 0.4542, 'grad_norm': 46.540279388427734, 'learning_rate': 2.2097976570820023e-07, 'epoch': 2.87}


 96%|█████████▌| 10780/11268 [8:41:30<24:33,  3.02s/it]

{'loss': 0.3309, 'grad_norm': 19.46084213256836, 'learning_rate': 2.1654242101526449e-07, 'epoch': 2.87}


 96%|█████████▌| 10790/11268 [8:41:56<21:05,  2.65s/it]

{'loss': 0.1884, 'grad_norm': 58.1977653503418, 'learning_rate': 2.1210507632232872e-07, 'epoch': 2.87}


 96%|█████████▌| 10800/11268 [8:42:23<20:45,  2.66s/it]

{'loss': 0.2782, 'grad_norm': 99.43461608886719, 'learning_rate': 2.0766773162939297e-07, 'epoch': 2.88}


 96%|█████████▌| 10810/11268 [8:42:49<20:05,  2.63s/it]

{'loss': 0.3613, 'grad_norm': 3.946159839630127, 'learning_rate': 2.0323038693645723e-07, 'epoch': 2.88}


 96%|█████████▌| 10820/11268 [8:43:16<19:39,  2.63s/it]

{'loss': 0.263, 'grad_norm': 71.38822937011719, 'learning_rate': 1.987930422435215e-07, 'epoch': 2.88}


 96%|█████████▌| 10830/11268 [8:43:43<19:45,  2.71s/it]

{'loss': 0.1901, 'grad_norm': 0.8311753273010254, 'learning_rate': 1.9435569755058577e-07, 'epoch': 2.88}


 96%|█████████▌| 10840/11268 [8:44:09<18:36,  2.61s/it]

{'loss': 0.1858, 'grad_norm': 51.09591293334961, 'learning_rate': 1.8991835285764998e-07, 'epoch': 2.89}


 96%|█████████▋| 10850/11268 [8:44:35<18:25,  2.64s/it]

{'loss': 0.2508, 'grad_norm': 0.7948598265647888, 'learning_rate': 1.8548100816471426e-07, 'epoch': 2.89}


 96%|█████████▋| 10860/11268 [8:45:02<18:08,  2.67s/it]

{'loss': 0.3938, 'grad_norm': 14.117850303649902, 'learning_rate': 1.8104366347177852e-07, 'epoch': 2.89}


 96%|█████████▋| 10870/11268 [8:45:31<18:56,  2.85s/it]

{'loss': 0.2735, 'grad_norm': 91.07365417480469, 'learning_rate': 1.7660631877884277e-07, 'epoch': 2.89}


 97%|█████████▋| 10880/11268 [8:46:00<19:17,  2.98s/it]

{'loss': 0.2989, 'grad_norm': 2.704272508621216, 'learning_rate': 1.72168974085907e-07, 'epoch': 2.9}


 97%|█████████▋| 10890/11268 [8:46:28<17:15,  2.74s/it]

{'loss': 0.2851, 'grad_norm': 5.119171619415283, 'learning_rate': 1.6773162939297126e-07, 'epoch': 2.9}


 97%|█████████▋| 10900/11268 [8:46:55<16:33,  2.70s/it]

{'loss': 0.167, 'grad_norm': 55.949195861816406, 'learning_rate': 1.6329428470003552e-07, 'epoch': 2.9}


 97%|█████████▋| 10910/11268 [8:47:23<16:40,  2.79s/it]

{'loss': 0.2153, 'grad_norm': 32.78831481933594, 'learning_rate': 1.5885694000709978e-07, 'epoch': 2.9}


 97%|█████████▋| 10920/11268 [8:47:50<15:43,  2.71s/it]

{'loss': 0.1669, 'grad_norm': 4.8742265701293945, 'learning_rate': 1.54419595314164e-07, 'epoch': 2.91}


 97%|█████████▋| 10930/11268 [8:48:17<16:00,  2.84s/it]

{'loss': 0.2921, 'grad_norm': 148.50189208984375, 'learning_rate': 1.4998225062122826e-07, 'epoch': 2.91}


 97%|█████████▋| 10940/11268 [8:48:43<14:18,  2.62s/it]

{'loss': 0.2621, 'grad_norm': 52.48775100708008, 'learning_rate': 1.4554490592829252e-07, 'epoch': 2.91}


 97%|█████████▋| 10950/11268 [8:49:10<14:12,  2.68s/it]

{'loss': 0.4221, 'grad_norm': 36.922000885009766, 'learning_rate': 1.4110756123535678e-07, 'epoch': 2.92}


 97%|█████████▋| 10960/11268 [8:49:37<13:38,  2.66s/it]

{'loss': 0.2133, 'grad_norm': 2.3826258182525635, 'learning_rate': 1.3667021654242103e-07, 'epoch': 2.92}


 97%|█████████▋| 10970/11268 [8:50:03<12:50,  2.59s/it]

{'loss': 0.4164, 'grad_norm': 234.92001342773438, 'learning_rate': 1.322328718494853e-07, 'epoch': 2.92}


 97%|█████████▋| 10980/11268 [8:50:29<12:39,  2.64s/it]

{'loss': 0.4847, 'grad_norm': 38.0904426574707, 'learning_rate': 1.2779552715654952e-07, 'epoch': 2.92}


 98%|█████████▊| 10990/11268 [8:50:57<13:07,  2.83s/it]

{'loss': 0.1573, 'grad_norm': 36.71804428100586, 'learning_rate': 1.2335818246361378e-07, 'epoch': 2.93}


 98%|█████████▊| 11000/11268 [8:51:25<12:37,  2.83s/it]

{'loss': 0.2552, 'grad_norm': 27.281105041503906, 'learning_rate': 1.1892083777067802e-07, 'epoch': 2.93}


 98%|█████████▊| 11010/11268 [8:51:52<12:03,  2.80s/it]

{'loss': 0.2638, 'grad_norm': 102.39623260498047, 'learning_rate': 1.144834930777423e-07, 'epoch': 2.93}


 98%|█████████▊| 11020/11268 [8:52:20<11:21,  2.75s/it]

{'loss': 0.43, 'grad_norm': 8.017369270324707, 'learning_rate': 1.1004614838480654e-07, 'epoch': 2.93}


 98%|█████████▊| 11030/11268 [8:52:47<10:15,  2.59s/it]

{'loss': 0.437, 'grad_norm': 56.770206451416016, 'learning_rate': 1.056088036918708e-07, 'epoch': 2.94}


 98%|█████████▊| 11040/11268 [8:53:14<10:40,  2.81s/it]

{'loss': 0.2583, 'grad_norm': 5.171400547027588, 'learning_rate': 1.0117145899893505e-07, 'epoch': 2.94}


 98%|█████████▊| 11050/11268 [8:53:42<09:54,  2.73s/it]

{'loss': 0.3295, 'grad_norm': 13.535947799682617, 'learning_rate': 9.67341143059993e-08, 'epoch': 2.94}


 98%|█████████▊| 11060/11268 [8:54:09<09:31,  2.75s/it]

{'loss': 0.2533, 'grad_norm': 11.08226203918457, 'learning_rate': 9.229676961306355e-08, 'epoch': 2.94}


 98%|█████████▊| 11070/11268 [8:54:36<08:51,  2.68s/it]

{'loss': 0.3809, 'grad_norm': 102.27842712402344, 'learning_rate': 8.78594249201278e-08, 'epoch': 2.95}


 98%|█████████▊| 11080/11268 [8:55:03<08:13,  2.62s/it]

{'loss': 0.4155, 'grad_norm': 13.20802116394043, 'learning_rate': 8.342208022719205e-08, 'epoch': 2.95}


 98%|█████████▊| 11090/11268 [8:55:30<08:06,  2.74s/it]

{'loss': 0.2764, 'grad_norm': 115.45135498046875, 'learning_rate': 7.89847355342563e-08, 'epoch': 2.95}


 99%|█████████▊| 11100/11268 [8:55:57<07:47,  2.78s/it]

{'loss': 0.3116, 'grad_norm': 17.7121639251709, 'learning_rate': 7.454739084132057e-08, 'epoch': 2.96}


 99%|█████████▊| 11110/11268 [8:56:24<06:53,  2.62s/it]

{'loss': 0.5238, 'grad_norm': 13.090487480163574, 'learning_rate': 7.011004614838481e-08, 'epoch': 2.96}


 99%|█████████▊| 11120/11268 [8:56:51<06:41,  2.71s/it]

{'loss': 0.3501, 'grad_norm': 111.6506118774414, 'learning_rate': 6.567270145544907e-08, 'epoch': 2.96}


 99%|█████████▉| 11130/11268 [8:57:18<06:11,  2.69s/it]

{'loss': 0.526, 'grad_norm': 45.0068359375, 'learning_rate': 6.123535676251331e-08, 'epoch': 2.96}


 99%|█████████▉| 11140/11268 [8:57:45<05:41,  2.67s/it]

{'loss': 0.2647, 'grad_norm': 75.22984313964844, 'learning_rate': 5.679801206957757e-08, 'epoch': 2.97}


 99%|█████████▉| 11150/11268 [8:58:12<05:16,  2.68s/it]

{'loss': 0.2181, 'grad_norm': 89.87074279785156, 'learning_rate': 5.236066737664182e-08, 'epoch': 2.97}


 99%|█████████▉| 11160/11268 [8:58:39<04:46,  2.65s/it]

{'loss': 0.3852, 'grad_norm': 3.005546808242798, 'learning_rate': 4.792332268370607e-08, 'epoch': 2.97}


 99%|█████████▉| 11170/11268 [8:59:05<04:19,  2.65s/it]

{'loss': 0.3686, 'grad_norm': 124.94135284423828, 'learning_rate': 4.348597799077032e-08, 'epoch': 2.97}


 99%|█████████▉| 11180/11268 [8:59:32<03:53,  2.66s/it]

{'loss': 0.1968, 'grad_norm': 33.08036422729492, 'learning_rate': 3.904863329783458e-08, 'epoch': 2.98}


 99%|█████████▉| 11190/11268 [8:59:58<03:26,  2.64s/it]

{'loss': 0.3085, 'grad_norm': 67.15409851074219, 'learning_rate': 3.461128860489883e-08, 'epoch': 2.98}


 99%|█████████▉| 11200/11268 [9:00:25<02:59,  2.64s/it]

{'loss': 0.3928, 'grad_norm': 2.3185179233551025, 'learning_rate': 3.0173943911963086e-08, 'epoch': 2.98}


 99%|█████████▉| 11210/11268 [9:00:51<02:36,  2.71s/it]

{'loss': 0.5321, 'grad_norm': 54.76182174682617, 'learning_rate': 2.5736599219027333e-08, 'epoch': 2.98}


100%|█████████▉| 11220/11268 [9:01:18<02:12,  2.76s/it]

{'loss': 0.2429, 'grad_norm': 167.281982421875, 'learning_rate': 2.129925452609159e-08, 'epoch': 2.99}


100%|█████████▉| 11230/11268 [9:01:46<01:42,  2.71s/it]

{'loss': 0.2159, 'grad_norm': 90.14580535888672, 'learning_rate': 1.686190983315584e-08, 'epoch': 2.99}


100%|█████████▉| 11240/11268 [9:02:14<01:18,  2.80s/it]

{'loss': 0.3978, 'grad_norm': 22.672422409057617, 'learning_rate': 1.2424565140220093e-08, 'epoch': 2.99}


100%|█████████▉| 11250/11268 [9:02:40<00:47,  2.63s/it]

{'loss': 0.4544, 'grad_norm': 125.3049087524414, 'learning_rate': 7.987220447284345e-09, 'epoch': 3.0}


100%|█████████▉| 11260/11268 [9:03:06<00:20,  2.57s/it]

{'loss': 0.379, 'grad_norm': 92.91517639160156, 'learning_rate': 3.549875754348598e-09, 'epoch': 3.0}


                                                       
100%|██████████| 11268/11268 [9:09:47<00:00,  2.17s/it]

{'eval_loss': 0.4616864025592804, 'eval_runtime': 379.2874, 'eval_samples_per_second': 9.903, 'eval_steps_per_second': 1.239, 'epoch': 3.0}


100%|██████████| 11268/11268 [9:09:51<00:00,  2.93s/it]

{'train_runtime': 32991.3852, 'train_samples_per_second': 2.732, 'train_steps_per_second': 0.342, 'train_loss': 0.43026263869566234, 'epoch': 3.0}





TrainOutput(global_step=11268, training_loss=0.43026263869566234, metrics={'train_runtime': 32991.3852, 'train_samples_per_second': 2.732, 'train_steps_per_second': 0.342, 'total_flos': 2.371414922640691e+16, 'train_loss': 0.43026263869566234, 'epoch': 3.0})

epoch 3 model

In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
model_path = "./results/checkpoint-11268" 
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [11]:
test_dataset = df.iloc[round(0.9*len(df)):]
batch_size = 50
all_predictions = []
for i in range(0, len(test_dataset), batch_size):
    print(i)
    temp = test_dataset.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())


# test_encodings = tokenizer(test_dataset['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750


In [15]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
# convert all preds to numpy
np_preds = np.array(all_predictions)
labels = test_dataset["bias"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.8631
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1299
           1       0.88      0.85      0.86      1109
           2       0.86      0.86      0.86      1347

    accuracy                           0.86      3755
   macro avg       0.86      0.86      0.86      3755
weighted avg       0.86      0.86      0.86      3755



In [16]:
valid_dataset = df.iloc[round(0.8*len(df)):round(0.9*len(df))]
batch_size = 50
all_predictions = []
for i in range(0, len(test_dataset), batch_size):
    print(i)
    temp = valid_dataset.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())


# test_encodings = tokenizer(test_dataset['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750


In [18]:
np_preds = np.array(all_predictions)
labels = valid_dataset["bias"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.8698
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1310
           1       0.87      0.86      0.86      1061
           2       0.89      0.86      0.87      1385

    accuracy                           0.87      3756
   macro avg       0.87      0.87      0.87      3756
weighted avg       0.87      0.87      0.87      3756



epoch 2 model

In [19]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
model_path = "./results/checkpoint-7512" 
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [20]:
test_dataset = df.iloc[round(0.9*len(df)):]
batch_size = 50
all_predictions = []
for i in range(0, len(test_dataset), batch_size):
    print(i)
    temp = test_dataset.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750


In [22]:
np_preds = np.array(all_predictions)
labels = test_dataset["bias"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.8453
              precision    recall  f1-score   support

           0       0.80      0.90      0.85      1299
           1       0.89      0.83      0.86      1109
           2       0.87      0.80      0.83      1347

    accuracy                           0.85      3755
   macro avg       0.85      0.85      0.85      3755
weighted avg       0.85      0.85      0.85      3755



epoch 1 model

In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
model_path = "./results/checkpoint-3756" 
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [24]:
test_dataset = df.iloc[round(0.9*len(df)):]
batch_size = 50
all_predictions = []
for i in range(0, len(test_dataset), batch_size):
    print(i)
    temp = test_dataset.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['content'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750


In [25]:
np_preds = np.array(all_predictions)
labels = test_dataset["bias"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.8413
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      1299
           1       0.85      0.81      0.83      1109
           2       0.81      0.87      0.84      1347

    accuracy                           0.84      3755
   macro avg       0.84      0.84      0.84      3755
weighted avg       0.84      0.84      0.84      3755



testing on twin views

In [8]:
babe_test_df = pd.read_excel("/Users/kritigupta/mlp project/data/final_labels_MBIC.xlsx")
babe_test_df.head()

  for idx, row in parser.parse():


Unnamed: 0,text,news_link,outlet,topic,type,group_id,num_sent,label_bias,label_opinion,article,biased_words
0,YouTube is making clear there will be no “birt...,https://eu.usatoday.com/story/tech/2020/02/03/...,usa-today,elections-2020,center,1,1,Biased,Somewhat factual but also opinionated,YouTube says no ‘deepfakes’ or ‘birther’ video...,"['belated', 'birtherism']"
1,So while there may be a humanitarian crisis dr...,https://www.alternet.org/2019/01/here-are-5-of...,alternet,immigration,left,1,1,Biased,Expresses writer’s opinion,Speaking to the country for the first time fro...,['crisis']
2,"Looking around the United States, there is nev...",https://thefederalist.com/2020/03/11/woman-who...,federalist,abortion,right,1,1,Biased,Somewhat factual but also opinionated,The left has a thing for taking babies hostage...,"['killing', 'never', 'developing', 'humans', '..."
3,The Republican president assumed he was helpin...,http://www.msnbc.com/rachel-maddow-show/auto-i...,msnbc,environment,left,1,1,Biased,Expresses writer’s opinion,"In Barack Obama’s first term, the administrati...","['rejects', 'happy', 'assumed']"
4,The explosion of the Hispanic population has l...,https://www.breitbart.com/politics/2015/02/26/...,breitbart,student-debt,right,1,1,Biased,No agreement,"Republicans should stop fighting amnesty, Pres...",['explosion']


In [10]:
# converting type to integer label
babe_test_df['label'] = babe_test_df['type'].map({'left': 0, 'center': 1, 'right': 2})


In [13]:
# drop duplicates from text column
babe_test_df = babe_test_df.drop_duplicates(subset=['text'])

#drop empty topic rows
babe_test_df = babe_test_df.dropna(subset=['text','topic'])

babe_test_df.shape

(1700, 12)

In [14]:
batch_size = 50
all_predictions = []
for i in range(0, len(babe_test_df), batch_size):
    print(i)
    temp = babe_test_df.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['text'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650


In [17]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
np_preds = np.array(all_predictions)
labels = babe_test_df["label"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.5006
              precision    recall  f1-score   support

           0       0.45      0.86      0.59       694
           1       0.80      0.01      0.03       315
           2       0.66      0.36      0.46       691

    accuracy                           0.50      1700
   macro avg       0.64      0.41      0.36      1700
weighted avg       0.60      0.50      0.44      1700



In [18]:
df_twinviews_test = df = pd.read_csv("hf://datasets/wwbrannon/twinviews-13k/twinviews-13k.csv")
df_twinviews_test.head()

Unnamed: 0,l,r,topic
0,LGBTQ+ individuals should have the same rights...,Marriage should only be between a man and a wo...,LGBTQ+ Rights
1,Police reform is necessary to address systemic...,Law enforcement should be supported and given ...,Police Reform
2,A woman should have the right to choose whethe...,Abortion should be illegal and considered as t...,Abortion Rights
3,Increase regulations on businesses to protect ...,Reduce regulations on businesses to promote in...,Regulation
4,Investing in clean energy technologies will cr...,Government subsidies for clean energy distort ...,Clean Energy


In [19]:
# drop topic duplicates (taking only one sentence per topic)
df_twinviews_test['topic']  = df_twinviews_test.topic.str.lower()
df_twinviews_test = df_twinviews_test.drop_duplicates(subset=['topic'])


In [20]:
df_twinviews_test.shape

(875, 3)

In [26]:
#split dataset by l and r
d1 = df_twinviews_test[['l','topic']]
d1['label'] = 0
d1.rename(columns={'l':'text'},inplace=True)
d2 = df_twinviews_test[['r','topic']]
d2['label'] = 2
d2.rename(columns={'r':'text'},inplace=True)
df_twinviews_test = pd.concat([d1,d2])

KeyError: "['l'] not in index"

In [31]:
df_twinviews_test.drop_duplicates(subset=['text'],inplace=True)
df_twinviews_test.dropna(subset=['text'],inplace=True)

In [32]:
df_twinviews_test.shape

(1745, 3)

In [33]:
batch_size = 50
all_predictions = []
for i in range(0, len(df_twinviews_test), batch_size):
    print(i)
    temp = df_twinviews_test.iloc[i:i+batch_size]
    test_encodings = tokenizer(temp['text'].to_list(), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        predictions = model(**test_encodings)
    all_predictions.extend(predictions.logits.argmax(dim=-1).cpu().numpy())

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700


In [34]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
np_preds = np.array(all_predictions)
labels = df_twinviews_test["label"].to_numpy()
print(f"Accuracy: {accuracy_score(labels, np_preds):.4f}")
print(classification_report(labels, np_preds))

Accuracy: 0.6126
              precision    recall  f1-score   support

           0       0.58      0.81      0.68       872
           1       0.00      0.00      0.00         0
           2       0.69      0.41      0.52       873

    accuracy                           0.61      1745
   macro avg       0.43      0.41      0.40      1745
weighted avg       0.64      0.61      0.60      1745



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
