## Training

In [12]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Dataset loader used below to read the dataset directory from disk.
from datasets import load_from_disk

# Hand cached-but-unused GPU memory back to the driver before loading the model.
torch.cuda.empty_cache()

# Tokenizer and model are both restored from the same pretrained checkpoint,
# so the name is kept in one place.
PRETRAINED_CHECKPOINT = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT)

# Dataset stored in a local directory; the splits used later in this cell are
# "train" and "validation".
SAMSUM_DATASET_DIR = './samsum_dataset'
dataset_samsum = load_from_disk(SAMSUM_DATASET_DIR)

# Function to convert examples to features
def convert_examples_to_features(example_batch, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogue/summary pairs into seq2seq features.

    Intended to be passed to ``Dataset.map(..., batched=True)``; it relies on
    the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with a 'dialogue' entry (source texts) and a
            'summary' entry (target texts).
        max_input_length: Maximum number of tokens kept per dialogue; longer
            dialogues are truncated.
        max_target_length: Maximum number of tokens kept per summary; longer
            summaries are truncated.

    Returns:
        dict with 'input_ids' and 'attention_mask' from the tokenized
        dialogues, and 'labels' holding the token ids of the summaries.
    """
    # NOTE(review): the dialogue is tokenized as-is, without a task prefix
    # such as "summarize: "; inference code must feed inputs the same way.
    input_encodings = tokenizer(
        example_batch['dialogue'],
        max_length=max_input_length,
        truncation=True,
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager. Called without
    # `text`, it returns the target encodings directly, so 'input_ids' below
    # are the summary token ids. A separate call is needed because the
    # summaries use a different max_length than the dialogues.
    target_encodings = tokenizer(
        text_target=example_batch['summary'],
        max_length=max_target_length,
        truncation=True,
    )
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }

# Tokenize every split of the dataset in batches; the original columns are
# kept and 'input_ids', 'attention_mask' and 'labels' are added.
dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched=True)

# Training configuration.
training_args = TrainingArguments(
    output_dir='./results',  # Intermediate checkpoints are written here
    num_train_epochs=1,  # Single pass over the training split
    per_device_train_batch_size=1,  # One example per device per training step
    per_device_eval_batch_size=1,  # One example per device per evaluation step
    warmup_steps=100,  # Learning-rate warmup over the first 100 steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,  # Log training loss every 50 steps
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; confirm against the installed version before upgrading.
    evaluation_strategy="steps",
    eval_steps=100,  # Run evaluation on the validation split every 100 steps
    # NOTE(review): no `save_total_limit` is set, so every checkpoint written
    # at this interval is presumably kept on disk; verify available space.
    save_steps=200,  # Write a checkpoint every 200 steps
    gradient_accumulation_steps=1,  # No gradient accumulation
    fp16=True  # Use mixed precision training
)

# Collator that batches the variable-length tokenized examples for the
# seq2seq model (inputs and labels are padded per batch).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer wired to the train/validation splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model. The Trainer above was created without a
# tokenizer, so save_model() writes only the model files; the tokenizer is
# saved to the same directory so that it can be reloaded from that single
# path with from_pretrained().
final_model_dir = './t5_base_trained'
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 14732/14732 [00:04<00:00, 3181.80 examples/s]
Map: 100%|██████████| 819/819 [00:00<00:00, 3120.95 examples/s]
Map: 100%|██████████| 818/818 [00:00<00:00, 3179.94 examples/s]
  0%|          | 51/14732 [00:08<42:46,  5.72it/s] 

{'loss': 3.0774, 'grad_norm': 18.636903762817383, 'learning_rate': 2.25e-05, 'epoch': 0.0}


  1%|          | 100/14732 [00:17<41:58,  5.81it/s]

{'loss': 2.0201, 'grad_norm': 15.437552452087402, 'learning_rate': 4.75e-05, 'epoch': 0.01}


                                                   
  1%|          | 101/14732 [00:28<14:14:16,  3.50s/it]

{'eval_loss': 1.7793482542037964, 'eval_runtime': 11.0936, 'eval_samples_per_second': 73.736, 'eval_steps_per_second': 73.736, 'epoch': 0.01}


  1%|          | 151/14732 [00:36<42:43,  5.69it/s]   

{'loss': 2.0267, 'grad_norm': 16.29692840576172, 'learning_rate': 4.984622744669219e-05, 'epoch': 0.01}


  1%|▏         | 200/14732 [00:45<40:59,  5.91it/s]

{'loss': 1.8065, 'grad_norm': 21.25791358947754, 'learning_rate': 4.9675369054127944e-05, 'epoch': 0.01}


                                                   
  1%|▏         | 200/14732 [00:56<40:59,  5.91it/s]

{'eval_loss': 1.6638611555099487, 'eval_runtime': 11.0122, 'eval_samples_per_second': 74.281, 'eval_steps_per_second': 74.281, 'epoch': 0.01}


  2%|▏         | 251/14732 [01:07<42:23,  5.69it/s]   

{'loss': 1.824, 'grad_norm': 12.781135559082031, 'learning_rate': 4.950451066156369e-05, 'epoch': 0.02}


  2%|▏         | 300/14732 [01:15<40:51,  5.89it/s]

{'loss': 1.9381, 'grad_norm': 10.729756355285645, 'learning_rate': 4.933365226899945e-05, 'epoch': 0.02}


                                                   
  2%|▏         | 301/14732 [01:27<14:17:09,  3.56s/it]

{'eval_loss': 1.6045453548431396, 'eval_runtime': 11.3122, 'eval_samples_per_second': 72.312, 'eval_steps_per_second': 72.312, 'epoch': 0.02}


  2%|▏         | 351/14732 [01:35<40:51,  5.87it/s]   

{'loss': 1.895, 'grad_norm': 6.27483606338501, 'learning_rate': 4.916279387643521e-05, 'epoch': 0.02}


  3%|▎         | 400/14732 [01:43<40:51,  5.85it/s]

{'loss': 1.6666, 'grad_norm': 10.34182357788086, 'learning_rate': 4.899193548387097e-05, 'epoch': 0.03}


                                                   
  3%|▎         | 400/14732 [01:54<40:51,  5.85it/s]

{'eval_loss': 1.5820105075836182, 'eval_runtime': 10.9089, 'eval_samples_per_second': 74.985, 'eval_steps_per_second': 74.985, 'epoch': 0.03}


  3%|▎         | 451/14732 [02:05<40:30,  5.88it/s]   

{'loss': 1.8497, 'grad_norm': 16.357051849365234, 'learning_rate': 4.8821077091306726e-05, 'epoch': 0.03}


  3%|▎         | 500/14732 [02:13<40:24,  5.87it/s]

{'loss': 1.7445, 'grad_norm': 8.055150032043457, 'learning_rate': 4.865021869874248e-05, 'epoch': 0.03}


                                                   
  3%|▎         | 501/14732 [02:24<13:44:48,  3.48s/it]

{'eval_loss': 1.5707825422286987, 'eval_runtime': 11.0001, 'eval_samples_per_second': 74.363, 'eval_steps_per_second': 74.363, 'epoch': 0.03}


  4%|▎         | 551/14732 [02:33<40:44,  5.80it/s]   

{'loss': 1.7414, 'grad_norm': 5.102427959442139, 'learning_rate': 4.847936030617824e-05, 'epoch': 0.04}


  4%|▍         | 600/14732 [02:42<41:26,  5.68it/s]

{'loss': 1.7629, 'grad_norm': 8.253499984741211, 'learning_rate': 4.8308501913614e-05, 'epoch': 0.04}


                                                   
  4%|▍         | 600/14732 [02:53<41:26,  5.68it/s]

{'eval_loss': 1.5506666898727417, 'eval_runtime': 11.1559, 'eval_samples_per_second': 73.324, 'eval_steps_per_second': 73.324, 'epoch': 0.04}


  4%|▍         | 651/14732 [03:04<40:34,  5.78it/s]   

{'loss': 1.6784, 'grad_norm': 10.156864166259766, 'learning_rate': 4.813764352104976e-05, 'epoch': 0.04}


  5%|▍         | 700/14732 [03:12<40:20,  5.80it/s]

{'loss': 1.6692, 'grad_norm': 5.922618389129639, 'learning_rate': 4.7970202296336796e-05, 'epoch': 0.05}


                                                   
  5%|▍         | 701/14732 [03:23<13:27:26,  3.45s/it]

{'eval_loss': 1.53667151927948, 'eval_runtime': 10.9881, 'eval_samples_per_second': 74.444, 'eval_steps_per_second': 74.444, 'epoch': 0.05}


  5%|▌         | 751/14732 [03:32<40:18,  5.78it/s]   

{'loss': 1.5915, 'grad_norm': 10.407232284545898, 'learning_rate': 4.779934390377255e-05, 'epoch': 0.05}


  5%|▌         | 800/14732 [03:40<39:59,  5.81it/s]

{'loss': 1.5126, 'grad_norm': 7.359691619873047, 'learning_rate': 4.7628485511208316e-05, 'epoch': 0.05}


                                                   
  5%|▌         | 800/14732 [03:51<39:59,  5.81it/s]

{'eval_loss': 1.542599081993103, 'eval_runtime': 11.0467, 'eval_samples_per_second': 74.049, 'eval_steps_per_second': 74.049, 'epoch': 0.05}


  6%|▌         | 851/14732 [04:02<39:32,  5.85it/s]   

{'loss': 1.5194, 'grad_norm': 4.745540618896484, 'learning_rate': 4.745762711864407e-05, 'epoch': 0.06}


  6%|▌         | 900/14732 [04:11<41:53,  5.50it/s]

{'loss': 1.6428, 'grad_norm': 8.75198745727539, 'learning_rate': 4.728676872607983e-05, 'epoch': 0.06}


                                                   
  6%|▌         | 901/14732 [04:22<13:16:04,  3.45s/it]

{'eval_loss': 1.5294899940490723, 'eval_runtime': 10.9056, 'eval_samples_per_second': 75.007, 'eval_steps_per_second': 75.007, 'epoch': 0.06}


  6%|▋         | 951/14732 [04:30<40:16,  5.70it/s]   

{'loss': 1.8141, 'grad_norm': 5.910475730895996, 'learning_rate': 4.7115910333515585e-05, 'epoch': 0.06}


  7%|▋         | 1000/14732 [04:39<38:13,  5.99it/s]

{'loss': 1.6745, 'grad_norm': 5.674985408782959, 'learning_rate': 4.694505194095134e-05, 'epoch': 0.07}


                                                    
  7%|▋         | 1000/14732 [04:50<38:13,  5.99it/s]

{'eval_loss': 1.5061604976654053, 'eval_runtime': 11.226, 'eval_samples_per_second': 72.866, 'eval_steps_per_second': 72.866, 'epoch': 0.07}


  7%|▋         | 1051/14732 [05:01<38:32,  5.92it/s]   

{'loss': 1.7354, 'grad_norm': 5.951595783233643, 'learning_rate': 4.67741935483871e-05, 'epoch': 0.07}


  7%|▋         | 1100/14732 [05:10<39:08,  5.81it/s]

{'loss': 1.5829, 'grad_norm': 16.826005935668945, 'learning_rate': 4.6603335155822855e-05, 'epoch': 0.07}


                                                    
  7%|▋         | 1101/14732 [05:21<13:28:32,  3.56s/it]

{'eval_loss': 1.509636640548706, 'eval_runtime': 11.2877, 'eval_samples_per_second': 72.468, 'eval_steps_per_second': 72.468, 'epoch': 0.07}


  8%|▊         | 1151/14732 [05:30<39:17,  5.76it/s]   

{'loss': 1.7237, 'grad_norm': 7.286661624908447, 'learning_rate': 4.643247676325861e-05, 'epoch': 0.08}


  8%|▊         | 1200/14732 [05:38<38:56,  5.79it/s]

{'loss': 1.6947, 'grad_norm': 8.341531753540039, 'learning_rate': 4.626161837069437e-05, 'epoch': 0.08}


                                                    
  8%|▊         | 1200/14732 [05:49<38:56,  5.79it/s]

{'eval_loss': 1.4913302659988403, 'eval_runtime': 11.2013, 'eval_samples_per_second': 73.027, 'eval_steps_per_second': 73.027, 'epoch': 0.08}


  8%|▊         | 1251/14732 [06:00<38:37,  5.82it/s]   

{'loss': 1.4867, 'grad_norm': 11.492606163024902, 'learning_rate': 4.6090759978130124e-05, 'epoch': 0.08}


  9%|▉         | 1300/14732 [06:09<38:23,  5.83it/s]

{'loss': 1.7322, 'grad_norm': 5.666599750518799, 'learning_rate': 4.591990158556588e-05, 'epoch': 0.09}


                                                    
  9%|▉         | 1301/14732 [06:20<12:58:41,  3.48s/it]

{'eval_loss': 1.488692283630371, 'eval_runtime': 11.0192, 'eval_samples_per_second': 74.234, 'eval_steps_per_second': 74.234, 'epoch': 0.09}


  9%|▉         | 1351/14732 [06:28<38:27,  5.80it/s]   

{'loss': 1.5737, 'grad_norm': 7.267458915710449, 'learning_rate': 4.5749043193001644e-05, 'epoch': 0.09}


 10%|▉         | 1400/14732 [06:37<37:33,  5.92it/s]

{'loss': 1.6608, 'grad_norm': 16.044109344482422, 'learning_rate': 4.55781848004374e-05, 'epoch': 0.1}


                                                    
 10%|▉         | 1400/14732 [06:48<37:33,  5.92it/s]

{'eval_loss': 1.4898924827575684, 'eval_runtime': 11.2419, 'eval_samples_per_second': 72.764, 'eval_steps_per_second': 72.764, 'epoch': 0.1}


 10%|▉         | 1451/14732 [06:59<37:27,  5.91it/s]   

{'loss': 1.8069, 'grad_norm': 5.0879807472229, 'learning_rate': 4.540732640787316e-05, 'epoch': 0.1}


 10%|█         | 1500/14732 [07:07<37:26,  5.89it/s]

{'loss': 1.6509, 'grad_norm': 6.312069892883301, 'learning_rate': 4.523646801530891e-05, 'epoch': 0.1}


                                                    
 10%|█         | 1501/14732 [07:19<12:44:25,  3.47s/it]

{'eval_loss': 1.4745090007781982, 'eval_runtime': 10.981, 'eval_samples_per_second': 74.492, 'eval_steps_per_second': 74.492, 'epoch': 0.1}


 11%|█         | 1551/14732 [07:27<37:48,  5.81it/s]   

{'loss': 1.6546, 'grad_norm': 17.795499801635742, 'learning_rate': 4.506560962274467e-05, 'epoch': 0.11}


 11%|█         | 1600/14732 [07:36<38:34,  5.67it/s]

{'loss': 1.6017, 'grad_norm': 9.19310188293457, 'learning_rate': 4.4894751230180426e-05, 'epoch': 0.11}


                                                    
 11%|█         | 1600/14732 [07:47<38:34,  5.67it/s]

{'eval_loss': 1.4736695289611816, 'eval_runtime': 11.1618, 'eval_samples_per_second': 73.285, 'eval_steps_per_second': 73.285, 'epoch': 0.11}


 11%|█         | 1651/14732 [07:58<38:16,  5.70it/s]   

{'loss': 1.6378, 'grad_norm': 12.61184310913086, 'learning_rate': 4.472389283761619e-05, 'epoch': 0.11}


 12%|█▏        | 1700/14732 [08:07<38:05,  5.70it/s]

{'loss': 1.6557, 'grad_norm': 11.599056243896484, 'learning_rate': 4.4553034445051946e-05, 'epoch': 0.12}


                                                    
 12%|█▏        | 1701/14732 [08:18<12:37:37,  3.49s/it]

{'eval_loss': 1.4922646284103394, 'eval_runtime': 11.0355, 'eval_samples_per_second': 74.125, 'eval_steps_per_second': 74.125, 'epoch': 0.12}


 12%|█▏        | 1751/14732 [08:26<37:55,  5.71it/s]   

{'loss': 1.4367, 'grad_norm': 12.82433795928955, 'learning_rate': 4.43821760524877e-05, 'epoch': 0.12}


 12%|█▏        | 1800/14732 [08:35<35:34,  6.06it/s]

{'loss': 1.5667, 'grad_norm': 7.4361162185668945, 'learning_rate': 4.421131765992346e-05, 'epoch': 0.12}


                                                    
 12%|█▏        | 1800/14732 [08:45<35:34,  6.06it/s]

{'eval_loss': 1.4692106246948242, 'eval_runtime': 10.2671, 'eval_samples_per_second': 79.672, 'eval_steps_per_second': 79.672, 'epoch': 0.12}


 13%|█▎        | 1851/14732 [08:56<36:13,  5.93it/s]   

{'loss': 1.5661, 'grad_norm': 8.531088829040527, 'learning_rate': 4.4040459267359215e-05, 'epoch': 0.13}


 13%|█▎        | 1900/14732 [09:04<36:05,  5.93it/s]

{'loss': 1.5833, 'grad_norm': 5.144695281982422, 'learning_rate': 4.386960087479497e-05, 'epoch': 0.13}


                                                    
 13%|█▎        | 1901/14732 [09:15<11:48:35,  3.31s/it]

{'eval_loss': 1.4590020179748535, 'eval_runtime': 10.4804, 'eval_samples_per_second': 78.051, 'eval_steps_per_second': 78.051, 'epoch': 0.13}


 13%|█▎        | 1951/14732 [09:23<36:13,  5.88it/s]   

{'loss': 1.7035, 'grad_norm': 12.5563383102417, 'learning_rate': 4.369874248223073e-05, 'epoch': 0.13}


 14%|█▎        | 2000/14732 [09:31<36:14,  5.86it/s]

{'loss': 1.5341, 'grad_norm': 9.48702335357666, 'learning_rate': 4.3527884089666484e-05, 'epoch': 0.14}


                                                    
 14%|█▎        | 2000/14732 [09:42<36:14,  5.86it/s]

{'eval_loss': 1.4593133926391602, 'eval_runtime': 10.6251, 'eval_samples_per_second': 76.987, 'eval_steps_per_second': 76.987, 'epoch': 0.14}


 14%|█▍        | 2051/14732 [09:53<37:28,  5.64it/s]   

{'loss': 1.6355, 'grad_norm': 3.944331169128418, 'learning_rate': 4.335702569710224e-05, 'epoch': 0.14}


 14%|█▍        | 2100/14732 [10:01<36:19,  5.80it/s]

{'loss': 1.6224, 'grad_norm': 7.916934013366699, 'learning_rate': 4.3186167304538e-05, 'epoch': 0.14}


                                                    
 14%|█▍        | 2101/14732 [10:11<11:22:42,  3.24s/it]

{'eval_loss': 1.456167221069336, 'eval_runtime': 10.2334, 'eval_samples_per_second': 79.934, 'eval_steps_per_second': 79.934, 'epoch': 0.14}


 15%|█▍        | 2151/14732 [10:20<34:49,  6.02it/s]   

{'loss': 1.5537, 'grad_norm': 7.454837322235107, 'learning_rate': 4.3015308911973754e-05, 'epoch': 0.15}


 15%|█▍        | 2200/14732 [10:28<34:59,  5.97it/s]

{'loss': 1.7567, 'grad_norm': 10.035154342651367, 'learning_rate': 4.284445051940952e-05, 'epoch': 0.15}


                                                    
 15%|█▍        | 2200/14732 [10:38<34:59,  5.97it/s]

{'eval_loss': 1.4532725811004639, 'eval_runtime': 10.4063, 'eval_samples_per_second': 78.607, 'eval_steps_per_second': 78.607, 'epoch': 0.15}


 15%|█▌        | 2251/14732 [10:49<34:58,  5.95it/s]   

{'loss': 1.6498, 'grad_norm': 4.944429874420166, 'learning_rate': 4.267359212684527e-05, 'epoch': 0.15}


 16%|█▌        | 2300/14732 [10:57<34:52,  5.94it/s]

{'loss': 1.5066, 'grad_norm': 6.837541580200195, 'learning_rate': 4.250273373428103e-05, 'epoch': 0.16}


                                                    
 16%|█▌        | 2301/14732 [11:08<11:32:29,  3.34s/it]

{'eval_loss': 1.4518886804580688, 'eval_runtime': 10.5699, 'eval_samples_per_second': 77.39, 'eval_steps_per_second': 77.39, 'epoch': 0.16}


 16%|█▌        | 2351/14732 [11:16<34:05,  6.05it/s]   

{'loss': 1.6376, 'grad_norm': 7.533262252807617, 'learning_rate': 4.2331875341716786e-05, 'epoch': 0.16}


 16%|█▋        | 2400/14732 [11:25<34:28,  5.96it/s]

{'loss': 1.5965, 'grad_norm': 8.56042766571045, 'learning_rate': 4.216101694915254e-05, 'epoch': 0.16}


                                                    
 16%|█▋        | 2400/14732 [11:35<34:28,  5.96it/s]

{'eval_loss': 1.4505373239517212, 'eval_runtime': 10.164, 'eval_samples_per_second': 80.48, 'eval_steps_per_second': 80.48, 'epoch': 0.16}


 17%|█▋        | 2451/14732 [11:45<35:59,  5.69it/s]   

{'loss': 1.618, 'grad_norm': 11.584467887878418, 'learning_rate': 4.19901585565883e-05, 'epoch': 0.17}


 17%|█▋        | 2500/14732 [11:54<34:39,  5.88it/s]

{'loss': 1.5581, 'grad_norm': 6.982281684875488, 'learning_rate': 4.181930016402406e-05, 'epoch': 0.17}


                                                    
 17%|█▋        | 2501/14732 [12:04<11:11:45,  3.30s/it]

{'eval_loss': 1.4484531879425049, 'eval_runtime': 10.4148, 'eval_samples_per_second': 78.542, 'eval_steps_per_second': 78.542, 'epoch': 0.17}


 17%|█▋        | 2551/14732 [12:13<34:26,  5.89it/s]   

{'loss': 1.7048, 'grad_norm': 10.736274719238281, 'learning_rate': 4.164844177145982e-05, 'epoch': 0.17}


 18%|█▊        | 2600/14732 [12:21<33:14,  6.08it/s]

{'loss': 1.5259, 'grad_norm': 8.725205421447754, 'learning_rate': 4.1477583378895575e-05, 'epoch': 0.18}


                                                    
 18%|█▊        | 2600/14732 [12:31<33:14,  6.08it/s]

{'eval_loss': 1.4550443887710571, 'eval_runtime': 10.0816, 'eval_samples_per_second': 81.138, 'eval_steps_per_second': 81.138, 'epoch': 0.18}


 18%|█▊        | 2651/14732 [12:42<33:52,  5.94it/s]   

{'loss': 1.6613, 'grad_norm': 14.671867370605469, 'learning_rate': 4.130672498633133e-05, 'epoch': 0.18}


 18%|█▊        | 2700/14732 [12:50<34:17,  5.85it/s]

{'loss': 1.6552, 'grad_norm': 6.5807085037231445, 'learning_rate': 4.113586659376709e-05, 'epoch': 0.18}


                                                    
 18%|█▊        | 2701/14732 [13:01<11:35:04,  3.47s/it]

{'eval_loss': 1.4441858530044556, 'eval_runtime': 10.9762, 'eval_samples_per_second': 74.525, 'eval_steps_per_second': 74.525, 'epoch': 0.18}


 19%|█▊        | 2751/14732 [13:10<34:37,  5.77it/s]   

{'loss': 1.4223, 'grad_norm': 5.026063919067383, 'learning_rate': 4.0965008201202844e-05, 'epoch': 0.19}


 19%|█▉        | 2800/14732 [13:18<32:39,  6.09it/s]

{'loss': 1.5606, 'grad_norm': 10.303850173950195, 'learning_rate': 4.07941498086386e-05, 'epoch': 0.19}


                                                    
 19%|█▉        | 2800/14732 [13:28<32:39,  6.09it/s]

{'eval_loss': 1.437418818473816, 'eval_runtime': 10.3852, 'eval_samples_per_second': 78.766, 'eval_steps_per_second': 78.766, 'epoch': 0.19}


 19%|█▉        | 2851/14732 [13:39<34:15,  5.78it/s]   

{'loss': 1.5585, 'grad_norm': 5.786732196807861, 'learning_rate': 4.062329141607436e-05, 'epoch': 0.19}


 20%|█▉        | 2900/14732 [13:47<33:25,  5.90it/s]

{'loss': 1.5951, 'grad_norm': 17.402009963989258, 'learning_rate': 4.0452433023510114e-05, 'epoch': 0.2}


                                                    
 20%|█▉        | 2901/14732 [13:58<10:47:30,  3.28s/it]

{'eval_loss': 1.4291549921035767, 'eval_runtime': 10.3776, 'eval_samples_per_second': 78.823, 'eval_steps_per_second': 78.823, 'epoch': 0.2}


 20%|██        | 2951/14732 [14:06<33:26,  5.87it/s]   

{'loss': 1.3864, 'grad_norm': 3.888143301010132, 'learning_rate': 4.028157463094587e-05, 'epoch': 0.2}


 20%|██        | 3000/14732 [14:15<32:04,  6.09it/s]

{'loss': 1.5635, 'grad_norm': 16.35315704345703, 'learning_rate': 4.011071623838163e-05, 'epoch': 0.2}


                                                    
 20%|██        | 3000/14732 [14:25<32:04,  6.09it/s]

{'eval_loss': 1.4327304363250732, 'eval_runtime': 10.5608, 'eval_samples_per_second': 77.456, 'eval_steps_per_second': 77.456, 'epoch': 0.2}


 21%|██        | 3051/14732 [14:36<32:52,  5.92it/s]   

{'loss': 1.6093, 'grad_norm': 8.353537559509277, 'learning_rate': 3.993985784581739e-05, 'epoch': 0.21}


 21%|██        | 3100/14732 [14:44<32:11,  6.02it/s]

{'loss': 1.6314, 'grad_norm': 9.276473999023438, 'learning_rate': 3.9768999453253146e-05, 'epoch': 0.21}


                                                    
 21%|██        | 3101/14732 [14:55<10:21:52,  3.21s/it]

{'eval_loss': 1.426613688468933, 'eval_runtime': 10.1237, 'eval_samples_per_second': 80.801, 'eval_steps_per_second': 80.801, 'epoch': 0.21}


 21%|██▏       | 3151/14732 [15:03<31:51,  6.06it/s]   

{'loss': 1.6675, 'grad_norm': 5.933454513549805, 'learning_rate': 3.95981410606889e-05, 'epoch': 0.21}


 22%|██▏       | 3200/14732 [15:11<32:00,  6.01it/s]

{'loss': 1.5188, 'grad_norm': 10.874310493469238, 'learning_rate': 3.942728266812466e-05, 'epoch': 0.22}


                                                    
 22%|██▏       | 3200/14732 [15:22<32:00,  6.01it/s]

{'eval_loss': 1.427061915397644, 'eval_runtime': 10.2507, 'eval_samples_per_second': 79.799, 'eval_steps_per_second': 79.799, 'epoch': 0.22}


 22%|██▏       | 3251/14732 [15:32<32:37,  5.87it/s]   

{'loss': 1.6149, 'grad_norm': 5.782668590545654, 'learning_rate': 3.9256424275560416e-05, 'epoch': 0.22}


 22%|██▏       | 3300/14732 [15:41<31:37,  6.02it/s]

{'loss': 1.5164, 'grad_norm': 5.8126139640808105, 'learning_rate': 3.908556588299617e-05, 'epoch': 0.22}


                                                    
 22%|██▏       | 3301/14732 [15:51<10:16:57,  3.24s/it]

{'eval_loss': 1.4273738861083984, 'eval_runtime': 10.2358, 'eval_samples_per_second': 79.916, 'eval_steps_per_second': 79.916, 'epoch': 0.22}


 23%|██▎       | 3351/14732 [15:59<31:37,  6.00it/s]   

{'loss': 1.5224, 'grad_norm': 6.716641426086426, 'learning_rate': 3.8914707490431935e-05, 'epoch': 0.23}


 23%|██▎       | 3400/14732 [16:08<31:25,  6.01it/s]

{'loss': 1.6541, 'grad_norm': 5.280689716339111, 'learning_rate': 3.874384909786769e-05, 'epoch': 0.23}


                                                    
 23%|██▎       | 3400/14732 [16:18<31:25,  6.01it/s]

{'eval_loss': 1.4332294464111328, 'eval_runtime': 10.0945, 'eval_samples_per_second': 81.034, 'eval_steps_per_second': 81.034, 'epoch': 0.23}


 23%|██▎       | 3451/14732 [16:28<31:47,  5.91it/s]   

{'loss': 1.3306, 'grad_norm': 5.226157188415527, 'learning_rate': 3.857299070530345e-05, 'epoch': 0.23}


 24%|██▍       | 3500/14732 [16:36<31:07,  6.02it/s]

{'loss': 1.4578, 'grad_norm': 8.2465181350708, 'learning_rate': 3.8402132312739205e-05, 'epoch': 0.24}


                                                    
 24%|██▍       | 3501/14732 [16:46<9:45:20,  3.13s/it]

{'eval_loss': 1.4297674894332886, 'eval_runtime': 9.8609, 'eval_samples_per_second': 82.954, 'eval_steps_per_second': 82.954, 'epoch': 0.24}


 24%|██▍       | 3551/14732 [16:55<30:53,  6.03it/s]  

{'loss': 1.5593, 'grad_norm': 9.748558044433594, 'learning_rate': 3.823127392017496e-05, 'epoch': 0.24}


 24%|██▍       | 3600/14732 [17:03<31:07,  5.96it/s]

{'loss': 1.5225, 'grad_norm': 5.31842041015625, 'learning_rate': 3.806041552761072e-05, 'epoch': 0.24}


                                                    
 24%|██▍       | 3600/14732 [17:13<31:07,  5.96it/s]

{'eval_loss': 1.4229366779327393, 'eval_runtime': 10.684, 'eval_samples_per_second': 76.563, 'eval_steps_per_second': 76.563, 'epoch': 0.24}


 25%|██▍       | 3651/14732 [17:24<31:09,  5.93it/s]   

{'loss': 1.7499, 'grad_norm': 13.65909194946289, 'learning_rate': 3.7889557135046474e-05, 'epoch': 0.25}


 25%|██▌       | 3700/14732 [17:32<31:13,  5.89it/s]

{'loss': 1.5458, 'grad_norm': 7.556812286376953, 'learning_rate': 3.771869874248223e-05, 'epoch': 0.25}


                                                    
 25%|██▌       | 3701/14732 [17:43<10:19:48,  3.37s/it]

{'eval_loss': 1.4187390804290771, 'eval_runtime': 10.6641, 'eval_samples_per_second': 76.706, 'eval_steps_per_second': 76.706, 'epoch': 0.25}


 25%|██▌       | 3751/14732 [17:51<30:51,  5.93it/s]   

{'loss': 1.5713, 'grad_norm': 9.13346004486084, 'learning_rate': 3.754784034991799e-05, 'epoch': 0.25}


 26%|██▌       | 3800/14732 [18:00<31:45,  5.74it/s]

{'loss': 1.6909, 'grad_norm': 8.21447467803955, 'learning_rate': 3.737698195735374e-05, 'epoch': 0.26}


                                                    
 26%|██▌       | 3800/14732 [18:11<31:45,  5.74it/s]

{'eval_loss': 1.4183800220489502, 'eval_runtime': 10.7575, 'eval_samples_per_second': 76.04, 'eval_steps_per_second': 76.04, 'epoch': 0.26}


 26%|██▌       | 3851/14732 [18:21<31:22,  5.78it/s]   

{'loss': 1.5223, 'grad_norm': 4.579014301300049, 'learning_rate': 3.72061235647895e-05, 'epoch': 0.26}


 26%|██▋       | 3900/14732 [18:30<31:06,  5.80it/s]

{'loss': 1.5537, 'grad_norm': 5.231040000915527, 'learning_rate': 3.703526517222526e-05, 'epoch': 0.26}


                                                    
 26%|██▋       | 3901/14732 [18:41<10:22:07,  3.45s/it]

{'eval_loss': 1.4078214168548584, 'eval_runtime': 10.9082, 'eval_samples_per_second': 74.99, 'eval_steps_per_second': 74.99, 'epoch': 0.26}


 27%|██▋       | 3951/14732 [18:49<30:57,  5.80it/s]   

{'loss': 1.4806, 'grad_norm': 4.947532653808594, 'learning_rate': 3.686440677966102e-05, 'epoch': 0.27}


 27%|██▋       | 4000/14732 [18:57<30:15,  5.91it/s]

{'loss': 1.5242, 'grad_norm': 10.137144088745117, 'learning_rate': 3.6693548387096776e-05, 'epoch': 0.27}


                                                    
 27%|██▋       | 4000/14732 [19:08<30:15,  5.91it/s]

{'eval_loss': 1.4193446636199951, 'eval_runtime': 10.4105, 'eval_samples_per_second': 78.574, 'eval_steps_per_second': 78.574, 'epoch': 0.27}


 27%|██▋       | 4051/14732 [19:18<29:57,  5.94it/s]   

{'loss': 1.6123, 'grad_norm': 6.175423622131348, 'learning_rate': 3.652268999453253e-05, 'epoch': 0.27}


 28%|██▊       | 4100/14732 [19:27<30:00,  5.90it/s]

{'loss': 1.4901, 'grad_norm': 10.768360137939453, 'learning_rate': 3.635183160196829e-05, 'epoch': 0.28}


                                                    
 28%|██▊       | 4101/14732 [19:38<10:09:19,  3.44s/it]

{'eval_loss': 1.414007306098938, 'eval_runtime': 10.8931, 'eval_samples_per_second': 75.094, 'eval_steps_per_second': 75.094, 'epoch': 0.28}


 28%|██▊       | 4151/14732 [19:46<29:53,  5.90it/s]   

{'loss': 1.6258, 'grad_norm': 6.9233784675598145, 'learning_rate': 3.6180973209404045e-05, 'epoch': 0.28}


 29%|██▊       | 4200/14732 [19:54<30:11,  5.81it/s]

{'loss': 1.4311, 'grad_norm': 5.380087852478027, 'learning_rate': 3.601011481683981e-05, 'epoch': 0.29}


                                                    
 29%|██▊       | 4200/14732 [20:05<30:11,  5.81it/s]

{'eval_loss': 1.415880560874939, 'eval_runtime': 10.3373, 'eval_samples_per_second': 79.131, 'eval_steps_per_second': 79.131, 'epoch': 0.29}


 29%|██▉       | 4251/14732 [20:16<30:18,  5.76it/s]   

{'loss': 1.4898, 'grad_norm': 5.75232458114624, 'learning_rate': 3.5839256424275565e-05, 'epoch': 0.29}


 29%|██▉       | 4300/14732 [20:24<29:12,  5.95it/s]

{'loss': 1.433, 'grad_norm': 11.828474998474121, 'learning_rate': 3.566839803171132e-05, 'epoch': 0.29}


                                                    
 29%|██▉       | 4301/14732 [20:35<9:30:53,  3.28s/it]

{'eval_loss': 1.4071227312088013, 'eval_runtime': 10.3799, 'eval_samples_per_second': 78.806, 'eval_steps_per_second': 78.806, 'epoch': 0.29}


 30%|██▉       | 4351/14732 [20:43<28:46,  6.01it/s]  

{'loss': 1.5055, 'grad_norm': 5.227931022644043, 'learning_rate': 3.549753963914708e-05, 'epoch': 0.3}


 30%|██▉       | 4400/14732 [20:51<28:56,  5.95it/s]

{'loss': 1.4776, 'grad_norm': 6.65316915512085, 'learning_rate': 3.5326681246582834e-05, 'epoch': 0.3}


                                                    
 30%|██▉       | 4400/14732 [21:02<28:56,  5.95it/s]

{'eval_loss': 1.4145311117172241, 'eval_runtime': 10.7298, 'eval_samples_per_second': 76.236, 'eval_steps_per_second': 76.236, 'epoch': 0.3}


 30%|███       | 4451/14732 [21:13<28:55,  5.92it/s]   

{'loss': 1.4936, 'grad_norm': 7.928944110870361, 'learning_rate': 3.515582285401859e-05, 'epoch': 0.3}


 31%|███       | 4500/14732 [21:21<28:31,  5.98it/s]

{'loss': 1.5417, 'grad_norm': 6.357004165649414, 'learning_rate': 3.4984964461454354e-05, 'epoch': 0.31}


                                                    
 31%|███       | 4501/14732 [21:31<9:16:48,  3.27s/it]

{'eval_loss': 1.406744122505188, 'eval_runtime': 10.3232, 'eval_samples_per_second': 79.239, 'eval_steps_per_second': 79.239, 'epoch': 0.31}


 31%|███       | 4551/14732 [21:40<28:46,  5.90it/s]  

{'loss': 1.612, 'grad_norm': 8.902140617370605, 'learning_rate': 3.4814106068890103e-05, 'epoch': 0.31}


 31%|███       | 4600/14732 [21:48<28:45,  5.87it/s]

{'loss': 1.5764, 'grad_norm': 5.392091274261475, 'learning_rate': 3.464324767632586e-05, 'epoch': 0.31}


                                                    
 31%|███       | 4600/14732 [21:59<28:45,  5.87it/s]

{'eval_loss': 1.4102201461791992, 'eval_runtime': 10.4026, 'eval_samples_per_second': 78.635, 'eval_steps_per_second': 78.635, 'epoch': 0.31}


 32%|███▏      | 4651/14732 [22:09<27:51,  6.03it/s]   

{'loss': 1.4646, 'grad_norm': 12.946717262268066, 'learning_rate': 3.4472389283761616e-05, 'epoch': 0.32}


 32%|███▏      | 4700/14732 [22:17<28:44,  5.82it/s]

{'loss': 1.5607, 'grad_norm': 7.865845203399658, 'learning_rate': 3.430153089119737e-05, 'epoch': 0.32}


                                                    
 32%|███▏      | 4701/14732 [22:28<9:02:42,  3.25s/it]

{'eval_loss': 1.407075047492981, 'eval_runtime': 10.2464, 'eval_samples_per_second': 79.833, 'eval_steps_per_second': 79.833, 'epoch': 0.32}


 32%|███▏      | 4751/14732 [22:36<28:06,  5.92it/s]  

{'loss': 1.551, 'grad_norm': 11.322683334350586, 'learning_rate': 3.4130672498633136e-05, 'epoch': 0.32}


 33%|███▎      | 4800/14732 [22:44<27:51,  5.94it/s]

{'loss': 1.4238, 'grad_norm': 4.71055269241333, 'learning_rate': 3.3963231273920174e-05, 'epoch': 0.33}


                                                    
 33%|███▎      | 4800/14732 [22:55<27:51,  5.94it/s]

{'eval_loss': 1.4042096138000488, 'eval_runtime': 10.7038, 'eval_samples_per_second': 76.421, 'eval_steps_per_second': 76.421, 'epoch': 0.33}


 33%|███▎      | 4851/14732 [23:06<29:57,  5.50it/s]   

{'loss': 1.5471, 'grad_norm': 10.82568359375, 'learning_rate': 3.379237288135593e-05, 'epoch': 0.33}


 33%|███▎      | 4900/14732 [23:15<28:19,  5.79it/s]

{'loss': 1.693, 'grad_norm': 4.811676502227783, 'learning_rate': 3.362151448879169e-05, 'epoch': 0.33}


                                                    
 33%|███▎      | 4901/14732 [23:26<9:20:50,  3.42s/it]

{'eval_loss': 1.4018336534500122, 'eval_runtime': 10.8278, 'eval_samples_per_second': 75.546, 'eval_steps_per_second': 75.546, 'epoch': 0.33}


 34%|███▎      | 4951/14732 [23:34<28:30,  5.72it/s]  

{'loss': 1.6893, 'grad_norm': 4.34696626663208, 'learning_rate': 3.345065609622745e-05, 'epoch': 0.34}


 34%|███▍      | 5000/14732 [23:43<24:27,  6.63it/s]

{'loss': 1.9185, 'grad_norm': 6.119470596313477, 'learning_rate': 3.328321487151449e-05, 'epoch': 0.34}


                                                    
 34%|███▍      | 5000/14732 [23:53<24:27,  6.63it/s]

{'eval_loss': 1.3975814580917358, 'eval_runtime': 10.4759, 'eval_samples_per_second': 78.084, 'eval_steps_per_second': 78.084, 'epoch': 0.34}


 34%|███▍      | 5051/14732 [24:04<27:06,  5.95it/s]  

{'loss': 1.6958, 'grad_norm': 5.741353511810303, 'learning_rate': 3.3112356478950245e-05, 'epoch': 0.34}


 35%|███▍      | 5100/14732 [24:12<27:34,  5.82it/s]

{'loss': 1.5363, 'grad_norm': 6.716649532318115, 'learning_rate': 3.2941498086386e-05, 'epoch': 0.35}


                                                    
 35%|███▍      | 5101/14732 [24:23<8:49:43,  3.30s/it]

{'eval_loss': 1.4079103469848633, 'eval_runtime': 10.427, 'eval_samples_per_second': 78.45, 'eval_steps_per_second': 78.45, 'epoch': 0.35}


 35%|███▍      | 5151/14732 [24:31<26:53,  5.94it/s]  

{'loss': 1.6062, 'grad_norm': 5.846882343292236, 'learning_rate': 3.2770639693821764e-05, 'epoch': 0.35}


 35%|███▌      | 5200/14732 [24:40<26:32,  5.99it/s]

{'loss': 1.6776, 'grad_norm': 5.77061128616333, 'learning_rate': 3.259978130125752e-05, 'epoch': 0.35}


                                                    
 35%|███▌      | 5200/14732 [24:50<26:32,  5.99it/s]

{'eval_loss': 1.3960001468658447, 'eval_runtime': 10.4181, 'eval_samples_per_second': 78.517, 'eval_steps_per_second': 78.517, 'epoch': 0.35}


 36%|███▌      | 5251/14732 [25:01<26:44,  5.91it/s]   

{'loss': 1.5888, 'grad_norm': 8.092183113098145, 'learning_rate': 3.242892290869328e-05, 'epoch': 0.36}


 36%|███▌      | 5300/14732 [25:09<27:12,  5.78it/s]

{'loss': 1.4838, 'grad_norm': 6.129683971405029, 'learning_rate': 3.2258064516129034e-05, 'epoch': 0.36}


                                                    
 36%|███▌      | 5301/14732 [25:20<8:24:21,  3.21s/it]

{'eval_loss': 1.39592444896698, 'eval_runtime': 10.1129, 'eval_samples_per_second': 80.887, 'eval_steps_per_second': 80.887, 'epoch': 0.36}


 36%|███▋      | 5351/14732 [25:28<25:56,  6.03it/s]  

{'loss': 1.5514, 'grad_norm': 7.484516143798828, 'learning_rate': 3.208720612356479e-05, 'epoch': 0.36}


 37%|███▋      | 5400/14732 [25:36<25:54,  6.00it/s]

{'loss': 1.5458, 'grad_norm': 14.789386749267578, 'learning_rate': 3.191634773100055e-05, 'epoch': 0.37}


                                                    
 37%|███▋      | 5400/14732 [25:46<25:54,  6.00it/s]

{'eval_loss': 1.394863247871399, 'eval_runtime': 10.2192, 'eval_samples_per_second': 80.045, 'eval_steps_per_second': 80.045, 'epoch': 0.37}


 37%|███▋      | 5451/14732 [25:57<26:23,  5.86it/s]  

{'loss': 1.5694, 'grad_norm': 6.739391803741455, 'learning_rate': 3.174548933843631e-05, 'epoch': 0.37}


 37%|███▋      | 5500/14732 [26:05<25:58,  5.92it/s]

{'loss': 1.5898, 'grad_norm': 10.652405738830566, 'learning_rate': 3.1574630945872066e-05, 'epoch': 0.37}


                                                    
 37%|███▋      | 5501/14732 [26:16<8:24:52,  3.28s/it]

{'eval_loss': 1.3936388492584229, 'eval_runtime': 10.3749, 'eval_samples_per_second': 78.844, 'eval_steps_per_second': 78.844, 'epoch': 0.37}


 38%|███▊      | 5551/14732 [26:24<24:35,  6.22it/s]  

{'loss': 1.4936, 'grad_norm': 5.029247760772705, 'learning_rate': 3.1407189721159104e-05, 'epoch': 0.38}


 38%|███▊      | 5600/14732 [26:32<25:40,  5.93it/s]

{'loss': 1.5387, 'grad_norm': 4.074696063995361, 'learning_rate': 3.123633132859487e-05, 'epoch': 0.38}


                                                    
 38%|███▊      | 5600/14732 [26:43<25:40,  5.93it/s]

{'eval_loss': 1.3902921676635742, 'eval_runtime': 10.4672, 'eval_samples_per_second': 78.149, 'eval_steps_per_second': 78.149, 'epoch': 0.38}


 38%|███▊      | 5651/14732 [26:54<26:39,  5.68it/s]   

{'loss': 1.4479, 'grad_norm': 6.414786338806152, 'learning_rate': 3.1065472936030624e-05, 'epoch': 0.38}


 39%|███▊      | 5700/14732 [27:03<25:54,  5.81it/s]

{'loss': 1.5058, 'grad_norm': 8.424344062805176, 'learning_rate': 3.089461454346638e-05, 'epoch': 0.39}


                                                    
 39%|███▊      | 5701/14732 [27:14<8:39:13,  3.45s/it]

{'eval_loss': 1.3904051780700684, 'eval_runtime': 10.9144, 'eval_samples_per_second': 74.947, 'eval_steps_per_second': 74.947, 'epoch': 0.39}


 39%|███▉      | 5751/14732 [27:23<25:52,  5.79it/s]  

{'loss': 1.6364, 'grad_norm': 5.739473819732666, 'learning_rate': 3.072375615090213e-05, 'epoch': 0.39}


 39%|███▉      | 5800/14732 [27:31<25:42,  5.79it/s]

{'loss': 1.4049, 'grad_norm': 10.630772590637207, 'learning_rate': 3.0552897758337886e-05, 'epoch': 0.39}


                                                    
 39%|███▉      | 5800/14732 [27:42<25:42,  5.79it/s]

{'eval_loss': 1.3879104852676392, 'eval_runtime': 10.7363, 'eval_samples_per_second': 76.19, 'eval_steps_per_second': 76.19, 'epoch': 0.39}


 40%|███▉      | 5851/14732 [27:53<25:30,  5.80it/s]   

{'loss': 1.541, 'grad_norm': 16.367305755615234, 'learning_rate': 3.0382039365773646e-05, 'epoch': 0.4}


 40%|████      | 5900/14732 [28:01<25:38,  5.74it/s]

{'loss': 1.4543, 'grad_norm': 5.5453996658325195, 'learning_rate': 3.0211180973209406e-05, 'epoch': 0.4}


                                                    
 40%|████      | 5901/14732 [28:12<8:20:08,  3.40s/it]

{'eval_loss': 1.394425630569458, 'eval_runtime': 10.7407, 'eval_samples_per_second': 76.159, 'eval_steps_per_second': 76.159, 'epoch': 0.4}


 40%|████      | 5951/14732 [28:21<25:20,  5.78it/s]  

{'loss': 1.5548, 'grad_norm': 5.145216941833496, 'learning_rate': 3.0040322580645162e-05, 'epoch': 0.4}


 41%|████      | 6000/14732 [28:29<24:54,  5.84it/s]

{'loss': 1.4166, 'grad_norm': 12.376266479492188, 'learning_rate': 2.986946418808092e-05, 'epoch': 0.41}


                                                    
 41%|████      | 6000/14732 [28:40<24:54,  5.84it/s]

{'eval_loss': 1.3977315425872803, 'eval_runtime': 10.6835, 'eval_samples_per_second': 76.567, 'eval_steps_per_second': 76.567, 'epoch': 0.41}


 41%|████      | 6051/14732 [28:51<24:55,  5.80it/s]  

{'loss': 1.6058, 'grad_norm': 7.546597957611084, 'learning_rate': 2.9698605795516675e-05, 'epoch': 0.41}


 41%|████▏     | 6100/14732 [28:59<24:35,  5.85it/s]

{'loss': 1.5927, 'grad_norm': 8.330795288085938, 'learning_rate': 2.9527747402952432e-05, 'epoch': 0.41}


                                                    
 41%|████▏     | 6101/14732 [29:10<8:11:10,  3.41s/it]

{'eval_loss': 1.3951719999313354, 'eval_runtime': 10.8074, 'eval_samples_per_second': 75.689, 'eval_steps_per_second': 75.689, 'epoch': 0.41}


 42%|████▏     | 6151/14732 [29:19<23:57,  5.97it/s]  

{'loss': 1.7119, 'grad_norm': 8.987910270690918, 'learning_rate': 2.9356889010388195e-05, 'epoch': 0.42}


 42%|████▏     | 6200/14732 [29:27<23:37,  6.02it/s]

{'loss': 1.4524, 'grad_norm': 9.005112648010254, 'learning_rate': 2.918603061782395e-05, 'epoch': 0.42}


                                                    
 42%|████▏     | 6200/14732 [29:37<23:37,  6.02it/s]

{'eval_loss': 1.3920053243637085, 'eval_runtime': 10.3875, 'eval_samples_per_second': 78.748, 'eval_steps_per_second': 78.748, 'epoch': 0.42}


 42%|████▏     | 6251/14732 [29:48<23:12,  6.09it/s]  

{'loss': 1.5173, 'grad_norm': 12.863531112670898, 'learning_rate': 2.9015172225259708e-05, 'epoch': 0.42}


 43%|████▎     | 6300/14732 [29:56<23:40,  5.94it/s]

{'loss': 1.516, 'grad_norm': 9.510525703430176, 'learning_rate': 2.8844313832695464e-05, 'epoch': 0.43}


                                                    
 43%|████▎     | 6301/14732 [30:07<8:01:22,  3.43s/it]

{'eval_loss': 1.3814780712127686, 'eval_runtime': 10.8532, 'eval_samples_per_second': 75.369, 'eval_steps_per_second': 75.369, 'epoch': 0.43}


 43%|████▎     | 6351/14732 [30:16<23:43,  5.89it/s]  

{'loss': 1.5723, 'grad_norm': 4.832693099975586, 'learning_rate': 2.867345544013122e-05, 'epoch': 0.43}


 43%|████▎     | 6400/14732 [30:24<22:58,  6.05it/s]

{'loss': 1.4446, 'grad_norm': 12.126529693603516, 'learning_rate': 2.8502597047566977e-05, 'epoch': 0.43}


                                                    
 43%|████▎     | 6400/14732 [30:34<22:58,  6.05it/s]

{'eval_loss': 1.3819104433059692, 'eval_runtime': 10.1491, 'eval_samples_per_second': 80.598, 'eval_steps_per_second': 80.598, 'epoch': 0.43}


 44%|████▍     | 6451/14732 [30:45<23:36,  5.84it/s]  

{'loss': 1.4747, 'grad_norm': 6.082335948944092, 'learning_rate': 2.8331738655002737e-05, 'epoch': 0.44}


 44%|████▍     | 6500/14732 [30:54<23:30,  5.84it/s]

{'loss': 1.4022, 'grad_norm': 6.53870153427124, 'learning_rate': 2.8160880262438493e-05, 'epoch': 0.44}


                                                    
 44%|████▍     | 6501/14732 [31:05<8:04:12,  3.53s/it]

{'eval_loss': 1.3834974765777588, 'eval_runtime': 11.1871, 'eval_samples_per_second': 73.12, 'eval_steps_per_second': 73.12, 'epoch': 0.44}


 44%|████▍     | 6551/14732 [31:14<23:58,  5.69it/s]  

{'loss': 1.6248, 'grad_norm': 6.404665470123291, 'learning_rate': 2.799002186987425e-05, 'epoch': 0.44}


 45%|████▍     | 6600/14732 [31:22<23:10,  5.85it/s]

{'loss': 1.486, 'grad_norm': 4.6708664894104, 'learning_rate': 2.7819163477310006e-05, 'epoch': 0.45}


                                                    
 45%|████▍     | 6600/14732 [31:33<23:10,  5.85it/s]

{'eval_loss': 1.3817843198776245, 'eval_runtime': 10.6645, 'eval_samples_per_second': 76.703, 'eval_steps_per_second': 76.703, 'epoch': 0.45}


 45%|████▌     | 6651/14732 [31:44<23:50,  5.65it/s]  

{'loss': 1.4636, 'grad_norm': 4.228817462921143, 'learning_rate': 2.7648305084745763e-05, 'epoch': 0.45}


 45%|████▌     | 6700/14732 [31:52<22:47,  5.87it/s]

{'loss': 1.4369, 'grad_norm': 4.625975131988525, 'learning_rate': 2.747744669218152e-05, 'epoch': 0.45}


                                                    
 45%|████▌     | 6701/14732 [32:03<7:29:50,  3.36s/it]

{'eval_loss': 1.377915382385254, 'eval_runtime': 10.6286, 'eval_samples_per_second': 76.962, 'eval_steps_per_second': 76.962, 'epoch': 0.45}


 46%|████▌     | 6751/14732 [32:11<23:10,  5.74it/s]  

{'loss': 1.4494, 'grad_norm': 5.364345073699951, 'learning_rate': 2.730658829961728e-05, 'epoch': 0.46}


 46%|████▌     | 6800/14732 [32:20<22:42,  5.82it/s]

{'loss': 1.5654, 'grad_norm': 5.746093273162842, 'learning_rate': 2.7135729907053036e-05, 'epoch': 0.46}


                                                    
 46%|████▌     | 6800/14732 [32:31<22:42,  5.82it/s]

{'eval_loss': 1.3756434917449951, 'eval_runtime': 10.8529, 'eval_samples_per_second': 75.372, 'eval_steps_per_second': 75.372, 'epoch': 0.46}


 47%|████▋     | 6851/14732 [32:42<22:43,  5.78it/s]  

{'loss': 1.6903, 'grad_norm': 5.322749614715576, 'learning_rate': 2.6964871514488792e-05, 'epoch': 0.46}


 47%|████▋     | 6900/14732 [32:50<22:29,  5.80it/s]

{'loss': 1.6185, 'grad_norm': 4.3203582763671875, 'learning_rate': 2.679401312192455e-05, 'epoch': 0.47}


                                                    
 47%|████▋     | 6901/14732 [33:01<7:38:04,  3.51s/it]

{'eval_loss': 1.3700097799301147, 'eval_runtime': 11.1156, 'eval_samples_per_second': 73.59, 'eval_steps_per_second': 73.59, 'epoch': 0.47}


 47%|████▋     | 6951/14732 [33:10<22:02,  5.88it/s]  

{'loss': 1.4835, 'grad_norm': 8.373361587524414, 'learning_rate': 2.6623154729360305e-05, 'epoch': 0.47}


 48%|████▊     | 7000/14732 [33:18<22:04,  5.84it/s]

{'loss': 1.5153, 'grad_norm': 8.296738624572754, 'learning_rate': 2.6452296336796068e-05, 'epoch': 0.48}


                                                    
 48%|████▊     | 7000/14732 [33:29<22:04,  5.84it/s]

{'eval_loss': 1.3721208572387695, 'eval_runtime': 10.8901, 'eval_samples_per_second': 75.114, 'eval_steps_per_second': 75.114, 'epoch': 0.48}


 48%|████▊     | 7051/14732 [33:40<22:24,  5.71it/s]  

{'loss': 1.6104, 'grad_norm': 5.342876434326172, 'learning_rate': 2.6281437944231825e-05, 'epoch': 0.48}


 48%|████▊     | 7100/14732 [33:49<22:22,  5.69it/s]

{'loss': 1.4906, 'grad_norm': 6.786604404449463, 'learning_rate': 2.611057955166758e-05, 'epoch': 0.48}


                                                    
 48%|████▊     | 7101/14732 [34:00<7:32:36,  3.56s/it]

{'eval_loss': 1.3715282678604126, 'eval_runtime': 11.2689, 'eval_samples_per_second': 72.589, 'eval_steps_per_second': 72.589, 'epoch': 0.48}


 49%|████▊     | 7151/14732 [34:09<22:30,  5.61it/s]  

{'loss': 1.5418, 'grad_norm': 9.560026168823242, 'learning_rate': 2.5939721159103337e-05, 'epoch': 0.49}


 49%|████▉     | 7200/14732 [34:18<21:12,  5.92it/s]

{'loss': 1.4111, 'grad_norm': 5.690976619720459, 'learning_rate': 2.5768862766539094e-05, 'epoch': 0.49}


                                                    
 49%|████▉     | 7200/14732 [34:29<21:12,  5.92it/s]

{'eval_loss': 1.3732542991638184, 'eval_runtime': 10.9657, 'eval_samples_per_second': 74.596, 'eval_steps_per_second': 74.596, 'epoch': 0.49}


 49%|████▉     | 7251/14732 [34:40<21:48,  5.72it/s]  

{'loss': 1.5771, 'grad_norm': 7.95228910446167, 'learning_rate': 2.559800437397485e-05, 'epoch': 0.49}


 50%|████▉     | 7300/14732 [34:48<20:36,  6.01it/s]

{'loss': 1.3949, 'grad_norm': 4.39606237411499, 'learning_rate': 2.542714598141061e-05, 'epoch': 0.5}


                                                    
 50%|████▉     | 7301/14732 [35:00<7:18:40,  3.54s/it]

{'eval_loss': 1.3727549314498901, 'eval_runtime': 11.2418, 'eval_samples_per_second': 72.764, 'eval_steps_per_second': 72.764, 'epoch': 0.5}


 50%|████▉     | 7351/14732 [35:08<20:57,  5.87it/s]  

{'loss': 1.5836, 'grad_norm': 7.520824909210205, 'learning_rate': 2.5256287588846367e-05, 'epoch': 0.5}


 50%|█████     | 7400/14732 [35:17<21:49,  5.60it/s]

{'loss': 1.4339, 'grad_norm': 6.446773529052734, 'learning_rate': 2.5085429196282123e-05, 'epoch': 0.5}


                                                    
 50%|█████     | 7400/14732 [35:27<21:49,  5.60it/s]

{'eval_loss': 1.3687529563903809, 'eval_runtime': 10.8093, 'eval_samples_per_second': 75.675, 'eval_steps_per_second': 75.675, 'epoch': 0.5}


 51%|█████     | 7451/14732 [35:38<20:41,  5.87it/s]  

{'loss': 1.4397, 'grad_norm': 6.787688732147217, 'learning_rate': 2.4917987971569164e-05, 'epoch': 0.51}


 51%|█████     | 7500/14732 [35:46<20:43,  5.82it/s]

{'loss': 1.6072, 'grad_norm': 10.999531745910645, 'learning_rate': 2.474712957900492e-05, 'epoch': 0.51}


                                                    
 51%|█████     | 7501/14732 [35:57<6:50:59,  3.41s/it]

{'eval_loss': 1.3705943822860718, 'eval_runtime': 10.7866, 'eval_samples_per_second': 75.835, 'eval_steps_per_second': 75.835, 'epoch': 0.51}


 51%|█████▏    | 7551/14732 [36:06<21:03,  5.68it/s]  

{'loss': 1.4497, 'grad_norm': 5.280839443206787, 'learning_rate': 2.457627118644068e-05, 'epoch': 0.51}


 52%|█████▏    | 7600/14732 [36:15<20:13,  5.88it/s]

{'loss': 1.5139, 'grad_norm': 7.0220046043396, 'learning_rate': 2.4405412793876437e-05, 'epoch': 0.52}


                                                    
 52%|█████▏    | 7600/14732 [36:25<20:13,  5.88it/s]

{'eval_loss': 1.370314359664917, 'eval_runtime': 10.6097, 'eval_samples_per_second': 77.099, 'eval_steps_per_second': 77.099, 'epoch': 0.52}


 52%|█████▏    | 7651/14732 [36:36<20:35,  5.73it/s]  

{'loss': 1.5206, 'grad_norm': 4.9969706535339355, 'learning_rate': 2.4234554401312194e-05, 'epoch': 0.52}


 52%|█████▏    | 7700/14732 [36:45<19:44,  5.94it/s]

{'loss': 1.4603, 'grad_norm': 4.693324565887451, 'learning_rate': 2.4063696008747953e-05, 'epoch': 0.52}


                                                    
 52%|█████▏    | 7701/14732 [36:55<6:22:45,  3.27s/it]

{'eval_loss': 1.3692125082015991, 'eval_runtime': 10.3235, 'eval_samples_per_second': 79.237, 'eval_steps_per_second': 79.237, 'epoch': 0.52}


 53%|█████▎    | 7751/14732 [37:03<20:17,  5.73it/s]  

{'loss': 1.5043, 'grad_norm': 6.611298084259033, 'learning_rate': 2.3892837616183706e-05, 'epoch': 0.53}


 53%|█████▎    | 7800/14732 [37:12<19:17,  5.99it/s]

{'loss': 1.5263, 'grad_norm': 10.174145698547363, 'learning_rate': 2.3721979223619463e-05, 'epoch': 0.53}


                                                    
 53%|█████▎    | 7800/14732 [37:22<19:17,  5.99it/s]

{'eval_loss': 1.3669793605804443, 'eval_runtime': 10.6049, 'eval_samples_per_second': 77.134, 'eval_steps_per_second': 77.134, 'epoch': 0.53}


 53%|█████▎    | 7851/14732 [37:33<19:09,  5.99it/s]  

{'loss': 1.525, 'grad_norm': 9.167699813842773, 'learning_rate': 2.3551120831055223e-05, 'epoch': 0.53}


 54%|█████▎    | 7900/14732 [37:41<19:30,  5.84it/s]

{'loss': 1.6975, 'grad_norm': 5.436936378479004, 'learning_rate': 2.338026243849098e-05, 'epoch': 0.54}


                                                    
 54%|█████▎    | 7901/14732 [37:52<6:28:15,  3.41s/it]

{'eval_loss': 1.363503098487854, 'eval_runtime': 10.7927, 'eval_samples_per_second': 75.792, 'eval_steps_per_second': 75.792, 'epoch': 0.54}


 54%|█████▍    | 7951/14732 [38:01<19:04,  5.93it/s]  

{'loss': 1.5764, 'grad_norm': 5.115318775177002, 'learning_rate': 2.3209404045926736e-05, 'epoch': 0.54}


 54%|█████▍    | 8000/14732 [38:09<18:51,  5.95it/s]

{'loss': 1.3636, 'grad_norm': 9.0882568359375, 'learning_rate': 2.3038545653362495e-05, 'epoch': 0.54}


                                                    
 54%|█████▍    | 8000/14732 [38:20<18:51,  5.95it/s]

{'eval_loss': 1.3691602945327759, 'eval_runtime': 10.8207, 'eval_samples_per_second': 75.596, 'eval_steps_per_second': 75.596, 'epoch': 0.54}


 55%|█████▍    | 8051/14732 [38:31<18:52,  5.90it/s]  

{'loss': 1.4751, 'grad_norm': 7.2659735679626465, 'learning_rate': 2.2867687260798252e-05, 'epoch': 0.55}


 55%|█████▍    | 8100/14732 [38:39<18:56,  5.84it/s]

{'loss': 1.4729, 'grad_norm': 6.974119663238525, 'learning_rate': 2.270366320393658e-05, 'epoch': 0.55}


                                                    
 55%|█████▍    | 8101/14732 [38:50<6:24:27,  3.48s/it]

{'eval_loss': 1.3652606010437012, 'eval_runtime': 11.1813, 'eval_samples_per_second': 73.158, 'eval_steps_per_second': 73.158, 'epoch': 0.55}


 55%|█████▌    | 8151/14732 [38:59<19:02,  5.76it/s]  

{'loss': 1.496, 'grad_norm': 6.163193702697754, 'learning_rate': 2.2532804811372335e-05, 'epoch': 0.55}


 56%|█████▌    | 8200/14732 [39:08<18:35,  5.85it/s]

{'loss': 1.5306, 'grad_norm': 4.214131832122803, 'learning_rate': 2.2361946418808095e-05, 'epoch': 0.56}


                                                    
 56%|█████▌    | 8200/14732 [39:18<18:35,  5.85it/s]

{'eval_loss': 1.361916422843933, 'eval_runtime': 10.2852, 'eval_samples_per_second': 79.531, 'eval_steps_per_second': 79.531, 'epoch': 0.56}


 56%|█████▌    | 8251/14732 [39:29<18:03,  5.98it/s]  

{'loss': 1.4417, 'grad_norm': 17.050947189331055, 'learning_rate': 2.219108802624385e-05, 'epoch': 0.56}


 56%|█████▋    | 8300/14732 [39:37<17:50,  6.01it/s]

{'loss': 1.5267, 'grad_norm': 5.857585430145264, 'learning_rate': 2.2027063969382177e-05, 'epoch': 0.56}


                                                    
 56%|█████▋    | 8301/14732 [39:48<5:51:03,  3.28s/it]

{'eval_loss': 1.362062931060791, 'eval_runtime': 10.6146, 'eval_samples_per_second': 77.064, 'eval_steps_per_second': 77.064, 'epoch': 0.56}


 57%|█████▋    | 8351/14732 [39:56<17:39,  6.02it/s]  

{'loss': 1.5767, 'grad_norm': 7.542107582092285, 'learning_rate': 2.1856205576817934e-05, 'epoch': 0.57}


 57%|█████▋    | 8400/14732 [40:04<17:44,  5.95it/s]

{'loss': 1.5638, 'grad_norm': 8.936718940734863, 'learning_rate': 2.168534718425369e-05, 'epoch': 0.57}


                                                    
 57%|█████▋    | 8400/14732 [40:16<17:44,  5.95it/s]

{'eval_loss': 1.3616082668304443, 'eval_runtime': 11.168, 'eval_samples_per_second': 73.245, 'eval_steps_per_second': 73.245, 'epoch': 0.57}


 57%|█████▋    | 8451/14732 [40:26<17:45,  5.89it/s]  

{'loss': 1.3753, 'grad_norm': 8.405887603759766, 'learning_rate': 2.151448879168945e-05, 'epoch': 0.57}


 58%|█████▊    | 8500/14732 [40:35<17:45,  5.85it/s]

{'loss': 1.4399, 'grad_norm': 3.9184255599975586, 'learning_rate': 2.1343630399125207e-05, 'epoch': 0.58}


                                                    
 58%|█████▊    | 8501/14732 [40:46<6:07:51,  3.54s/it]

{'eval_loss': 1.3640505075454712, 'eval_runtime': 11.2265, 'eval_samples_per_second': 72.863, 'eval_steps_per_second': 72.863, 'epoch': 0.58}


 58%|█████▊    | 8551/14732 [40:55<18:02,  5.71it/s]  

{'loss': 1.4909, 'grad_norm': 8.547700881958008, 'learning_rate': 2.1172772006560963e-05, 'epoch': 0.58}


 58%|█████▊    | 8600/14732 [41:03<17:32,  5.82it/s]

{'loss': 1.327, 'grad_norm': 9.030593872070312, 'learning_rate': 2.1005330781848008e-05, 'epoch': 0.58}


                                                    
 58%|█████▊    | 8600/14732 [41:14<17:32,  5.82it/s]

{'eval_loss': 1.3636387586593628, 'eval_runtime': 10.8326, 'eval_samples_per_second': 75.513, 'eval_steps_per_second': 75.513, 'epoch': 0.58}


 59%|█████▊    | 8651/14732 [41:25<16:43,  6.06it/s]  

{'loss': 1.4787, 'grad_norm': 1.944970965385437, 'learning_rate': 2.083788955713505e-05, 'epoch': 0.59}


 59%|█████▉    | 8700/14732 [41:33<17:02,  5.90it/s]

{'loss': 1.5616, 'grad_norm': 7.358983993530273, 'learning_rate': 2.067044833242209e-05, 'epoch': 0.59}


                                                    
 59%|█████▉    | 8701/14732 [41:45<5:54:06,  3.52s/it]

{'eval_loss': 1.363389253616333, 'eval_runtime': 11.1659, 'eval_samples_per_second': 73.259, 'eval_steps_per_second': 73.259, 'epoch': 0.59}


 59%|█████▉    | 8751/14732 [41:53<17:10,  5.81it/s]  

{'loss': 1.6071, 'grad_norm': 6.632728099822998, 'learning_rate': 2.0499589939857847e-05, 'epoch': 0.59}


 60%|█████▉    | 8800/14732 [42:02<17:03,  5.80it/s]

{'loss': 1.5563, 'grad_norm': 3.459913969039917, 'learning_rate': 2.0328731547293604e-05, 'epoch': 0.6}


                                                    
 60%|█████▉    | 8800/14732 [42:13<17:03,  5.80it/s]

{'eval_loss': 1.3577849864959717, 'eval_runtime': 11.2324, 'eval_samples_per_second': 72.825, 'eval_steps_per_second': 72.825, 'epoch': 0.6}


 60%|██████    | 8851/14732 [42:24<17:23,  5.64it/s]  

{'loss': 1.4668, 'grad_norm': 7.705807685852051, 'learning_rate': 2.0157873154729364e-05, 'epoch': 0.6}


 60%|██████    | 8900/14732 [42:33<17:14,  5.64it/s]

{'loss': 1.6423, 'grad_norm': 3.628916025161743, 'learning_rate': 1.9987014762165117e-05, 'epoch': 0.6}


                                                    
 60%|██████    | 8901/14732 [42:44<5:41:26,  3.51s/it]

{'eval_loss': 1.3573126792907715, 'eval_runtime': 11.111, 'eval_samples_per_second': 73.621, 'eval_steps_per_second': 73.621, 'epoch': 0.6}


 61%|██████    | 8951/14732 [42:52<17:08,  5.62it/s]  

{'loss': 1.5224, 'grad_norm': 7.890390396118164, 'learning_rate': 1.9816156369600873e-05, 'epoch': 0.61}


 61%|██████    | 9000/14732 [43:01<16:11,  5.90it/s]

{'loss': 1.6483, 'grad_norm': 5.412892818450928, 'learning_rate': 1.9645297977036633e-05, 'epoch': 0.61}


                                                    
 61%|██████    | 9000/14732 [43:12<16:11,  5.90it/s]

{'eval_loss': 1.3567980527877808, 'eval_runtime': 11.1077, 'eval_samples_per_second': 73.643, 'eval_steps_per_second': 73.643, 'epoch': 0.61}


 61%|██████▏   | 9051/14732 [43:23<16:21,  5.79it/s]  

{'loss': 1.5128, 'grad_norm': 7.4686760902404785, 'learning_rate': 1.947443958447239e-05, 'epoch': 0.61}


 62%|██████▏   | 9100/14732 [43:32<16:04,  5.84it/s]

{'loss': 1.6524, 'grad_norm': 4.567707061767578, 'learning_rate': 1.9303581191908146e-05, 'epoch': 0.62}


                                                    
 62%|██████▏   | 9101/14732 [43:43<5:26:06,  3.47s/it]

{'eval_loss': 1.353794813156128, 'eval_runtime': 10.9993, 'eval_samples_per_second': 74.368, 'eval_steps_per_second': 74.368, 'epoch': 0.62}


 62%|██████▏   | 9151/14732 [43:51<15:26,  6.02it/s]  

{'loss': 1.5279, 'grad_norm': 6.395282745361328, 'learning_rate': 1.9132722799343906e-05, 'epoch': 0.62}


 62%|██████▏   | 9200/14732 [44:00<16:14,  5.68it/s]

{'loss': 1.6434, 'grad_norm': 6.021356105804443, 'learning_rate': 1.8961864406779662e-05, 'epoch': 0.62}


                                                    
 62%|██████▏   | 9200/14732 [44:11<16:14,  5.68it/s]

{'eval_loss': 1.3482210636138916, 'eval_runtime': 11.0773, 'eval_samples_per_second': 73.845, 'eval_steps_per_second': 73.845, 'epoch': 0.62}


 63%|██████▎   | 9251/14732 [44:22<15:33,  5.87it/s]  

{'loss': 1.4197, 'grad_norm': 3.7675113677978516, 'learning_rate': 1.8791006014215422e-05, 'epoch': 0.63}


 63%|██████▎   | 9300/14732 [44:30<15:46,  5.74it/s]

{'loss': 1.5597, 'grad_norm': 7.630548477172852, 'learning_rate': 1.8620147621651178e-05, 'epoch': 0.63}


                                                    
 63%|██████▎   | 9301/14732 [44:42<5:21:49,  3.56s/it]

{'eval_loss': 1.3541779518127441, 'eval_runtime': 11.2645, 'eval_samples_per_second': 72.617, 'eval_steps_per_second': 72.617, 'epoch': 0.63}


 63%|██████▎   | 9351/14732 [44:50<15:39,  5.73it/s]  

{'loss': 1.383, 'grad_norm': 4.900386333465576, 'learning_rate': 1.844928922908693e-05, 'epoch': 0.63}


 64%|██████▍   | 9400/14732 [44:59<15:10,  5.85it/s]

{'loss': 1.2822, 'grad_norm': 4.499915599822998, 'learning_rate': 1.827843083652269e-05, 'epoch': 0.64}


                                                    
 64%|██████▍   | 9400/14732 [45:10<15:10,  5.85it/s]

{'eval_loss': 1.3573496341705322, 'eval_runtime': 11.0955, 'eval_samples_per_second': 73.724, 'eval_steps_per_second': 73.724, 'epoch': 0.64}


 64%|██████▍   | 9451/14732 [45:21<15:03,  5.85it/s]  

{'loss': 1.4236, 'grad_norm': 4.148250579833984, 'learning_rate': 1.8107572443958448e-05, 'epoch': 0.64}


 64%|██████▍   | 9500/14732 [45:29<14:37,  5.96it/s]

{'loss': 1.7084, 'grad_norm': 3.2590036392211914, 'learning_rate': 1.7936714051394204e-05, 'epoch': 0.64}


                                                    
 64%|██████▍   | 9501/14732 [45:40<4:59:54,  3.44s/it]

{'eval_loss': 1.356772780418396, 'eval_runtime': 10.9033, 'eval_samples_per_second': 75.023, 'eval_steps_per_second': 75.023, 'epoch': 0.64}


 65%|██████▍   | 9551/14732 [45:49<14:53,  5.80it/s]  

{'loss': 1.43, 'grad_norm': 4.718110084533691, 'learning_rate': 1.7765855658829964e-05, 'epoch': 0.65}


 65%|██████▌   | 9600/14732 [45:57<14:26,  5.93it/s]

{'loss': 1.6338, 'grad_norm': 13.06055736541748, 'learning_rate': 1.7598414434117005e-05, 'epoch': 0.65}


                                                    
 65%|██████▌   | 9600/14732 [46:09<14:26,  5.93it/s]

{'eval_loss': 1.3591325283050537, 'eval_runtime': 11.5021, 'eval_samples_per_second': 71.117, 'eval_steps_per_second': 71.117, 'epoch': 0.65}


 66%|██████▌   | 9651/14732 [46:20<14:13,  5.96it/s]  

{'loss': 1.4595, 'grad_norm': 5.0712103843688965, 'learning_rate': 1.7427556041552762e-05, 'epoch': 0.66}


 66%|██████▌   | 9700/14732 [46:28<14:16,  5.88it/s]

{'loss': 1.3612, 'grad_norm': 4.591216087341309, 'learning_rate': 1.7256697648988518e-05, 'epoch': 0.66}


                                                    
 66%|██████▌   | 9701/14732 [46:39<4:34:06,  3.27s/it]

{'eval_loss': 1.3558852672576904, 'eval_runtime': 10.3281, 'eval_samples_per_second': 79.202, 'eval_steps_per_second': 79.202, 'epoch': 0.66}


 66%|██████▌   | 9751/14732 [46:47<14:07,  5.88it/s]  

{'loss': 1.5261, 'grad_norm': 4.63593053817749, 'learning_rate': 1.7085839256424278e-05, 'epoch': 0.66}


 67%|██████▋   | 9800/14732 [46:55<13:57,  5.89it/s]

{'loss': 1.6018, 'grad_norm': 10.342902183532715, 'learning_rate': 1.6914980863860034e-05, 'epoch': 0.67}


                                                    
 67%|██████▋   | 9800/14732 [47:06<13:57,  5.89it/s]

{'eval_loss': 1.3518919944763184, 'eval_runtime': 10.751, 'eval_samples_per_second': 76.086, 'eval_steps_per_second': 76.086, 'epoch': 0.67}


 67%|██████▋   | 9851/14732 [47:17<13:48,  5.89it/s]  

{'loss': 1.6178, 'grad_norm': 16.219240188598633, 'learning_rate': 1.674412247129579e-05, 'epoch': 0.67}


 67%|██████▋   | 9900/14732 [47:25<13:19,  6.04it/s]

{'loss': 1.5091, 'grad_norm': 3.5812935829162598, 'learning_rate': 1.6573264078731547e-05, 'epoch': 0.67}


                                                    
 67%|██████▋   | 9901/14732 [47:36<4:39:23,  3.47s/it]

{'eval_loss': 1.3512158393859863, 'eval_runtime': 10.9989, 'eval_samples_per_second': 74.371, 'eval_steps_per_second': 74.371, 'epoch': 0.67}


 68%|██████▊   | 9951/14732 [47:45<13:16,  6.01it/s]  

{'loss': 1.4713, 'grad_norm': 7.012796401977539, 'learning_rate': 1.6402405686167304e-05, 'epoch': 0.68}


 68%|██████▊   | 10000/14732 [47:53<13:26,  5.86it/s]

{'loss': 1.6117, 'grad_norm': 5.017807483673096, 'learning_rate': 1.6234964461454345e-05, 'epoch': 0.68}


                                                     
 68%|██████▊   | 10000/14732 [48:04<13:26,  5.86it/s]

{'eval_loss': 1.352590560913086, 'eval_runtime': 10.351, 'eval_samples_per_second': 79.026, 'eval_steps_per_second': 79.026, 'epoch': 0.68}


 68%|██████▊   | 10051/14732 [48:15<13:32,  5.76it/s]  

{'loss': 1.5659, 'grad_norm': 8.26111888885498, 'learning_rate': 1.6064106068890105e-05, 'epoch': 0.68}


 69%|██████▊   | 10100/14732 [48:23<12:50,  6.01it/s]

{'loss': 1.3539, 'grad_norm': 3.3784592151641846, 'learning_rate': 1.589324767632586e-05, 'epoch': 0.69}


                                                     
 69%|██████▊   | 10101/14732 [48:34<4:21:19,  3.39s/it]

{'eval_loss': 1.3509572744369507, 'eval_runtime': 10.7244, 'eval_samples_per_second': 76.274, 'eval_steps_per_second': 76.274, 'epoch': 0.69}


 69%|██████▉   | 10151/14732 [48:42<12:42,  6.01it/s]  

{'loss': 1.6302, 'grad_norm': 5.681556701660156, 'learning_rate': 1.5722389283761618e-05, 'epoch': 0.69}


 69%|██████▉   | 10200/14732 [48:51<12:33,  6.02it/s]

{'loss': 1.4014, 'grad_norm': 6.090015411376953, 'learning_rate': 1.5551530891197378e-05, 'epoch': 0.69}


                                                     
 69%|██████▉   | 10200/14732 [49:01<12:33,  6.02it/s]

{'eval_loss': 1.3489515781402588, 'eval_runtime': 10.4954, 'eval_samples_per_second': 77.939, 'eval_steps_per_second': 77.939, 'epoch': 0.69}


 70%|██████▉   | 10251/14732 [49:12<12:40,  5.89it/s]  

{'loss': 1.5137, 'grad_norm': 4.127411842346191, 'learning_rate': 1.5380672498633134e-05, 'epoch': 0.7}


 70%|██████▉   | 10300/14732 [49:20<12:45,  5.79it/s]

{'loss': 1.4669, 'grad_norm': 7.12483549118042, 'learning_rate': 1.5209814106068889e-05, 'epoch': 0.7}


                                                     
 70%|██████▉   | 10301/14732 [49:31<4:15:09,  3.46s/it]

{'eval_loss': 1.345658302307129, 'eval_runtime': 10.9346, 'eval_samples_per_second': 74.808, 'eval_steps_per_second': 74.808, 'epoch': 0.7}


 70%|███████   | 10351/14732 [49:40<12:20,  5.92it/s]  

{'loss': 1.3618, 'grad_norm': 18.157581329345703, 'learning_rate': 1.5038955713504649e-05, 'epoch': 0.7}


 71%|███████   | 10400/14732 [49:48<12:04,  5.98it/s]

{'loss': 1.4376, 'grad_norm': 6.692669868469238, 'learning_rate': 1.4868097320940405e-05, 'epoch': 0.71}


                                                     
 71%|███████   | 10400/14732 [49:59<12:04,  5.98it/s]

{'eval_loss': 1.3471781015396118, 'eval_runtime': 10.4507, 'eval_samples_per_second': 78.272, 'eval_steps_per_second': 78.272, 'epoch': 0.71}


 71%|███████   | 10451/14732 [50:09<11:14,  6.35it/s]  

{'loss': 1.4339, 'grad_norm': 4.017574310302734, 'learning_rate': 1.4700656096227447e-05, 'epoch': 0.71}


 71%|███████▏  | 10500/14732 [50:17<11:39,  6.05it/s]

{'loss': 1.4534, 'grad_norm': 10.427239418029785, 'learning_rate': 1.4529797703663203e-05, 'epoch': 0.71}


                                                     
 71%|███████▏  | 10501/14732 [50:27<3:50:47,  3.27s/it]

{'eval_loss': 1.349068284034729, 'eval_runtime': 10.3458, 'eval_samples_per_second': 79.066, 'eval_steps_per_second': 79.066, 'epoch': 0.71}


 72%|███████▏  | 10551/14732 [50:36<11:42,  5.95it/s]  

{'loss': 1.4435, 'grad_norm': 8.939870834350586, 'learning_rate': 1.4358939311098963e-05, 'epoch': 0.72}


 72%|███████▏  | 10600/14732 [50:44<11:22,  6.06it/s]

{'loss': 1.4639, 'grad_norm': 18.391191482543945, 'learning_rate': 1.418808091853472e-05, 'epoch': 0.72}


                                                     
 72%|███████▏  | 10600/14732 [50:55<11:22,  6.06it/s]

{'eval_loss': 1.3522250652313232, 'eval_runtime': 10.3964, 'eval_samples_per_second': 78.681, 'eval_steps_per_second': 78.681, 'epoch': 0.72}


 72%|███████▏  | 10651/14732 [51:06<11:35,  5.87it/s]  

{'loss': 1.4703, 'grad_norm': 7.965848922729492, 'learning_rate': 1.4017222525970478e-05, 'epoch': 0.72}


 73%|███████▎  | 10700/14732 [51:14<11:23,  5.90it/s]

{'loss': 1.565, 'grad_norm': 4.021920680999756, 'learning_rate': 1.3846364133406234e-05, 'epoch': 0.73}


                                                     
 73%|███████▎  | 10701/14732 [51:25<3:52:50,  3.47s/it]

{'eval_loss': 1.351072907447815, 'eval_runtime': 10.9788, 'eval_samples_per_second': 74.507, 'eval_steps_per_second': 74.507, 'epoch': 0.73}


 73%|███████▎  | 10751/14732 [51:33<11:01,  6.02it/s]  

{'loss': 1.3874, 'grad_norm': 5.638105869293213, 'learning_rate': 1.367550574084199e-05, 'epoch': 0.73}


 73%|███████▎  | 10800/14732 [51:42<11:00,  5.95it/s]

{'loss': 1.4605, 'grad_norm': 4.217365741729736, 'learning_rate': 1.3504647348277749e-05, 'epoch': 0.73}


                                                     
 73%|███████▎  | 10800/14732 [51:52<11:00,  5.95it/s]

{'eval_loss': 1.3422144651412964, 'eval_runtime': 10.2298, 'eval_samples_per_second': 79.962, 'eval_steps_per_second': 79.962, 'epoch': 0.73}


 74%|███████▎  | 10851/14732 [52:03<10:52,  5.95it/s]  

{'loss': 1.5123, 'grad_norm': 11.247200012207031, 'learning_rate': 1.3333788955713505e-05, 'epoch': 0.74}


 74%|███████▍  | 10900/14732 [52:11<10:48,  5.91it/s]

{'loss': 1.5369, 'grad_norm': 3.955390214920044, 'learning_rate': 1.3162930563149261e-05, 'epoch': 0.74}


                                                     
 74%|███████▍  | 10901/14732 [52:22<3:30:20,  3.29s/it]

{'eval_loss': 1.3387789726257324, 'eval_runtime': 10.4096, 'eval_samples_per_second': 78.581, 'eval_steps_per_second': 78.581, 'epoch': 0.74}


 74%|███████▍  | 10951/14732 [52:30<10:55,  5.77it/s]  

{'loss': 1.3785, 'grad_norm': 4.960180759429932, 'learning_rate': 1.2992072170585021e-05, 'epoch': 0.74}


 75%|███████▍  | 11000/14732 [52:39<10:13,  6.08it/s]

{'loss': 1.3389, 'grad_norm': 8.481831550598145, 'learning_rate': 1.2821213778020778e-05, 'epoch': 0.75}


                                                     
 75%|███████▍  | 11000/14732 [52:49<10:13,  6.08it/s]

{'eval_loss': 1.3425439596176147, 'eval_runtime': 10.3583, 'eval_samples_per_second': 78.97, 'eval_steps_per_second': 78.97, 'epoch': 0.75}


 75%|███████▌  | 11051/14732 [53:00<10:46,  5.70it/s]  

{'loss': 1.5055, 'grad_norm': 5.263872146606445, 'learning_rate': 1.2650355385456532e-05, 'epoch': 0.75}


 75%|███████▌  | 11100/14732 [53:08<10:04,  6.01it/s]

{'loss': 1.4671, 'grad_norm': 4.910567760467529, 'learning_rate': 1.247949699289229e-05, 'epoch': 0.75}


                                                     
 75%|███████▌  | 11101/14732 [53:19<3:18:06,  3.27s/it]

{'eval_loss': 1.3408477306365967, 'eval_runtime': 10.3523, 'eval_samples_per_second': 79.016, 'eval_steps_per_second': 79.016, 'epoch': 0.75}


 76%|███████▌  | 11151/14732 [53:27<09:54,  6.02it/s]  

{'loss': 1.5468, 'grad_norm': 4.8252949714660645, 'learning_rate': 1.2308638600328049e-05, 'epoch': 0.76}


 76%|███████▌  | 11200/14732 [53:35<09:43,  6.06it/s]

{'loss': 1.4745, 'grad_norm': 9.281892776489258, 'learning_rate': 1.214119737561509e-05, 'epoch': 0.76}


                                                     
 76%|███████▌  | 11200/14732 [53:46<09:43,  6.06it/s]

{'eval_loss': 1.3374793529510498, 'eval_runtime': 10.2425, 'eval_samples_per_second': 79.864, 'eval_steps_per_second': 79.864, 'epoch': 0.76}


 76%|███████▋  | 11251/14732 [53:57<09:39,  6.01it/s]  

{'loss': 1.4191, 'grad_norm': 10.90896224975586, 'learning_rate': 1.1970338983050848e-05, 'epoch': 0.76}


 77%|███████▋  | 11300/14732 [54:05<09:32,  5.99it/s]

{'loss': 1.5681, 'grad_norm': 5.844140529632568, 'learning_rate': 1.1799480590486606e-05, 'epoch': 0.77}


                                                     
 77%|███████▋  | 11301/14732 [54:16<3:13:03,  3.38s/it]

{'eval_loss': 1.3377052545547485, 'eval_runtime': 10.6885, 'eval_samples_per_second': 76.531, 'eval_steps_per_second': 76.531, 'epoch': 0.77}


 77%|███████▋  | 11351/14732 [54:24<09:21,  6.03it/s]  

{'loss': 1.5337, 'grad_norm': 14.018595695495605, 'learning_rate': 1.1628622197922361e-05, 'epoch': 0.77}


 77%|███████▋  | 11400/14732 [54:32<09:08,  6.08it/s]

{'loss': 1.3982, 'grad_norm': 7.918650150299072, 'learning_rate': 1.145776380535812e-05, 'epoch': 0.77}


                                                     
 77%|███████▋  | 11400/14732 [54:43<09:08,  6.08it/s]

{'eval_loss': 1.3396594524383545, 'eval_runtime': 10.2591, 'eval_samples_per_second': 79.734, 'eval_steps_per_second': 79.734, 'epoch': 0.77}


 78%|███████▊  | 11451/14732 [54:54<09:18,  5.87it/s]  

{'loss': 1.5265, 'grad_norm': 16.719009399414062, 'learning_rate': 1.1286905412793877e-05, 'epoch': 0.78}


 78%|███████▊  | 11500/14732 [55:02<09:04,  5.94it/s]

{'loss': 1.5745, 'grad_norm': 8.799788475036621, 'learning_rate': 1.1116047020229636e-05, 'epoch': 0.78}


                                                     
 78%|███████▊  | 11501/14732 [55:13<3:00:55,  3.36s/it]

{'eval_loss': 1.3416593074798584, 'eval_runtime': 10.6236, 'eval_samples_per_second': 76.998, 'eval_steps_per_second': 76.998, 'epoch': 0.78}


 78%|███████▊  | 11551/14732 [55:21<08:46,  6.04it/s]  

{'loss': 1.5034, 'grad_norm': 5.4377827644348145, 'learning_rate': 1.0945188627665392e-05, 'epoch': 0.78}


 79%|███████▊  | 11600/14732 [55:29<09:03,  5.77it/s]

{'loss': 1.5006, 'grad_norm': 11.623464584350586, 'learning_rate': 1.0774330235101148e-05, 'epoch': 0.79}


                                                     
 79%|███████▊  | 11600/14732 [55:40<09:03,  5.77it/s]

{'eval_loss': 1.3374148607254028, 'eval_runtime': 10.1902, 'eval_samples_per_second': 80.273, 'eval_steps_per_second': 80.273, 'epoch': 0.79}


 79%|███████▉  | 11651/14732 [55:51<08:39,  5.94it/s]  

{'loss': 1.4009, 'grad_norm': 3.6542325019836426, 'learning_rate': 1.0603471842536907e-05, 'epoch': 0.79}


 79%|███████▉  | 11700/14732 [55:59<08:41,  5.82it/s]

{'loss': 1.4636, 'grad_norm': 5.470332145690918, 'learning_rate': 1.0432613449972663e-05, 'epoch': 0.79}


                                                     
 79%|███████▉  | 11701/14732 [56:10<2:51:37,  3.40s/it]

{'eval_loss': 1.3375704288482666, 'eval_runtime': 10.7545, 'eval_samples_per_second': 76.061, 'eval_steps_per_second': 76.061, 'epoch': 0.79}


 80%|███████▉  | 11751/14732 [56:18<08:21,  5.95it/s]  

{'loss': 1.5223, 'grad_norm': 4.011507511138916, 'learning_rate': 1.0261755057408421e-05, 'epoch': 0.8}


 80%|████████  | 11800/14732 [56:26<08:10,  5.98it/s]

{'loss': 1.6929, 'grad_norm': 5.017610549926758, 'learning_rate': 1.0090896664844178e-05, 'epoch': 0.8}


                                                     
 80%|████████  | 11800/14732 [56:37<08:10,  5.98it/s]

{'eval_loss': 1.337245225906372, 'eval_runtime': 10.6878, 'eval_samples_per_second': 76.536, 'eval_steps_per_second': 76.536, 'epoch': 0.8}


 80%|████████  | 11851/14732 [56:47<07:53,  6.08it/s]  

{'loss': 1.4907, 'grad_norm': 6.858500957489014, 'learning_rate': 9.920038272279934e-06, 'epoch': 0.8}


 81%|████████  | 11900/14732 [56:56<08:01,  5.88it/s]

{'loss': 1.441, 'grad_norm': 15.890560150146484, 'learning_rate': 9.749179879715692e-06, 'epoch': 0.81}


                                                     
 81%|████████  | 11901/14732 [57:07<2:39:29,  3.38s/it]

{'eval_loss': 1.3372584581375122, 'eval_runtime': 10.6943, 'eval_samples_per_second': 76.489, 'eval_steps_per_second': 76.489, 'epoch': 0.81}


 81%|████████  | 11951/14732 [57:15<07:37,  6.08it/s]  

{'loss': 1.4352, 'grad_norm': 5.590476036071777, 'learning_rate': 9.57832148715145e-06, 'epoch': 0.81}


 81%|████████▏ | 12000/14732 [57:23<07:31,  6.05it/s]

{'loss': 1.38, 'grad_norm': 4.404998302459717, 'learning_rate': 9.407463094587207e-06, 'epoch': 0.81}


                                                     
 81%|████████▏ | 12000/14732 [57:34<07:31,  6.05it/s]

{'eval_loss': 1.336645483970642, 'eval_runtime': 10.5759, 'eval_samples_per_second': 77.346, 'eval_steps_per_second': 77.346, 'epoch': 0.81}


 82%|████████▏ | 12051/14732 [57:45<07:40,  5.82it/s]  

{'loss': 1.6747, 'grad_norm': 4.8471832275390625, 'learning_rate': 9.236604702022963e-06, 'epoch': 0.82}


 82%|████████▏ | 12100/14732 [57:53<07:16,  6.04it/s]

{'loss': 1.4427, 'grad_norm': 9.140684127807617, 'learning_rate': 9.065746309458721e-06, 'epoch': 0.82}


                                                     
 82%|████████▏ | 12101/14732 [58:04<2:25:11,  3.31s/it]

{'eval_loss': 1.3352822065353394, 'eval_runtime': 10.4726, 'eval_samples_per_second': 78.109, 'eval_steps_per_second': 78.109, 'epoch': 0.82}


 82%|████████▏ | 12151/14732 [58:12<07:05,  6.06it/s]  

{'loss': 1.4397, 'grad_norm': 10.39458179473877, 'learning_rate': 8.89488791689448e-06, 'epoch': 0.82}


 83%|████████▎ | 12200/14732 [58:21<06:52,  6.13it/s]

{'loss': 1.4136, 'grad_norm': 10.014602661132812, 'learning_rate': 8.724029524330234e-06, 'epoch': 0.83}


                                                     
 83%|████████▎ | 12200/14732 [58:31<06:52,  6.13it/s]

{'eval_loss': 1.3363827466964722, 'eval_runtime': 10.1366, 'eval_samples_per_second': 80.698, 'eval_steps_per_second': 80.698, 'epoch': 0.83}


 83%|████████▎ | 12251/14732 [58:42<06:48,  6.08it/s]  

{'loss': 1.2671, 'grad_norm': 6.287600517272949, 'learning_rate': 8.553171131765992e-06, 'epoch': 0.83}


 83%|████████▎ | 12300/14732 [58:50<06:52,  5.89it/s]

{'loss': 1.506, 'grad_norm': 12.552708625793457, 'learning_rate': 8.38231273920175e-06, 'epoch': 0.83}


                                                     
 83%|████████▎ | 12301/14732 [59:01<2:15:07,  3.34s/it]

{'eval_loss': 1.3385311365127563, 'eval_runtime': 10.5492, 'eval_samples_per_second': 77.541, 'eval_steps_per_second': 77.541, 'epoch': 0.83}


 84%|████████▍ | 12351/14732 [59:09<06:40,  5.95it/s]  

{'loss': 1.509, 'grad_norm': 6.275385856628418, 'learning_rate': 8.211454346637507e-06, 'epoch': 0.84}


 84%|████████▍ | 12400/14732 [59:17<06:43,  5.77it/s]

{'loss': 1.4537, 'grad_norm': 5.142225742340088, 'learning_rate': 8.040595954073265e-06, 'epoch': 0.84}


                                                     
 84%|████████▍ | 12400/14732 [59:28<06:43,  5.77it/s]

{'eval_loss': 1.3347357511520386, 'eval_runtime': 10.5001, 'eval_samples_per_second': 77.904, 'eval_steps_per_second': 77.904, 'epoch': 0.84}


 85%|████████▍ | 12451/14732 [59:39<06:23,  5.95it/s]  

{'loss': 1.386, 'grad_norm': 4.578582286834717, 'learning_rate': 7.869737561509021e-06, 'epoch': 0.85}


 85%|████████▍ | 12500/14732 [59:47<06:14,  5.97it/s]

{'loss': 1.4429, 'grad_norm': 5.829076766967773, 'learning_rate': 7.69887916894478e-06, 'epoch': 0.85}


                                                     
 85%|████████▍ | 12501/14732 [59:58<2:01:55,  3.28s/it]

{'eval_loss': 1.3369477987289429, 'eval_runtime': 10.3664, 'eval_samples_per_second': 78.909, 'eval_steps_per_second': 78.909, 'epoch': 0.85}


 85%|████████▌ | 12551/14732 [1:00:06<06:01,  6.04it/s]

{'loss': 1.3603, 'grad_norm': 4.168961048126221, 'learning_rate': 7.528020776380536e-06, 'epoch': 0.85}


 86%|████████▌ | 12600/14732 [1:00:14<06:04,  5.85it/s]

{'loss': 1.5314, 'grad_norm': 18.84520721435547, 'learning_rate': 7.357162383816293e-06, 'epoch': 0.86}


                                                       
 86%|████████▌ | 12600/14732 [1:00:25<06:04,  5.85it/s]

{'eval_loss': 1.3358736038208008, 'eval_runtime': 10.4836, 'eval_samples_per_second': 78.026, 'eval_steps_per_second': 78.026, 'epoch': 0.86}


 86%|████████▌ | 12651/14732 [1:00:35<05:40,  6.12it/s]  

{'loss': 1.4933, 'grad_norm': 9.816152572631836, 'learning_rate': 7.1863039912520515e-06, 'epoch': 0.86}


 86%|████████▌ | 12700/14732 [1:00:44<05:45,  5.88it/s]

{'loss': 1.5737, 'grad_norm': 13.815107345581055, 'learning_rate': 7.015445598687807e-06, 'epoch': 0.86}


                                                       
 86%|████████▌ | 12701/14732 [1:00:54<1:50:16,  3.26s/it]

{'eval_loss': 1.3339293003082275, 'eval_runtime': 10.2925, 'eval_samples_per_second': 79.475, 'eval_steps_per_second': 79.475, 'epoch': 0.86}


 87%|████████▋ | 12751/14732 [1:01:02<05:39,  5.84it/s]  

{'loss': 1.454, 'grad_norm': 4.619235038757324, 'learning_rate': 6.844587206123565e-06, 'epoch': 0.87}


 87%|████████▋ | 12800/14732 [1:01:11<05:33,  5.79it/s]

{'loss': 1.5712, 'grad_norm': 10.315315246582031, 'learning_rate': 6.6737288135593225e-06, 'epoch': 0.87}


                                                       
 87%|████████▋ | 12800/14732 [1:01:22<05:33,  5.79it/s]

{'eval_loss': 1.3348240852355957, 'eval_runtime': 10.8366, 'eval_samples_per_second': 75.485, 'eval_steps_per_second': 75.485, 'epoch': 0.87}


 87%|████████▋ | 12851/14732 [1:01:33<05:24,  5.80it/s]  

{'loss': 1.4202, 'grad_norm': 7.696214199066162, 'learning_rate': 6.502870420995081e-06, 'epoch': 0.87}


 88%|████████▊ | 12900/14732 [1:01:41<05:06,  5.98it/s]

{'loss': 1.5758, 'grad_norm': 5.750315189361572, 'learning_rate': 6.332012028430836e-06, 'epoch': 0.88}


                                                       
 88%|████████▊ | 12901/14732 [1:01:52<1:43:01,  3.38s/it]

{'eval_loss': 1.3332515954971313, 'eval_runtime': 10.6851, 'eval_samples_per_second': 76.555, 'eval_steps_per_second': 76.555, 'epoch': 0.88}


 88%|████████▊ | 12951/14732 [1:02:00<04:54,  6.04it/s]  

{'loss': 1.3748, 'grad_norm': 5.488556861877441, 'learning_rate': 6.161153635866594e-06, 'epoch': 0.88}


 88%|████████▊ | 13000/14732 [1:02:08<04:45,  6.06it/s]

{'loss': 1.38, 'grad_norm': 4.159388065338135, 'learning_rate': 5.990295243302351e-06, 'epoch': 0.88}


                                                       
 88%|████████▊ | 13000/14732 [1:02:19<04:45,  6.06it/s]

{'eval_loss': 1.3315739631652832, 'eval_runtime': 10.2291, 'eval_samples_per_second': 79.968, 'eval_steps_per_second': 79.968, 'epoch': 0.88}


 89%|████████▊ | 13051/14732 [1:02:29<04:39,  6.00it/s]  

{'loss': 1.4934, 'grad_norm': 5.938668727874756, 'learning_rate': 5.819436850738109e-06, 'epoch': 0.89}


 89%|████████▉ | 13100/14732 [1:02:38<04:32,  6.00it/s]

{'loss': 1.4532, 'grad_norm': 4.099301338195801, 'learning_rate': 5.648578458173865e-06, 'epoch': 0.89}


                                                       
 89%|████████▉ | 13101/14732 [1:02:48<1:28:36,  3.26s/it]

{'eval_loss': 1.3316320180892944, 'eval_runtime': 10.2983, 'eval_samples_per_second': 79.431, 'eval_steps_per_second': 79.431, 'epoch': 0.89}


 89%|████████▉ | 13151/14732 [1:02:56<04:26,  5.93it/s]  

{'loss': 1.5746, 'grad_norm': 5.189172267913818, 'learning_rate': 5.477720065609623e-06, 'epoch': 0.89}


 90%|████████▉ | 13200/14732 [1:03:05<04:18,  5.93it/s]

{'loss': 1.557, 'grad_norm': 4.97382116317749, 'learning_rate': 5.30686167304538e-06, 'epoch': 0.9}


                                                       
 90%|████████▉ | 13200/14732 [1:03:15<04:18,  5.93it/s]

{'eval_loss': 1.3309440612792969, 'eval_runtime': 10.1929, 'eval_samples_per_second': 80.252, 'eval_steps_per_second': 80.252, 'epoch': 0.9}


 90%|████████▉ | 13251/14732 [1:03:26<04:10,  5.92it/s]  

{'loss': 1.5727, 'grad_norm': 5.2551188468933105, 'learning_rate': 5.136003280481137e-06, 'epoch': 0.9}


 90%|█████████ | 13300/14732 [1:03:34<04:03,  5.88it/s]

{'loss': 1.4392, 'grad_norm': 4.202824115753174, 'learning_rate': 4.9651448879168945e-06, 'epoch': 0.9}


                                                       
 90%|█████████ | 13301/14732 [1:03:45<1:19:14,  3.32s/it]

{'eval_loss': 1.330358624458313, 'eval_runtime': 10.5075, 'eval_samples_per_second': 77.849, 'eval_steps_per_second': 77.849, 'epoch': 0.9}


 91%|█████████ | 13351/14732 [1:03:53<03:52,  5.95it/s]  

{'loss': 1.5689, 'grad_norm': 5.8394269943237305, 'learning_rate': 4.794286495352652e-06, 'epoch': 0.91}


 91%|█████████ | 13400/14732 [1:04:02<03:40,  6.04it/s]

{'loss': 1.3909, 'grad_norm': 7.554951190948486, 'learning_rate': 4.623428102788409e-06, 'epoch': 0.91}


                                                       
 91%|█████████ | 13400/14732 [1:04:12<03:40,  6.04it/s]

{'eval_loss': 1.33061683177948, 'eval_runtime': 10.1419, 'eval_samples_per_second': 80.656, 'eval_steps_per_second': 80.656, 'epoch': 0.91}


 91%|█████████▏| 13451/14732 [1:04:23<03:31,  6.07it/s]  

{'loss': 1.3611, 'grad_norm': 6.002711772918701, 'learning_rate': 4.452569710224166e-06, 'epoch': 0.91}


 92%|█████████▏| 13500/14732 [1:04:31<03:26,  5.97it/s]

{'loss': 1.456, 'grad_norm': 7.730889320373535, 'learning_rate': 4.281711317659924e-06, 'epoch': 0.92}


                                                       
 92%|█████████▏| 13501/14732 [1:04:42<1:10:12,  3.42s/it]

{'eval_loss': 1.3318848609924316, 'eval_runtime': 10.8405, 'eval_samples_per_second': 75.458, 'eval_steps_per_second': 75.458, 'epoch': 0.92}


 92%|█████████▏| 13551/14732 [1:04:50<03:19,  5.91it/s]  

{'loss': 1.3465, 'grad_norm': 7.06649923324585, 'learning_rate': 4.110852925095681e-06, 'epoch': 0.92}


 92%|█████████▏| 13600/14732 [1:04:58<03:06,  6.06it/s]

{'loss': 1.3552, 'grad_norm': 8.274965286254883, 'learning_rate': 3.939994532531438e-06, 'epoch': 0.92}


                                                       
 92%|█████████▏| 13600/14732 [1:05:09<03:06,  6.06it/s]

{'eval_loss': 1.3324368000030518, 'eval_runtime': 10.3452, 'eval_samples_per_second': 79.071, 'eval_steps_per_second': 79.071, 'epoch': 0.92}


 93%|█████████▎| 13651/14732 [1:05:20<02:57,  6.09it/s]  

{'loss': 1.5492, 'grad_norm': 9.774681091308594, 'learning_rate': 3.7691361399671955e-06, 'epoch': 0.93}


 93%|█████████▎| 13700/14732 [1:05:28<02:53,  5.96it/s]

{'loss': 1.3675, 'grad_norm': 5.026118278503418, 'learning_rate': 3.598277747402953e-06, 'epoch': 0.93}


                                                       
 93%|█████████▎| 13701/14732 [1:05:38<56:09,  3.27s/it]

{'eval_loss': 1.3318512439727783, 'eval_runtime': 10.3336, 'eval_samples_per_second': 79.159, 'eval_steps_per_second': 79.159, 'epoch': 0.93}


 93%|█████████▎| 13751/14732 [1:05:47<02:40,  6.10it/s]

{'loss': 1.4085, 'grad_norm': 7.138827800750732, 'learning_rate': 3.4308365226899947e-06, 'epoch': 0.93}


 94%|█████████▎| 13800/14732 [1:05:55<02:31,  6.14it/s]

{'loss': 1.684, 'grad_norm': 9.8630952835083, 'learning_rate': 3.259978130125752e-06, 'epoch': 0.94}


                                                       
 94%|█████████▎| 13800/14732 [1:06:05<02:31,  6.14it/s]

{'eval_loss': 1.3314722776412964, 'eval_runtime': 10.4648, 'eval_samples_per_second': 78.167, 'eval_steps_per_second': 78.167, 'epoch': 0.94}


 94%|█████████▍| 13851/14732 [1:06:17<02:28,  5.94it/s]  

{'loss': 1.4727, 'grad_norm': 5.230906009674072, 'learning_rate': 3.0891197375615093e-06, 'epoch': 0.94}


 94%|█████████▍| 13900/14732 [1:06:25<02:24,  5.76it/s]

{'loss': 1.5009, 'grad_norm': 7.8853888511657715, 'learning_rate': 2.9182613449972666e-06, 'epoch': 0.94}


                                                       
 94%|█████████▍| 13901/14732 [1:06:36<48:22,  3.49s/it]

{'eval_loss': 1.3311588764190674, 'eval_runtime': 11.0542, 'eval_samples_per_second': 73.999, 'eval_steps_per_second': 73.999, 'epoch': 0.94}


 95%|█████████▍| 13951/14732 [1:06:45<02:13,  5.85it/s]

{'loss': 1.3348, 'grad_norm': 3.976266622543335, 'learning_rate': 2.747402952433024e-06, 'epoch': 0.95}


 95%|█████████▌| 14000/14732 [1:06:53<02:06,  5.77it/s]

{'loss': 1.4052, 'grad_norm': 4.471831798553467, 'learning_rate': 2.576544559868781e-06, 'epoch': 0.95}


                                                       
 95%|█████████▌| 14000/14732 [1:07:04<02:06,  5.77it/s]

{'eval_loss': 1.3309518098831177, 'eval_runtime': 10.6763, 'eval_samples_per_second': 76.618, 'eval_steps_per_second': 76.618, 'epoch': 0.95}


 95%|█████████▌| 14051/14732 [1:07:15<01:58,  5.76it/s]

{'loss': 1.4721, 'grad_norm': 5.891083240509033, 'learning_rate': 2.405686167304538e-06, 'epoch': 0.95}


 96%|█████████▌| 14100/14732 [1:07:23<01:47,  5.89it/s]

{'loss': 1.3933, 'grad_norm': 3.7090492248535156, 'learning_rate': 2.2348277747402953e-06, 'epoch': 0.96}


                                                       
 96%|█████████▌| 14101/14732 [1:07:34<33:45,  3.21s/it]

{'eval_loss': 1.3309236764907837, 'eval_runtime': 10.1209, 'eval_samples_per_second': 80.823, 'eval_steps_per_second': 80.823, 'epoch': 0.96}


 96%|█████████▌| 14151/14732 [1:07:42<01:36,  6.05it/s]

{'loss': 1.4111, 'grad_norm': 6.635793685913086, 'learning_rate': 2.0639693821760526e-06, 'epoch': 0.96}


 96%|█████████▋| 14200/14732 [1:07:50<01:29,  5.93it/s]

{'loss': 1.4635, 'grad_norm': 10.514676094055176, 'learning_rate': 1.8931109896118099e-06, 'epoch': 0.96}


                                                       
 96%|█████████▋| 14200/14732 [1:08:00<01:29,  5.93it/s]

{'eval_loss': 1.3307548761367798, 'eval_runtime': 10.1223, 'eval_samples_per_second': 80.812, 'eval_steps_per_second': 80.812, 'epoch': 0.96}


 97%|█████████▋| 14251/14732 [1:08:12<01:23,  5.79it/s]

{'loss': 1.2937, 'grad_norm': 9.406229972839355, 'learning_rate': 1.7222525970475672e-06, 'epoch': 0.97}


 97%|█████████▋| 14300/14732 [1:08:20<01:14,  5.78it/s]

{'loss': 1.5277, 'grad_norm': 8.034127235412598, 'learning_rate': 1.5513942044833242e-06, 'epoch': 0.97}


                                                       
 97%|█████████▋| 14301/14732 [1:08:31<24:46,  3.45s/it]

{'eval_loss': 1.330556869506836, 'eval_runtime': 10.9259, 'eval_samples_per_second': 74.868, 'eval_steps_per_second': 74.868, 'epoch': 0.97}


 97%|█████████▋| 14351/14732 [1:08:40<01:05,  5.80it/s]

{'loss': 1.3796, 'grad_norm': 9.058425903320312, 'learning_rate': 1.3805358119190815e-06, 'epoch': 0.97}


 98%|█████████▊| 14400/14732 [1:08:48<00:55,  5.97it/s]

{'loss': 1.4186, 'grad_norm': 3.3186213970184326, 'learning_rate': 1.2096774193548388e-06, 'epoch': 0.98}


                                                       
 98%|█████████▊| 14400/14732 [1:08:58<00:55,  5.97it/s]

{'eval_loss': 1.330425500869751, 'eval_runtime': 10.1364, 'eval_samples_per_second': 80.699, 'eval_steps_per_second': 80.699, 'epoch': 0.98}


 98%|█████████▊| 14451/14732 [1:09:09<00:48,  5.85it/s]

{'loss': 1.469, 'grad_norm': 9.310784339904785, 'learning_rate': 1.038819026790596e-06, 'epoch': 0.98}


 98%|█████████▊| 14500/14732 [1:09:18<00:44,  5.18it/s]

{'loss': 1.3912, 'grad_norm': 16.590946197509766, 'learning_rate': 8.679606342263532e-07, 'epoch': 0.98}


                                                       
 98%|█████████▊| 14501/14732 [1:09:29<13:04,  3.39s/it]

{'eval_loss': 1.3302371501922607, 'eval_runtime': 10.6975, 'eval_samples_per_second': 76.467, 'eval_steps_per_second': 76.467, 'epoch': 0.98}


 99%|█████████▉| 14551/14732 [1:09:37<00:30,  5.88it/s]

{'loss': 1.3626, 'grad_norm': 4.571035385131836, 'learning_rate': 6.971022416621105e-07, 'epoch': 0.99}


 99%|█████████▉| 14600/14732 [1:09:45<00:22,  5.91it/s]

{'loss': 1.4573, 'grad_norm': 9.24195671081543, 'learning_rate': 5.262438490978676e-07, 'epoch': 0.99}


                                                       
 99%|█████████▉| 14600/14732 [1:09:57<00:22,  5.91it/s]

{'eval_loss': 1.3301736116409302, 'eval_runtime': 11.1718, 'eval_samples_per_second': 73.22, 'eval_steps_per_second': 73.22, 'epoch': 0.99}


 99%|█████████▉| 14651/14732 [1:10:08<00:13,  6.04it/s]

{'loss': 1.4937, 'grad_norm': 6.085907936096191, 'learning_rate': 3.553854565336249e-07, 'epoch': 0.99}


100%|█████████▉| 14700/14732 [1:10:16<00:05,  5.91it/s]

{'loss': 1.4811, 'grad_norm': 5.403878211975098, 'learning_rate': 1.8452706396938218e-07, 'epoch': 1.0}


                                                       
100%|█████████▉| 14701/14732 [1:10:26<01:38,  3.18s/it]

{'eval_loss': 1.330243468284607, 'eval_runtime': 10.0245, 'eval_samples_per_second': 81.6, 'eval_steps_per_second': 81.6, 'epoch': 1.0}


100%|██████████| 14732/14732 [1:10:31<00:00,  3.48it/s]


{'train_runtime': 4231.9966, 'train_samples_per_second': 3.481, 'train_steps_per_second': 3.481, 'train_loss': 1.5405529352623646, 'epoch': 1.0}


## Evaluation

In [17]:
from datasets import load_metric
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Release unused cached GPU memory before loading another copy of the model
torch.cuda.empty_cache()

# Fine-tuned weights come from the local checkpoint directory; the tokenizer
# is the stock t5-base one
model_path = './t5_base_trained'
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# BLEU metric object used by compute_bleu (loaded with trust_remote_code=True)
bleu = load_metric('bleu', trust_remote_code=True)

def compute_bleu(predictions, references):
    """Score tokenised predictions against tokenised references with BLEU.

    Args:
        predictions: one list of tokens per generated summary.
        references: for every prediction, a list of reference token lists.

    Returns:
        The value stored under the 'bleu' key of the metric result.
    """
    # Handing the data to compute() queues the batch and scores it in one call.
    scores = bleu.compute(predictions=predictions, references=references)
    return scores['bleu']

# Function to generate summaries
def generate_summary(dialogue):
    """Generate a summary for a single dialogue string.

    The text is tokenised (truncated to 1024 tokens), decoded with 4-beam
    search up to 128 tokens, and converted back to plain text.

    Args:
        dialogue: the raw dialogue text to summarise.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Keep the full encoding (input_ids and attention_mask) and put it on the
    # same device as the model, so generation also works if the model has been
    # moved to a GPU.
    inputs = tokenizer(
        dialogue, return_tensors="pt", max_length=1024, truncation=True
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=inputs["attention_mask"],
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Evaluate BLEU score on the test dataset.
# Read each column once; indexing the split row by row would decode every
# full example twice (once per column) just to pick out a single field.
test_dialogues = list(dataset_samsum['test']['dialogue'])
test_summaries = list(dataset_samsum['test']['summary'])

# One generation call per test dialogue (slow: beam search on every example)
predicted_summaries = [generate_summary(dialogue) for dialogue in test_dialogues]

# BLEU works on tokens: whitespace-split each prediction, and wrap each
# whitespace-split reference in a list (one reference per prediction).
bleu_score = compute_bleu(predictions=[pred.split() for pred in predicted_summaries],
                         references=[[ref.split()] for ref in test_summaries])
print(f"BLEU score: {bleu_score}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


BLEU score: 0.13333111566594982


In [19]:
from datasets import load_metric
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Release unused cached GPU memory before loading another copy of the model
torch.cuda.empty_cache()

# Fine-tuned weights come from the local checkpoint directory; the tokenizer
# is the stock t5-base one
model_path = './t5_base_trained'
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# ROUGE metric object used by compute_rouge
rouge = load_metric('rouge')

def compute_rouge(predictions, references):
    """Score predicted summaries against reference summaries with ROUGE.

    Args:
        predictions: generated summary strings.
        references: reference summary strings, aligned with `predictions`.

    Returns:
        A dict with keys 'rouge1', 'rouge2' and 'rougeL', each holding the
        `.mid` entry of the corresponding metric result.
    """
    # Handing the data to compute() queues the batch and scores it in one call.
    scores = rouge.compute(predictions=predictions, references=references)
    return {variant: scores[variant].mid for variant in ('rouge1', 'rouge2', 'rougeL')}

# Function to generate summaries
def generate_summary(dialogue):
    """Generate a summary for a single dialogue string.

    The text is tokenised (truncated to 1024 tokens), decoded with 4-beam
    search up to 128 tokens, and converted back to plain text.

    Args:
        dialogue: the raw dialogue text to summarise.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Keep the full encoding (input_ids and attention_mask) and put it on the
    # same device as the model, so generation also works if the model has been
    # moved to a GPU.
    inputs = tokenizer(
        dialogue, return_tensors="pt", max_length=1024, truncation=True
    ).to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly instead of letting generate() infer it.
            attention_mask=inputs["attention_mask"],
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Evaluate ROUGE score on the test dataset.
# Read each column once; indexing the split row by row would decode every
# full example twice (once per column) just to pick out a single field.
test_dialogues = list(dataset_samsum['test']['dialogue'])
test_summaries = list(dataset_samsum['test']['summary'])

# One generation call per test dialogue (slow: beam search on every example)
predicted_summaries = [generate_summary(dialogue) for dialogue in test_dialogues]

# ROUGE is computed on the raw strings; each printed value is the full
# score object returned by compute_rouge for that variant.
rouge_scores = compute_rouge(predictions=predicted_summaries, references=test_summaries)
print(f"ROUGE-1: {rouge_scores['rouge1']}")
print(f"ROUGE-2: {rouge_scores['rouge2']}")
print(f"ROUGE-L: {rouge_scores['rougeL']}")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


ROUGE-1: Score(precision=0.5960998689074908, recall=0.4502035098873436, fmeasure=0.48343058752187307)
ROUGE-2: Score(precision=0.3073526717509122, recall=0.22752341219904754, fmeasure=0.24603915147122501)
ROUGE-L: Score(precision=0.5021220111476851, recall=0.3793270528013324, fmeasure=0.40781455767281716)


In [26]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Fine-tuned model from the local checkpoint directory (adjust the path to
# your trained model), paired with the stock t5-base tokenizer
model = T5ForConditionalGeneration.from_pretrained('./t5_base_trained')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Example input text (an e-mail style policy-update notice rather than a chat
# dialogue) and the reference summary that is printed alongside the model output.
# The trailing spaces and line breaks inside the literals are part of the data.
sample_text = """We've updated our terms of use and privacy policy 

Hello Codewarrior,We’ve updated our Terms of Use and Privacy Policy, which took effect on June 14, 
2024. These updates are intended to make it easier for you to understand how we use your information, 
our content standards, and changes to our contact details. 
You can review the full updated Terms of Use and Privacy Policy linked above. We encourage you to read 
them carefully and contact us at info@codewars.com if you have any questions or concerns. 
Some of the key changes include: Explaining how we share your personal information with our affiliates, 
vendors, service providers, partners, and third parties for various purposes. 
Outlining your privacy rights and choices depending on your location and applicable laws.Updating our 
content standards and prohibited uses for the website and the interactive services we offer, such as 
Discussions, Discord, Kata discourse, etc.  
By continuing to use our website or services, you agree to the updated Terms of Use and Privacy Policy. 
The Codewars Team """
reference = """Codewars updated its Terms of Use and Privacy Policy on June 14, 
2024, to clarify information usage, content standards, and contact details. 
Changes include how personal information is shared, privacy rights based on location, 
and updated content standards for website use. Users are encouraged to review the full 
updates and contact Codewars at info@codewars.com for questions"""

# Encode the prompt (task prefix + sample text), truncated to 512 tokens.
# NOTE(review): the training and test-set evaluation cells feed the text
# without a "summarize: " prefix and truncate at 1024 tokens; confirm the
# different settings here are intentional.
inputs = tokenizer.encode(
    "summarize: " + sample_text,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Beam-search decoding, then convert the best sequence back to plain text
summary_ids = model.generate(
    inputs,
    num_beams=4,
    max_length=150,
    length_penalty=2.0,
    early_stopping=True,
)
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Show the input, the reference and the model output, each under its heading
# (heading and body are separated by a newline, exactly like two print calls)
print("Dialogue:", sample_text, sep="\n")
print("\nReference Summary:", reference, sep="\n")
print("\nModel Summary:", generated_summary, sep="\n")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dialogue:
We've updated our terms of use and privacy policy 

Hello Codewarrior,We’ve updated our Terms of Use and Privacy Policy, which took effect on June 14, 
2024. These updates are intended to make it easier for you to understand how we use your information, 
our content standards, and changes to our contact details. 
You can review the full updated Terms of Use and Privacy Policy linked above. We encourage you to read 
them carefully and contact us at info@codewars.com if you have any questions or concerns. 
Some of the key changes include: Explaining how we share your personal information with our affiliates, 
vendors, service providers, partners, and third parties for various purposes. 
Outlining your privacy rights and choices depending on your location and applicable laws.Updating our 
content standards and prohibited uses for the website and the interactive services we offer, such as 
Discussions, Discord, Kata discourse, etc.  
By continuing to use our website or services, 