In [2]:
import os
import random

import torch
from torch.utils.data import DataLoader
from miditok import REMI, TokenizerConfig, MIDITokenizer, TokSequence
from miditok.pytorch_data import DatasetMIDI, DatasetJSON, DataCollator, split_midis_for_training
from miditok.data_augmentation import augment_midi_dataset
from miditok.utils import get_midi_programs
from pathlib import Path
from symusic import Score
import wandb
from tqdm import tqdm

from transformers.models.opt.modeling_opt import OPTForCausalLM
from transformers import AutoTokenizer

## Tokenizer

In [3]:
midi_paths = list(Path("/home/lklimkiewicz/priv/midi/data/vgmusic").glob("**/*.mid")) + \
    list(Path("/home/lklimkiewicz/priv/midi/data/khinsider").glob("**/*.mid"))

In [3]:
tokenizer_config = TokenizerConfig(num_velocities=16, use_chords=True, use_programs=True)
tokenizer = REMI(tokenizer_config)

In [None]:
random.shuffle(midi_paths)
tokenizer.learn_bpe(vocab_size=30000, files_paths=midi_paths[:10000])

In [6]:
# saving
tokenizer.save_pretrained('../logs/tokenizer')

In [6]:
# loading
tokenizer = MIDITokenizer.from_pretrained('../logs/tokenizer')

config.json not found in /home/lklimkiewicz/priv/midi/logs/tokenizer


In [None]:
tokenizer = AutoTokenizer.from_pretrained('../logs/tokenizer')

In [11]:
tokenizer.push_to_hub('midi-ganerator-game')

CommitInfo(commit_url='https://huggingface.co/lklimkiewicz/midi-ganerator-game/commit/777c949a787add8470f4bc1f8a922a40a8fdf47e', commit_message='Push model using huggingface_hub.', commit_description='', oid='777c949a787add8470f4bc1f8a922a40a8fdf47e', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
MIDITokenizer.from_pretrained('lklimkiewicz/midi_tokenizer')

30000 tokens with ('T',) io format(one token stream), with BPE

## Model

In [6]:
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM

In [7]:
config = AutoConfig.from_pretrained(
    'facebook/opt-125m',
    bos_token_id=tokenizer['BOS_None'],
    eos_token_id=tokenizer['EOS_None'],
    pad_token_id=tokenizer['PAD_None'],
    vocab_size=len(tokenizer),
    prefix=None,
    max_length=1024,
    do_sample=True,
)

In [9]:
model = AutoModelForCausalLM.from_config(config)

In [12]:
model.config.bos_token_id

1

In [11]:
model.tokenizer

AttributeError: 'OPTForCausalLM' object has no attribute 'tokenizer'

In [None]:
type(model)

transformers.models.opt.modeling_opt.OPTForCausalLM

## Split dataset

In [None]:
midi_paths = list(Path("/home/lklimkiewicz/priv/midi/data/vgmusic").glob("**/*.mid")) + \
    list(Path("/home/lklimkiewicz/priv/midi/data/khinsider").glob("**/*.mid"))

In [None]:
def filter_dataset(paths, val_fun):
    correct = []
    for path in tqdm(paths):
        try:
            midi = Score(path)
            if val_fun(midi):
                correct.append(path)
        except:
            pass
    return correct

In [None]:
print('Initial count:', len(midi_paths))

def midi_valid(midi) -> bool:
    if midi.note_num() < 50 or len(midi.time_signatures) == 0 or len(midi.tempos) == 0:
        return False
    
    for time_sig in midi.time_signatures:
        if time_sig.denominator == 0 or time_sig.numerator == 0:
            return False
    
    return True

midi_paths = filter_dataset(midi_paths, midi_valid)

print('Filtered count:', len(midi_paths))

Initial count: 40447


100%|██████████| 40447/40447 [00:10<00:00, 3773.71it/s]

Filtered count: 40442





In [None]:
split_midis_for_training(
    files_paths=midi_paths,
    tokenizer=tokenizer,
    save_dir=Path('./chunks_for_training'),
    max_seq_len=1024,
)

## Tokenize dataset

In [4]:
midi_paths = list(Path("/home/lklimkiewicz/priv/midi/src/chunks_for_training/khinsider").glob("**/*.mid"))
tokenizer.tokenize_midi_dataset(midi_paths, out_dir="tokenized_dataset/khinsider", save_programs=True)

Tokenizing MIDIs (tokenized_dataset/khinsider):  44%|████▍     | 42160/95341 [09:03<15:17, 57.94it/s] 

.

Tokenizing MIDIs (tokenized_dataset/khinsider):  44%|████▍     | 42198/95341 [09:04<16:28, 53.78it/s]

.

Tokenizing MIDIs (tokenized_dataset/khinsider): 100%|██████████| 95341/95341 [21:09<00:00, 75.09it/s] 


## Augment dataset

In [7]:
augment_midi_dataset(
    Path('/home/lklimkiewicz/priv/midi/src/tokenized_dataset'),
    pitch_offsets=[-12, 12],
    velocity_offsets=[-4, 5],
    duration_offsets=[-0.5, 1],
    out_path="./augmented_dataset",
)

Performing data augmentation: 0it [00:00, ?it/s]


## Load dataset

In [8]:
# dataset = DatasetMIDI(
#     files_paths=midi_paths,
#     tokenizer=tokenizer,
#     max_seq_len=1024,
#     bos_token_id=tokenizer["BOS_None"],
#     eos_token_id=tokenizer["EOS_None"],
# )

json_paths = list(Path("/home/lklimkiewicz/priv/midi/src/tokenized_dataset").glob("**/*.json"))

dataset = DatasetJSON(
    files_paths=json_paths,
    max_seq_len=1024,
    bos_token_id=tokenizer["BOS_None"],
    eos_token_id=tokenizer["EOS_None"],
)

collator = DataCollator(
    tokenizer["PAD_None"],
    copy_inputs_as_labels=True,
    shift_labels=True,
)

## Train

In [11]:
from transformers import Trainer, TrainingArguments, TrainerCallback, TrainerState, TrainerControl

In [12]:
class MidiGenerationCallback(TrainerCallback):
    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if state.global_step % 500 == 0:
            
            input = torch.tensor([[tokenizer['BOS_None']]], device=model.device)
            midi = model.generate(input, max_new_tokens=1024)
            generated_ts = TokSequence(ids=midi.tolist()[0], ids_bpe_encoded=True)
            generated_score = tokenizer(generated_ts)
            generated_score.dump_midi(f'outputs/v1/step-{state.global_step}.mid')
            
            input = torch.tensor([[4]], device=model.device)
            midi = model.generate(input, max_new_tokens=1024)
            generated_ts = TokSequence(ids=midi.tolist()[0], ids_bpe_encoded=True)
            generated_score = tokenizer(generated_ts)
            generated_score.dump_midi(f'outputs/v2/step-{state.global_step}.mid')

In [13]:
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,
    report_to="wandb",
    bf16=True,
    dataloader_num_workers=16,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    lr_scheduler_type="cosine",
    warmup_steps=600,
    save_steps=1000,
    save_total_limit=5,
    num_train_epochs=2,
    label_smoothing_factor=0.2,
    torch_compile=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collator,
    tokenizer=tokenizer,
    callbacks=[MidiGenerationCallback()]
)

In [15]:
os.environ["WANDB_PROJECT"] = "midi"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [16]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mweights-and-biases[0m ([33mklima7-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 100/117610 [01:37<9:22:59,  3.48it/s]

{'loss': 10.381, 'learning_rate': 8.333333333333334e-06, 'epoch': 0.0}


  0%|          | 200/117610 [02:06<9:13:33,  3.54it/s]

{'loss': 10.0142, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.0}


  0%|          | 300/117610 [02:37<9:25:33,  3.46it/s] 

{'loss': 9.6699, 'learning_rate': 2.5e-05, 'epoch': 0.01}


  0%|          | 400/117610 [03:06<9:23:04,  3.47it/s]

{'loss': 9.4481, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.01}


  0%|          | 500/117610 [04:09<181:06:43,  5.57s/it]

{'loss': 9.3163, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.01}


  1%|          | 600/117610 [04:38<8:36:49,  3.77it/s]  

{'loss': 9.2035, 'learning_rate': 5e-05, 'epoch': 0.01}


  1%|          | 700/117610 [05:04<8:28:12,  3.83it/s]

{'loss': 9.1329, 'learning_rate': 4.99999098919609e-05, 'epoch': 0.01}


  1%|          | 800/117610 [05:31<8:20:03,  3.89it/s]

{'loss': 9.0203, 'learning_rate': 4.999963956849317e-05, 'epoch': 0.01}


  1%|          | 900/117610 [05:58<8:50:04,  3.67it/s]

{'loss': 8.9732, 'learning_rate': 4.9999189031545456e-05, 'epoch': 0.02}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 8.9189, 'learning_rate': 4.999855828436554e-05, 'epoch': 0.02}


  1%|          | 1100/117610 [07:08<8:49:34,  3.67it/s]  

{'loss': 8.8789, 'learning_rate': 4.999774733150024e-05, 'epoch': 0.02}


  1%|          | 1200/117610 [07:35<8:38:14,  3.74it/s]

{'loss': 8.8017, 'learning_rate': 4.999675617879542e-05, 'epoch': 0.02}


  1%|          | 1300/117610 [08:01<8:45:47,  3.69it/s]

{'loss': 8.7146, 'learning_rate': 4.999558483339596e-05, 'epoch': 0.02}


  1%|          | 1400/117610 [08:28<8:40:08,  3.72it/s]

{'loss': 8.7279, 'learning_rate': 4.9994233303745676e-05, 'epoch': 0.02}


  1%|▏         | 1500/117610 [09:09<147:51:46,  4.58s/it]

{'loss': 8.6569, 'learning_rate': 4.9992701599587236e-05, 'epoch': 0.03}


  1%|▏         | 1600/117610 [09:36<8:35:26,  3.75it/s]  

{'loss': 8.6036, 'learning_rate': 4.999098973196217e-05, 'epoch': 0.03}


  1%|▏         | 1700/117610 [10:02<8:12:26,  3.92it/s]

{'loss': 8.5577, 'learning_rate': 4.998909771321072e-05, 'epoch': 0.03}


  2%|▏         | 1800/117610 [10:29<8:45:28,  3.67it/s]

{'loss': 8.5143, 'learning_rate': 4.998702555697177e-05, 'epoch': 0.03}


  2%|▏         | 1900/117610 [10:56<7:57:52,  4.04it/s]

{'loss': 8.5127, 'learning_rate': 4.998477327818275e-05, 'epoch': 0.03}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 8.4886, 'learning_rate': 4.9982340893079536e-05, 'epoch': 0.03}


  2%|▏         | 2100/117610 [12:04<8:30:55,  3.77it/s]  

{'loss': 8.4315, 'learning_rate': 4.9979728419196336e-05, 'epoch': 0.04}


  2%|▏         | 2200/117610 [12:31<8:16:51,  3.87it/s]

{'loss': 8.3718, 'learning_rate': 4.997693587536553e-05, 'epoch': 0.04}


  2%|▏         | 2300/117610 [12:57<8:24:29,  3.81it/s]

{'loss': 8.3432, 'learning_rate': 4.997396328171757e-05, 'epoch': 0.04}


  2%|▏         | 2400/117610 [13:24<8:13:45,  3.89it/s]

{'loss': 8.3353, 'learning_rate': 4.9970810659680826e-05, 'epoch': 0.04}


  2%|▏         | 2500/117610 [14:05<149:20:11,  4.67s/it]

{'loss': 8.2817, 'learning_rate': 4.996747803198143e-05, 'epoch': 0.04}


  2%|▏         | 2600/117610 [14:31<8:46:35,  3.64it/s]  

{'loss': 8.1699, 'learning_rate': 4.99639654226431e-05, 'epoch': 0.04}


  2%|▏         | 2700/117610 [14:58<8:27:48,  3.77it/s]

{'loss': 8.0821, 'learning_rate': 4.9960272856986974e-05, 'epoch': 0.05}


  2%|▏         | 2800/117610 [15:24<8:37:02,  3.70it/s]

{'loss': 8.0186, 'learning_rate': 4.9956400361631455e-05, 'epoch': 0.05}


  2%|▏         | 2900/117610 [15:50<8:33:09,  3.73it/s]

{'loss': 7.965, 'learning_rate': 4.9952347964491976e-05, 'epoch': 0.05}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 7.8947, 'learning_rate': 4.994811569478082e-05, 'epoch': 0.05}


  3%|▎         | 3100/117610 [17:00<8:31:29,  3.73it/s]  

{'loss': 7.8504, 'learning_rate': 4.9943703583006905e-05, 'epoch': 0.05}


  3%|▎         | 3200/117610 [17:26<8:09:05,  3.90it/s]

{'loss': 7.7464, 'learning_rate': 4.993911166097558e-05, 'epoch': 0.05}


  3%|▎         | 3300/117610 [17:53<8:30:58,  3.73it/s]

{'loss': 7.6889, 'learning_rate': 4.993433996178837e-05, 'epoch': 0.06}


  3%|▎         | 3400/117610 [18:19<8:33:50,  3.70it/s]

{'loss': 7.7014, 'learning_rate': 4.992938851984273e-05, 'epoch': 0.06}


  3%|▎         | 3500/117610 [19:00<146:03:08,  4.61s/it]

{'loss': 7.6685, 'learning_rate': 4.992425737083188e-05, 'epoch': 0.06}


  3%|▎         | 3600/117610 [19:27<8:34:12,  3.70it/s]  

{'loss': 7.5379, 'learning_rate': 4.991894655174441e-05, 'epoch': 0.06}


  3%|▎         | 3700/117610 [19:53<8:11:34,  3.86it/s]

{'loss': 7.5658, 'learning_rate': 4.991345610086412e-05, 'epoch': 0.06}


  3%|▎         | 3800/117610 [20:20<8:18:19,  3.81it/s]

{'loss': 7.486, 'learning_rate': 4.9907786057769726e-05, 'epoch': 0.06}


  3%|▎         | 3900/117610 [20:46<7:59:26,  3.95it/s]

{'loss': 7.4266, 'learning_rate': 4.990193646333454e-05, 'epoch': 0.07}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 7.4244, 'learning_rate': 4.989590735972619e-05, 'epoch': 0.07}


  3%|▎         | 4100/117610 [21:55<8:03:37,  3.91it/s]  

{'loss': 7.4152, 'learning_rate': 4.988969879040635e-05, 'epoch': 0.07}


  4%|▎         | 4200/117610 [22:21<8:29:43,  3.71it/s]

{'loss': 7.367, 'learning_rate': 4.9883310800130366e-05, 'epoch': 0.07}


  4%|▎         | 4300/117610 [22:48<8:24:11,  3.75it/s]

{'loss': 7.3087, 'learning_rate': 4.9876743434946994e-05, 'epoch': 0.07}


  4%|▎         | 4400/117610 [23:14<8:38:40,  3.64it/s]

{'loss': 7.3471, 'learning_rate': 4.986999674219801e-05, 'epoch': 0.07}


  4%|▍         | 4500/117610 [23:55<143:02:15,  4.55s/it]

{'loss': 7.2907, 'learning_rate': 4.9863070770517925e-05, 'epoch': 0.08}


  4%|▍         | 4600/117610 [24:22<8:21:24,  3.76it/s]  

{'loss': 7.3025, 'learning_rate': 4.985596556983359e-05, 'epoch': 0.08}


  4%|▍         | 4700/117610 [24:49<8:23:10,  3.74it/s]

{'loss': 7.2519, 'learning_rate': 4.984868119136387e-05, 'epoch': 0.08}


  4%|▍         | 4800/117610 [25:15<8:19:23,  3.76it/s]

{'loss': 7.241, 'learning_rate': 4.9841217687619246e-05, 'epoch': 0.08}


  4%|▍         | 4900/117610 [25:41<8:34:22,  3.65it/s]

{'loss': 7.2109, 'learning_rate': 4.983357511240145e-05, 'epoch': 0.08}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 7.1768, 'learning_rate': 4.982575352080308e-05, 'epoch': 0.09}


  4%|▍         | 5100/117610 [26:50<8:24:46,  3.71it/s]  

{'loss': 7.2237, 'learning_rate': 4.9817752969207195e-05, 'epoch': 0.09}


  4%|▍         | 5200/117610 [27:17<8:20:37,  3.74it/s]

{'loss': 7.1056, 'learning_rate': 4.9809573515286935e-05, 'epoch': 0.09}


  5%|▍         | 5300/117610 [27:43<8:14:58,  3.78it/s]

{'loss': 7.1199, 'learning_rate': 4.9801215218005045e-05, 'epoch': 0.09}


  5%|▍         | 5400/117610 [28:10<8:26:11,  3.69it/s]

{'loss': 7.0969, 'learning_rate': 4.979267813761351e-05, 'epoch': 0.09}


  5%|▍         | 5500/117610 [28:51<143:37:48,  4.61s/it]

{'loss': 7.1107, 'learning_rate': 4.97839623356531e-05, 'epoch': 0.09}


  5%|▍         | 5600/117610 [29:17<8:22:34,  3.71it/s]  

{'loss': 7.0351, 'learning_rate': 4.977506787495292e-05, 'epoch': 0.1}


  5%|▍         | 5700/117610 [29:44<7:59:48,  3.89it/s]

{'loss': 7.055, 'learning_rate': 4.9765994819629955e-05, 'epoch': 0.1}


  5%|▍         | 5801/117610 [30:10<7:51:04,  3.96it/s]

{'loss': 6.9585, 'learning_rate': 4.975674323508864e-05, 'epoch': 0.1}


  5%|▌         | 5900/117610 [30:37<8:16:55,  3.75it/s]

{'loss': 7.021, 'learning_rate': 4.974731318802034e-05, 'epoch': 0.1}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 7.0099, 'learning_rate': 4.9737704746402886e-05, 'epoch': 0.1}


  5%|▌         | 6101/117610 [31:46<7:56:21,  3.90it/s]  

{'loss': 6.9326, 'learning_rate': 4.972791797950011e-05, 'epoch': 0.1}


  5%|▌         | 6200/117610 [32:12<8:29:26,  3.64it/s]

{'loss': 7.0109, 'learning_rate': 4.971795295786134e-05, 'epoch': 0.11}


  5%|▌         | 6300/117610 [32:38<8:20:16,  3.71it/s]

{'loss': 6.9074, 'learning_rate': 4.9707809753320844e-05, 'epoch': 0.11}


  5%|▌         | 6400/117610 [33:05<8:12:01,  3.77it/s]

{'loss': 6.92, 'learning_rate': 4.9697488438997366e-05, 'epoch': 0.11}


  6%|▌         | 6500/117610 [33:45<140:41:05,  4.56s/it]

{'loss': 6.9438, 'learning_rate': 4.968698908929358e-05, 'epoch': 0.11}


  6%|▌         | 6600/117610 [34:12<8:14:36,  3.74it/s]  

{'loss': 6.8668, 'learning_rate': 4.9676311779895545e-05, 'epoch': 0.11}


  6%|▌         | 6700/117610 [34:39<7:54:53,  3.89it/s]

{'loss': 6.8572, 'learning_rate': 4.9665456587772175e-05, 'epoch': 0.11}


  6%|▌         | 6800/117610 [35:05<8:01:33,  3.84it/s]

{'loss': 6.8719, 'learning_rate': 4.965442359117469e-05, 'epoch': 0.12}


  6%|▌         | 6900/117610 [35:32<8:22:55,  3.67it/s] 

{'loss': 6.845, 'learning_rate': 4.964321286963601e-05, 'epoch': 0.12}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.8699, 'learning_rate': 4.9631824503970225e-05, 'epoch': 0.12}


  6%|▌         | 7100/117610 [36:40<7:54:06,  3.88it/s]  

{'loss': 6.8057, 'learning_rate': 4.9620258576272017e-05, 'epoch': 0.12}


  6%|▌         | 7200/117610 [37:07<8:24:47,  3.65it/s]

{'loss': 6.7865, 'learning_rate': 4.9608515169916e-05, 'epoch': 0.12}


  6%|▌         | 7300/117610 [37:33<8:00:04,  3.83it/s]

{'loss': 6.7577, 'learning_rate': 4.9596594369556235e-05, 'epoch': 0.12}


  6%|▋         | 7400/117610 [38:00<7:45:49,  3.94it/s]

{'loss': 6.7784, 'learning_rate': 4.9584496261125494e-05, 'epoch': 0.13}


  6%|▋         | 7500/117610 [38:41<142:49:55,  4.67s/it]

{'loss': 6.7227, 'learning_rate': 4.9572220931834736e-05, 'epoch': 0.13}


  6%|▋         | 7600/117610 [39:08<7:48:40,  3.91it/s]  

{'loss': 6.8136, 'learning_rate': 4.9559768470172416e-05, 'epoch': 0.13}


  7%|▋         | 7700/117610 [39:34<7:39:41,  3.98it/s]

{'loss': 6.7805, 'learning_rate': 4.9547138965903894e-05, 'epoch': 0.13}


  7%|▋         | 7800/117610 [40:00<8:03:34,  3.78it/s]

{'loss': 6.7485, 'learning_rate': 4.9534332510070764e-05, 'epoch': 0.13}


  7%|▋         | 7900/117610 [40:27<8:08:54,  3.74it/s]

{'loss': 6.7371, 'learning_rate': 4.952134919499019e-05, 'epoch': 0.13}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.6997, 'learning_rate': 4.9508189114254246e-05, 'epoch': 0.14}


  7%|▋         | 8100/117610 [41:36<8:08:52,  3.73it/s]  

{'loss': 6.6818, 'learning_rate': 4.949485236272928e-05, 'epoch': 0.14}


  7%|▋         | 8200/117610 [42:02<8:11:29,  3.71it/s]

{'loss': 6.7014, 'learning_rate': 4.948133903655516e-05, 'epoch': 0.14}


  7%|▋         | 8300/117610 [42:29<7:52:20,  3.86it/s]

{'loss': 6.6804, 'learning_rate': 4.946764923314463e-05, 'epoch': 0.14}


  7%|▋         | 8400/117610 [42:55<7:44:16,  3.92it/s]

{'loss': 6.6705, 'learning_rate': 4.9453783051182614e-05, 'epoch': 0.14}


  7%|▋         | 8500/117610 [43:36<138:51:27,  4.58s/it]

{'loss': 6.65, 'learning_rate': 4.9439740590625445e-05, 'epoch': 0.14}


  7%|▋         | 8600/117610 [44:02<8:00:15,  3.78it/s]  

{'loss': 6.6241, 'learning_rate': 4.9425521952700227e-05, 'epoch': 0.15}


  7%|▋         | 8700/117610 [44:29<8:12:50,  3.68it/s]

{'loss': 6.6215, 'learning_rate': 4.941112723990404e-05, 'epoch': 0.15}


  7%|▋         | 8800/117610 [44:55<8:11:33,  3.69it/s]

{'loss': 6.5932, 'learning_rate': 4.9396556556003235e-05, 'epoch': 0.15}


  8%|▊         | 8900/117610 [45:22<7:45:53,  3.89it/s]

{'loss': 6.6154, 'learning_rate': 4.938181000603267e-05, 'epoch': 0.15}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.598, 'learning_rate': 4.936688769629497e-05, 'epoch': 0.15}


  8%|▊         | 9100/117610 [46:30<7:51:16,  3.84it/s]  

{'loss': 6.555, 'learning_rate': 4.9351789734359724e-05, 'epoch': 0.15}


  8%|▊         | 9200/117610 [46:56<7:53:59,  3.81it/s]

{'loss': 6.5498, 'learning_rate': 4.933651622906276e-05, 'epoch': 0.16}


  8%|▊         | 9300/117610 [47:23<8:18:29,  3.62it/s]

{'loss': 6.5221, 'learning_rate': 4.932106729050533e-05, 'epoch': 0.16}


  8%|▊         | 9400/117610 [47:49<7:38:10,  3.94it/s]

{'loss': 6.5535, 'learning_rate': 4.930544303005331e-05, 'epoch': 0.16}


  8%|▊         | 9500/117610 [48:30<137:11:45,  4.57s/it]

{'loss': 6.5461, 'learning_rate': 4.928964356033643e-05, 'epoch': 0.16}


  8%|▊         | 9600/117610 [48:57<7:54:42,  3.79it/s]  

{'loss': 6.5478, 'learning_rate': 4.9273668995247424e-05, 'epoch': 0.16}


  8%|▊         | 9700/117610 [49:23<8:08:32,  3.68it/s]

{'loss': 6.558, 'learning_rate': 4.9257519449941224e-05, 'epoch': 0.16}


  8%|▊         | 9800/117610 [49:50<8:03:02,  3.72it/s]

{'loss': 6.4985, 'learning_rate': 4.924119504083414e-05, 'epoch': 0.17}


  8%|▊         | 9900/117610 [50:16<7:52:23,  3.80it/s]

{'loss': 6.4835, 'learning_rate': 4.922469588560302e-05, 'epoch': 0.17}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.4784, 'learning_rate': 4.920802210318438e-05, 'epoch': 0.17}


  9%|▊         | 10100/117610 [51:25<7:53:36,  3.78it/s]  

{'loss': 6.4966, 'learning_rate': 4.919117381377357e-05, 'epoch': 0.17}


  9%|▊         | 10200/117610 [51:51<8:05:29,  3.69it/s]

{'loss': 6.5242, 'learning_rate': 4.9174151138823896e-05, 'epoch': 0.17}


  9%|▉         | 10300/117610 [52:17<7:45:31,  3.84it/s]

{'loss': 6.4712, 'learning_rate': 4.915695420104575e-05, 'epoch': 0.18}


  9%|▉         | 10400/117610 [52:44<7:43:16,  3.86it/s]

{'loss': 6.4686, 'learning_rate': 4.9139583124405694e-05, 'epoch': 0.18}


  9%|▉         | 10500/117610 [53:25<135:34:18,  4.56s/it]

{'loss': 6.4602, 'learning_rate': 4.912203803412566e-05, 'epoch': 0.18}


  9%|▉         | 10600/117610 [53:51<7:50:57,  3.79it/s]  

{'loss': 6.4347, 'learning_rate': 4.9104319056681916e-05, 'epoch': 0.18}


  9%|▉         | 10700/117610 [54:18<8:01:34,  3.70it/s]

{'loss': 6.4647, 'learning_rate': 4.908642631980426e-05, 'epoch': 0.18}


  9%|▉         | 10800/117610 [54:45<7:51:20,  3.78it/s]

{'loss': 6.5104, 'learning_rate': 4.906835995247503e-05, 'epoch': 0.18}


  9%|▉         | 10900/117610 [55:11<8:00:16,  3.70it/s]

{'loss': 6.4663, 'learning_rate': 4.9050120084928234e-05, 'epoch': 0.19}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.4596, 'learning_rate': 4.903170684864857e-05, 'epoch': 0.19}


  9%|▉         | 11100/117610 [56:20<8:01:05,  3.69it/s]  

{'loss': 6.4346, 'learning_rate': 4.901312037637048e-05, 'epoch': 0.19}


 10%|▉         | 11200/117610 [56:47<7:59:10,  3.70it/s]

{'loss': 6.4202, 'learning_rate': 4.899436080207721e-05, 'epoch': 0.19}


 10%|▉         | 11300/117610 [57:13<7:54:01,  3.74it/s]

{'loss': 6.448, 'learning_rate': 4.897542826099984e-05, 'epoch': 0.19}


 10%|▉         | 11400/117610 [57:39<7:26:16,  3.97it/s]

{'loss': 6.4034, 'learning_rate': 4.89563228896163e-05, 'epoch': 0.19}


 10%|▉         | 11500/117610 [58:20<135:47:18,  4.61s/it]

{'loss': 6.4017, 'learning_rate': 4.893704482565039e-05, 'epoch': 0.2}


 10%|▉         | 11600/117610 [58:46<7:46:43,  3.79it/s]  

{'loss': 6.4022, 'learning_rate': 4.89175942080708e-05, 'epoch': 0.2}


 10%|▉         | 11700/117610 [59:13<8:05:30,  3.64it/s]

{'loss': 6.3922, 'learning_rate': 4.88979711770901e-05, 'epoch': 0.2}


 10%|█         | 11800/117610 [59:39<7:40:05,  3.83it/s]

{'loss': 6.391, 'learning_rate': 4.88781758741637e-05, 'epoch': 0.2}


 10%|█         | 11900/117610 [1:00:06<7:55:25,  3.71it/s]

{'loss': 6.3847, 'learning_rate': 4.8858208441988874e-05, 'epoch': 0.2}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.3954, 'learning_rate': 4.8838069024503726e-05, 'epoch': 0.2}


 10%|█         | 12100/117610 [1:01:14<7:35:33,  3.86it/s]  

{'loss': 6.3756, 'learning_rate': 4.881775776688612e-05, 'epoch': 0.21}


 10%|█         | 12200/117610 [1:01:40<7:35:45,  3.85it/s]

{'loss': 6.3625, 'learning_rate': 4.879727481555267e-05, 'epoch': 0.21}


 10%|█         | 12300/117610 [1:02:07<7:56:15,  3.69it/s]

{'loss': 6.3319, 'learning_rate': 4.877662031815766e-05, 'epoch': 0.21}


 11%|█         | 12400/117610 [1:02:33<7:45:15,  3.77it/s]

{'loss': 6.323, 'learning_rate': 4.8755794423592e-05, 'epoch': 0.21}


 11%|█         | 12500/117610 [1:03:14<134:45:23,  4.62s/it]

{'loss': 6.305, 'learning_rate': 4.873479728198211e-05, 'epoch': 0.21}


 11%|█         | 12600/117610 [1:03:41<7:40:42,  3.80it/s]  

{'loss': 6.3475, 'learning_rate': 4.8713629044688916e-05, 'epoch': 0.21}


 11%|█         | 12700/117610 [1:04:07<7:39:38,  3.80it/s]

{'loss': 6.3305, 'learning_rate': 4.869228986430667e-05, 'epoch': 0.22}


 11%|█         | 12800/117610 [1:04:34<7:41:37,  3.78it/s]

{'loss': 6.3191, 'learning_rate': 4.867077989466191e-05, 'epoch': 0.22}


 11%|█         | 12900/117610 [1:05:00<7:40:56,  3.79it/s]

{'loss': 6.3485, 'learning_rate': 4.8649099290812335e-05, 'epoch': 0.22}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.3144, 'learning_rate': 4.862724820904567e-05, 'epoch': 0.22}


 11%|█         | 13100/117610 [1:06:08<7:46:23,  3.73it/s]  

{'loss': 6.3271, 'learning_rate': 4.860522680687858e-05, 'epoch': 0.22}


 11%|█         | 13200/117610 [1:06:35<7:39:36,  3.79it/s]

{'loss': 6.2803, 'learning_rate': 4.8583035243055496e-05, 'epoch': 0.22}


 11%|█▏        | 13300/117610 [1:07:02<7:13:47,  4.01it/s]

{'loss': 6.2504, 'learning_rate': 4.8560673677547465e-05, 'epoch': 0.23}


 11%|█▏        | 13400/117610 [1:07:28<7:41:30,  3.76it/s]

{'loss': 6.3013, 'learning_rate': 4.853814227155105e-05, 'epoch': 0.23}


 11%|█▏        | 13500/117610 [1:08:09<131:14:16,  4.54s/it]

{'loss': 6.3074, 'learning_rate': 4.8515441187487096e-05, 'epoch': 0.23}


 12%|█▏        | 13600/117610 [1:08:36<7:57:25,  3.63it/s]  

{'loss': 6.2975, 'learning_rate': 4.849257058899964e-05, 'epoch': 0.23}


 12%|█▏        | 13700/117610 [1:09:02<7:16:28,  3.97it/s]

{'loss': 6.2679, 'learning_rate': 4.846953064095465e-05, 'epoch': 0.23}


 12%|█▏        | 13800/117610 [1:09:29<7:48:45,  3.69it/s]

{'loss': 6.2977, 'learning_rate': 4.8446321509438894e-05, 'epoch': 0.23}


 12%|█▏        | 13900/117610 [1:09:55<7:48:39,  3.69it/s]

{'loss': 6.2623, 'learning_rate': 4.8422943361758716e-05, 'epoch': 0.24}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.2712, 'learning_rate': 4.8399396366438845e-05, 'epoch': 0.24}


 12%|█▏        | 14100/117610 [1:11:04<7:28:09,  3.85it/s]  

{'loss': 6.2361, 'learning_rate': 4.8375680693221146e-05, 'epoch': 0.24}


 12%|█▏        | 14200/117610 [1:11:30<7:27:57,  3.85it/s]

{'loss': 6.2133, 'learning_rate': 4.835179651306347e-05, 'epoch': 0.24}


 12%|█▏        | 14300/117610 [1:11:57<7:53:25,  3.64it/s]

{'loss': 6.2036, 'learning_rate': 4.8327743998138345e-05, 'epoch': 0.24}


 12%|█▏        | 14400/117610 [1:12:23<7:50:09,  3.66it/s]

{'loss': 6.2247, 'learning_rate': 4.830352332183176e-05, 'epoch': 0.24}


 12%|█▏        | 14500/117610 [1:13:04<131:39:03,  4.60s/it]

{'loss': 6.2095, 'learning_rate': 4.827913465874192e-05, 'epoch': 0.25}


 12%|█▏        | 14600/117610 [1:13:31<7:17:05,  3.93it/s]  

{'loss': 6.2584, 'learning_rate': 4.825457818467801e-05, 'epoch': 0.25}


 12%|█▏        | 14700/117610 [1:13:57<7:40:13,  3.73it/s]

{'loss': 6.2372, 'learning_rate': 4.822985407665887e-05, 'epoch': 0.25}


 13%|█▎        | 14800/117610 [1:14:24<7:25:49,  3.84it/s]

{'loss': 6.2137, 'learning_rate': 4.820496251291179e-05, 'epoch': 0.25}


 13%|█▎        | 14900/117610 [1:14:50<7:44:50,  3.68it/s]

{'loss': 6.2009, 'learning_rate': 4.817990367287115e-05, 'epoch': 0.25}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.1798, 'learning_rate': 4.81546777371772e-05, 'epoch': 0.26}


 13%|█▎        | 15100/117610 [1:15:59<7:41:30,  3.70it/s]  

{'loss': 6.2355, 'learning_rate': 4.812928488767471e-05, 'epoch': 0.26}


 13%|█▎        | 15200/117610 [1:16:25<7:23:51,  3.85it/s]

{'loss': 6.2255, 'learning_rate': 4.810372530741165e-05, 'epoch': 0.26}


 13%|█▎        | 15300/117610 [1:16:52<7:42:54,  3.68it/s]

{'loss': 6.2313, 'learning_rate': 4.807799918063794e-05, 'epoch': 0.26}


 13%|█▎        | 15400/117610 [1:17:18<7:33:25,  3.76it/s]

{'loss': 6.1838, 'learning_rate': 4.8052106692804026e-05, 'epoch': 0.26}


 13%|█▎        | 15500/117610 [1:17:59<130:16:26,  4.59s/it]

{'loss': 6.2102, 'learning_rate': 4.802604803055962e-05, 'epoch': 0.26}


 13%|█▎        | 15600/117610 [1:18:26<7:39:00,  3.70it/s]  

{'loss': 6.1553, 'learning_rate': 4.799982338175232e-05, 'epoch': 0.27}


 13%|█▎        | 15700/117610 [1:18:52<7:32:39,  3.75it/s]

{'loss': 6.1489, 'learning_rate': 4.797343293542626e-05, 'epoch': 0.27}


 13%|█▎        | 15800/117610 [1:19:19<7:33:13,  3.74it/s]

{'loss': 6.2053, 'learning_rate': 4.794687688182075e-05, 'epoch': 0.27}


 14%|█▎        | 15900/117610 [1:19:45<7:11:43,  3.93it/s]

{'loss': 6.155, 'learning_rate': 4.7920155412368896e-05, 'epoch': 0.27}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.1699, 'learning_rate': 4.789326871969624e-05, 'epoch': 0.27}


 14%|█▎        | 16100/117610 [1:20:53<7:12:54,  3.91it/s]  

{'loss': 6.2013, 'learning_rate': 4.786621699761936e-05, 'epoch': 0.27}


 14%|█▍        | 16200/117610 [1:21:20<7:13:44,  3.90it/s]

{'loss': 6.1447, 'learning_rate': 4.7839000441144456e-05, 'epoch': 0.28}


 14%|█▍        | 16300/117610 [1:21:46<7:41:34,  3.66it/s]

{'loss': 6.1445, 'learning_rate': 4.781161924646598e-05, 'epoch': 0.28}


 14%|█▍        | 16400/117610 [1:22:12<7:14:31,  3.88it/s]

{'loss': 6.1678, 'learning_rate': 4.778407361096519e-05, 'epoch': 0.28}


 14%|█▍        | 16500/117610 [1:22:53<130:29:09,  4.65s/it]

{'loss': 6.1741, 'learning_rate': 4.775636373320874e-05, 'epoch': 0.28}


 14%|█▍        | 16600/117610 [1:23:20<7:27:53,  3.76it/s]  

{'loss': 6.1461, 'learning_rate': 4.772848981294725e-05, 'epoch': 0.28}


 14%|█▍        | 16700/117610 [1:23:47<7:11:56,  3.89it/s]

{'loss': 6.1672, 'learning_rate': 4.770045205111386e-05, 'epoch': 0.28}


 14%|█▍        | 16800/117610 [1:24:13<7:28:11,  3.75it/s]

{'loss': 6.1294, 'learning_rate': 4.7672250649822804e-05, 'epoch': 0.29}


 14%|█▍        | 16900/117610 [1:24:40<7:20:34,  3.81it/s]

{'loss': 6.168, 'learning_rate': 4.7643885812367906e-05, 'epoch': 0.29}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.1821, 'learning_rate': 4.761535774322116e-05, 'epoch': 0.29}


 15%|█▍        | 17100/117610 [1:25:48<7:05:26,  3.94it/s]  

{'loss': 6.1603, 'learning_rate': 4.758666664803123e-05, 'epoch': 0.29}


 15%|█▍        | 17200/117610 [1:26:14<7:16:36,  3.83it/s]

{'loss': 6.1296, 'learning_rate': 4.7557812733622e-05, 'epoch': 0.29}


 15%|█▍        | 17300/117610 [1:26:41<7:25:06,  3.76it/s]

{'loss': 6.1275, 'learning_rate': 4.752879620799102e-05, 'epoch': 0.29}


 15%|█▍        | 17400/117610 [1:27:07<7:17:14,  3.82it/s]

{'loss': 6.1124, 'learning_rate': 4.7499617280308086e-05, 'epoch': 0.3}


 15%|█▍        | 17500/117610 [1:27:48<128:17:44,  4.61s/it]

{'loss': 6.0876, 'learning_rate': 4.7470276160913654e-05, 'epoch': 0.3}


 15%|█▍        | 17600/117610 [1:28:14<7:23:44,  3.76it/s]  

{'loss': 6.1071, 'learning_rate': 4.74407730613174e-05, 'epoch': 0.3}


 15%|█▌        | 17700/117610 [1:28:41<7:19:37,  3.79it/s]

{'loss': 6.0776, 'learning_rate': 4.741110819419664e-05, 'epoch': 0.3}


 15%|█▌        | 17800/117610 [1:29:07<7:12:57,  3.84it/s]

{'loss': 6.1042, 'learning_rate': 4.7381281773394804e-05, 'epoch': 0.3}


 15%|█▌        | 17900/117610 [1:29:34<7:33:02,  3.67it/s]

{'loss': 6.0299, 'learning_rate': 4.735129401391992e-05, 'epoch': 0.3}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.1051, 'learning_rate': 4.7321145131943054e-05, 'epoch': 0.31}


 15%|█▌        | 18100/117610 [1:30:43<7:19:50,  3.77it/s]  

{'loss': 6.1057, 'learning_rate': 4.729083534479672e-05, 'epoch': 0.31}


 15%|█▌        | 18200/117610 [1:31:09<7:14:33,  3.81it/s]

{'loss': 6.0906, 'learning_rate': 4.726036487097337e-05, 'epoch': 0.31}


 16%|█▌        | 18300/117610 [1:31:36<7:18:08,  3.78it/s]

{'loss': 6.0578, 'learning_rate': 4.7229733930123765e-05, 'epoch': 0.31}


 16%|█▌        | 18400/117610 [1:32:03<7:24:21,  3.72it/s]

{'loss': 6.0815, 'learning_rate': 4.719894274305544e-05, 'epoch': 0.31}


 16%|█▌        | 18500/117610 [1:32:43<123:10:25,  4.47s/it]

{'loss': 6.0659, 'learning_rate': 4.716799153173106e-05, 'epoch': 0.31}


 16%|█▌        | 18600/117610 [1:33:10<7:21:04,  3.74it/s]  

{'loss': 6.083, 'learning_rate': 4.713688051926687e-05, 'epoch': 0.32}


 16%|█▌        | 18700/117610 [1:33:36<6:56:03,  3.96it/s]

{'loss': 6.0721, 'learning_rate': 4.710560992993105e-05, 'epoch': 0.32}


 16%|█▌        | 18800/117610 [1:34:02<7:10:39,  3.82it/s]

{'loss': 6.0477, 'learning_rate': 4.707417998914213e-05, 'epoch': 0.32}


 16%|█▌        | 18900/117610 [1:34:29<7:25:00,  3.70it/s]

{'loss': 6.0199, 'learning_rate': 4.7042590923467325e-05, 'epoch': 0.32}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.0637, 'learning_rate': 4.7010842960620935e-05, 'epoch': 0.32}


 16%|█▌        | 19100/117610 [1:35:37<7:20:05,  3.73it/s]  

{'loss': 6.0733, 'learning_rate': 4.697893632946271e-05, 'epoch': 0.32}


 16%|█▋        | 19200/117610 [1:36:04<7:10:04,  3.81it/s]

{'loss': 6.0198, 'learning_rate': 4.694687125999615e-05, 'epoch': 0.33}


 16%|█▋        | 19300/117610 [1:36:30<7:00:45,  3.89it/s]

{'loss': 6.0528, 'learning_rate': 4.691464798336691e-05, 'epoch': 0.33}


 16%|█▋        | 19400/117610 [1:36:57<7:01:44,  3.88it/s]

{'loss': 6.0272, 'learning_rate': 4.688226673186109e-05, 'epoch': 0.33}


 17%|█▋        | 19500/117610 [1:37:37<122:18:29,  4.49s/it]

{'loss': 6.0351, 'learning_rate': 4.684972773890357e-05, 'epoch': 0.33}


 17%|█▋        | 19601/117610 [1:38:04<6:38:29,  4.10it/s]  

{'loss': 6.0251, 'learning_rate': 4.681703123905633e-05, 'epoch': 0.33}


 17%|█▋        | 19700/117610 [1:38:30<7:20:50,  3.70it/s]

{'loss': 6.0504, 'learning_rate': 4.6784177468016795e-05, 'epoch': 0.34}


 17%|█▋        | 19800/117610 [1:38:56<7:19:54,  3.71it/s]

{'loss': 6.0758, 'learning_rate': 4.6751166662616054e-05, 'epoch': 0.34}


 17%|█▋        | 19900/117610 [1:39:23<7:33:39,  3.59it/s]

{'loss': 6.0157, 'learning_rate': 4.671799906081723e-05, 'epoch': 0.34}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 6.0303, 'learning_rate': 4.668467490171372e-05, 'epoch': 0.34}


 17%|█▋        | 20100/117610 [1:40:33<7:12:41,  3.76it/s]  

{'loss': 6.0241, 'learning_rate': 4.6651194425527515e-05, 'epoch': 0.34}


 17%|█▋        | 20200/117610 [1:40:59<7:30:23,  3.60it/s]

{'loss': 6.0132, 'learning_rate': 4.6617557873607394e-05, 'epoch': 0.34}


 17%|█▋        | 20300/117610 [1:41:26<7:09:49,  3.77it/s]

{'loss': 5.9743, 'learning_rate': 4.658376548842727e-05, 'epoch': 0.35}


 17%|█▋        | 20400/117610 [1:41:52<7:11:55,  3.75it/s]

{'loss': 6.0318, 'learning_rate': 4.6549817513584386e-05, 'epoch': 0.35}


 17%|█▋        | 20500/117610 [1:42:33<124:00:41,  4.60s/it]

{'loss': 6.0119, 'learning_rate': 4.6515714193797585e-05, 'epoch': 0.35}


 18%|█▊        | 20600/117610 [1:43:00<7:10:48,  3.75it/s]  

{'loss': 5.9967, 'learning_rate': 4.648145577490551e-05, 'epoch': 0.35}


 18%|█▊        | 20700/117610 [1:43:27<7:07:26,  3.78it/s]

{'loss': 5.974, 'learning_rate': 4.644704250386489e-05, 'epoch': 0.35}


 18%|█▊        | 20800/117610 [1:43:53<7:14:20,  3.71it/s]

{'loss': 5.9732, 'learning_rate': 4.641247462874873e-05, 'epoch': 0.35}


 18%|█▊        | 20900/117610 [1:44:20<7:03:57,  3.80it/s]

{'loss': 5.9693, 'learning_rate': 4.637775239874447e-05, 'epoch': 0.36}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.9753, 'learning_rate': 4.63428760641523e-05, 'epoch': 0.36}


 18%|█▊        | 21100/117610 [1:45:29<7:13:06,  3.71it/s]  

{'loss': 5.9222, 'learning_rate': 4.630784587638326e-05, 'epoch': 0.36}


 18%|█▊        | 21200/117610 [1:45:55<7:12:35,  3.71it/s]

{'loss': 5.9842, 'learning_rate': 4.627266208795748e-05, 'epoch': 0.36}


 18%|█▊        | 21300/117610 [1:46:22<7:03:55,  3.79it/s]

{'loss': 5.9465, 'learning_rate': 4.623732495250232e-05, 'epoch': 0.36}


 18%|█▊        | 21400/117610 [1:46:48<6:54:00,  3.87it/s]

{'loss': 5.9135, 'learning_rate': 4.620183472475059e-05, 'epoch': 0.36}


 18%|█▊        | 21500/117610 [1:47:29<120:05:00,  4.50s/it]

{'loss': 5.9365, 'learning_rate': 4.616619166053868e-05, 'epoch': 0.37}


 18%|█▊        | 21601/117610 [1:47:55<6:36:00,  4.04it/s]  

{'loss': 5.9856, 'learning_rate': 4.613039601680471e-05, 'epoch': 0.37}


 18%|█▊        | 21700/117610 [1:48:22<7:01:24,  3.79it/s]

{'loss': 5.9072, 'learning_rate': 4.6094448051586714e-05, 'epoch': 0.37}


 19%|█▊        | 21800/117610 [1:48:48<6:53:34,  3.86it/s]

{'loss': 5.9492, 'learning_rate': 4.605834802402073e-05, 'epoch': 0.37}


 19%|█▊        | 21900/117610 [1:49:15<6:45:19,  3.94it/s]

{'loss': 5.9135, 'learning_rate': 4.602209619433899e-05, 'epoch': 0.37}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.9654, 'learning_rate': 4.598569282386798e-05, 'epoch': 0.37}


 19%|█▉        | 22101/117610 [1:50:24<6:31:04,  4.07it/s]  

{'loss': 5.9567, 'learning_rate': 4.5949138175026606e-05, 'epoch': 0.38}


 19%|█▉        | 22200/117610 [1:50:51<7:01:58,  3.77it/s]

{'loss': 5.9323, 'learning_rate': 4.59124325113243e-05, 'epoch': 0.38}


 19%|█▉        | 22300/117610 [1:51:17<6:58:48,  3.79it/s]

{'loss': 5.9351, 'learning_rate': 4.5875576097359084e-05, 'epoch': 0.38}


 19%|█▉        | 22400/117610 [1:51:43<6:59:36,  3.78it/s]

{'loss': 5.9106, 'learning_rate': 4.5838569198815696e-05, 'epoch': 0.38}


 19%|█▉        | 22500/117610 [1:52:24<120:50:14,  4.57s/it]

{'loss': 5.9685, 'learning_rate': 4.5801412082463656e-05, 'epoch': 0.38}


 19%|█▉        | 22600/117610 [1:52:51<6:51:53,  3.84it/s]  

{'loss': 5.9686, 'learning_rate': 4.576410501615537e-05, 'epoch': 0.38}


 19%|█▉        | 22700/117610 [1:53:17<6:54:28,  3.82it/s]

{'loss': 5.8988, 'learning_rate': 4.572664826882415e-05, 'epoch': 0.39}


 19%|█▉        | 22800/117610 [1:53:43<6:52:54,  3.83it/s]

{'loss': 5.8923, 'learning_rate': 4.568904211048232e-05, 'epoch': 0.39}


 19%|█▉        | 22900/117610 [1:54:10<6:47:59,  3.87it/s]

{'loss': 5.8916, 'learning_rate': 4.5651286812219266e-05, 'epoch': 0.39}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.8516, 'learning_rate': 4.561338264619944e-05, 'epoch': 0.39}


 20%|█▉        | 23100/117610 [1:55:19<6:41:34,  3.92it/s]  

{'loss': 5.8908, 'learning_rate': 4.557532988566047e-05, 'epoch': 0.39}


 20%|█▉        | 23200/117610 [1:55:46<6:59:48,  3.75it/s]

{'loss': 5.8766, 'learning_rate': 4.553712880491112e-05, 'epoch': 0.39}


 20%|█▉        | 23300/117610 [1:56:12<6:46:10,  3.87it/s]

{'loss': 5.8802, 'learning_rate': 4.549877967932934e-05, 'epoch': 0.4}


 20%|█▉        | 23400/117610 [1:56:38<7:07:47,  3.67it/s]

{'loss': 5.9132, 'learning_rate': 4.546028278536031e-05, 'epoch': 0.4}


 20%|█▉        | 23500/117610 [1:57:19<120:35:10,  4.61s/it]

{'loss': 5.8567, 'learning_rate': 4.542163840051437e-05, 'epoch': 0.4}


 20%|██        | 23600/117610 [1:57:46<6:43:27,  3.88it/s]  

{'loss': 5.8486, 'learning_rate': 4.538284680336513e-05, 'epoch': 0.4}


 20%|██        | 23700/117610 [1:58:12<6:50:09,  3.82it/s]

{'loss': 5.885, 'learning_rate': 4.534390827354734e-05, 'epoch': 0.4}


 20%|██        | 23800/117610 [1:58:38<7:01:09,  3.71it/s]

{'loss': 5.8905, 'learning_rate': 4.5304823091754996e-05, 'epoch': 0.4}


 20%|██        | 23900/117610 [1:59:05<7:00:04,  3.72it/s]

{'loss': 5.8733, 'learning_rate': 4.5265591539739204e-05, 'epoch': 0.41}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.8453, 'learning_rate': 4.5226213900306224e-05, 'epoch': 0.41}


 20%|██        | 24100/117610 [2:00:14<6:38:37,  3.91it/s]  

{'loss': 5.8669, 'learning_rate': 4.518669045731542e-05, 'epoch': 0.41}


 21%|██        | 24200/117610 [2:00:40<6:50:02,  3.80it/s]

{'loss': 5.8325, 'learning_rate': 4.514702149567717e-05, 'epoch': 0.41}


 21%|██        | 24300/117610 [2:01:07<6:52:05,  3.77it/s]

{'loss': 5.8502, 'learning_rate': 4.510720730135088e-05, 'epoch': 0.41}


 21%|██        | 24400/117610 [2:01:33<6:26:00,  4.02it/s]

{'loss': 5.8574, 'learning_rate': 4.506724816134285e-05, 'epoch': 0.41}


 21%|██        | 24500/117610 [2:02:14<120:55:58,  4.68s/it]

{'loss': 5.7774, 'learning_rate': 4.502714436370427e-05, 'epoch': 0.42}


 21%|██        | 24600/117610 [2:02:41<6:55:20,  3.73it/s]  

{'loss': 5.8526, 'learning_rate': 4.4986896197529097e-05, 'epoch': 0.42}


 21%|██        | 24700/117610 [2:03:08<6:53:51,  3.74it/s]

{'loss': 5.7765, 'learning_rate': 4.4946503952952e-05, 'epoch': 0.42}


 21%|██        | 24800/117610 [2:03:34<6:54:26,  3.73it/s]

{'loss': 5.8315, 'learning_rate': 4.490596792114627e-05, 'epoch': 0.42}


 21%|██        | 24900/117610 [2:04:00<6:34:26,  3.92it/s]

{'loss': 5.8344, 'learning_rate': 4.486528839432169e-05, 'epoch': 0.42}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.8039, 'learning_rate': 4.482446566572243e-05, 'epoch': 0.43}


 21%|██▏       | 25100/117610 [2:05:09<6:54:52,  3.72it/s]  

{'loss': 5.8342, 'learning_rate': 4.478350002962499e-05, 'epoch': 0.43}


 21%|██▏       | 25200/117610 [2:05:35<6:39:23,  3.86it/s]

{'loss': 5.8366, 'learning_rate': 4.4742391781336024e-05, 'epoch': 0.43}


 22%|██▏       | 25300/117610 [2:06:02<6:50:45,  3.75it/s]

{'loss': 5.7959, 'learning_rate': 4.4701141217190215e-05, 'epoch': 0.43}


 22%|██▏       | 25400/117610 [2:06:28<7:02:06,  3.64it/s]

{'loss': 5.8352, 'learning_rate': 4.465974863454815e-05, 'epoch': 0.43}


 22%|██▏       | 25500/117610 [2:07:09<113:15:08,  4.43s/it]

{'loss': 5.8077, 'learning_rate': 4.4618214331794214e-05, 'epoch': 0.43}


 22%|██▏       | 25600/117610 [2:07:35<6:39:28,  3.84it/s]  

{'loss': 5.7723, 'learning_rate': 4.457653860833434e-05, 'epoch': 0.44}


 22%|██▏       | 25700/117610 [2:08:01<6:47:57,  3.75it/s]

{'loss': 5.8356, 'learning_rate': 4.453472176459397e-05, 'epoch': 0.44}


 22%|██▏       | 25800/117610 [2:08:28<6:47:40,  3.75it/s]

{'loss': 5.7952, 'learning_rate': 4.44927641020158e-05, 'epoch': 0.44}


 22%|██▏       | 25900/117610 [2:08:54<6:31:39,  3.90it/s]

{'loss': 5.7914, 'learning_rate': 4.4450665923057646e-05, 'epoch': 0.44}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.7639, 'learning_rate': 4.4408427531190245e-05, 'epoch': 0.44}


 22%|██▏       | 26100/117610 [2:10:02<6:41:46,  3.80it/s]  

{'loss': 5.7743, 'learning_rate': 4.43660492308951e-05, 'epoch': 0.44}


 22%|██▏       | 26200/117610 [2:10:29<6:50:34,  3.71it/s]

{'loss': 5.7682, 'learning_rate': 4.432353132766226e-05, 'epoch': 0.45}


 22%|██▏       | 26300/117610 [2:10:55<6:29:01,  3.91it/s]

{'loss': 5.7267, 'learning_rate': 4.428087412798811e-05, 'epoch': 0.45}


 22%|██▏       | 26400/117610 [2:11:21<6:36:13,  3.84it/s]

{'loss': 5.7385, 'learning_rate': 4.423807793937318e-05, 'epoch': 0.45}


 23%|██▎       | 26500/117610 [2:11:59<93:09:45,  3.68s/it]

{'loss': 5.7034, 'learning_rate': 4.4195143070319914e-05, 'epoch': 0.45}


 23%|██▎       | 26600/117610 [2:12:25<6:50:52,  3.69it/s] 

{'loss': 5.7973, 'learning_rate': 4.415206983033047e-05, 'epoch': 0.45}


 23%|██▎       | 26700/117610 [2:12:52<6:24:27,  3.94it/s]

{'loss': 5.74, 'learning_rate': 4.410885852990446e-05, 'epoch': 0.45}


 23%|██▎       | 26800/117610 [2:13:18<6:49:10,  3.70it/s]

{'loss': 5.7812, 'learning_rate': 4.406550948053674e-05, 'epoch': 0.46}


 23%|██▎       | 26900/117610 [2:13:45<6:53:09,  3.66it/s]

{'loss': 5.7525, 'learning_rate': 4.4022022994715107e-05, 'epoch': 0.46}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.7311, 'learning_rate': 4.397839938591815e-05, 'epoch': 0.46}


 23%|██▎       | 27100/117610 [2:14:53<6:39:00,  3.78it/s]  

{'loss': 5.7096, 'learning_rate': 4.3934638968612873e-05, 'epoch': 0.46}


 23%|██▎       | 27200/117610 [2:15:20<6:39:03,  3.78it/s]

{'loss': 5.7678, 'learning_rate': 4.389074205825253e-05, 'epoch': 0.46}


 23%|██▎       | 27300/117610 [2:15:46<6:37:16,  3.79it/s]

{'loss': 5.8028, 'learning_rate': 4.384670897127426e-05, 'epoch': 0.46}


 23%|██▎       | 27400/117610 [2:16:12<6:34:13,  3.81it/s]

{'loss': 5.7231, 'learning_rate': 4.3802540025096884e-05, 'epoch': 0.47}


 23%|██▎       | 27500/117610 [2:16:52<108:23:23,  4.33s/it]

{'loss': 5.7433, 'learning_rate': 4.3758235538118574e-05, 'epoch': 0.47}


 23%|██▎       | 27600/117610 [2:17:19<6:19:24,  3.95it/s]  

{'loss': 5.6703, 'learning_rate': 4.371379582971456e-05, 'epoch': 0.47}


 24%|██▎       | 27700/117610 [2:17:45<6:32:06,  3.82it/s]

{'loss': 5.7122, 'learning_rate': 4.3669221220234844e-05, 'epoch': 0.47}


 24%|██▎       | 27800/117610 [2:18:11<6:25:48,  3.88it/s]

{'loss': 5.7555, 'learning_rate': 4.3624512031001875e-05, 'epoch': 0.47}


 24%|██▎       | 27900/117610 [2:18:38<6:33:23,  3.80it/s]

{'loss': 5.734, 'learning_rate': 4.3579668584308255e-05, 'epoch': 0.47}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.6992, 'learning_rate': 4.353469120341437e-05, 'epoch': 0.48}


 24%|██▍       | 28100/117610 [2:19:46<6:34:33,  3.78it/s]  

{'loss': 5.7236, 'learning_rate': 4.348958021254611e-05, 'epoch': 0.48}


 24%|██▍       | 28200/117610 [2:20:13<6:37:08,  3.75it/s]

{'loss': 5.7513, 'learning_rate': 4.344433593689252e-05, 'epoch': 0.48}


 24%|██▍       | 28300/117610 [2:20:39<6:40:53,  3.71it/s]

{'loss': 5.7511, 'learning_rate': 4.339895870260344e-05, 'epoch': 0.48}


 24%|██▍       | 28400/117610 [2:21:06<6:40:45,  3.71it/s]

{'loss': 5.6908, 'learning_rate': 4.3353448836787144e-05, 'epoch': 0.48}


 24%|██▍       | 28500/117610 [2:21:46<112:07:15,  4.53s/it]

{'loss': 5.6903, 'learning_rate': 4.3307806667508016e-05, 'epoch': 0.48}


 24%|██▍       | 28600/117610 [2:22:13<8:39:21,  2.86it/s]  

{'loss': 5.6963, 'learning_rate': 4.3262032523784176e-05, 'epoch': 0.49}


 24%|██▍       | 28700/117610 [2:22:40<6:39:37,  3.71it/s]

{'loss': 5.6845, 'learning_rate': 4.3216126735585074e-05, 'epoch': 0.49}


 24%|██▍       | 28800/117610 [2:23:06<6:30:18,  3.79it/s]

{'loss': 5.6645, 'learning_rate': 4.317008963382917e-05, 'epoch': 0.49}


 25%|██▍       | 28900/117610 [2:23:33<6:33:32,  3.76it/s]

{'loss': 5.6669, 'learning_rate': 4.31239215503815e-05, 'epoch': 0.49}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.6463, 'learning_rate': 4.30776228180513e-05, 'epoch': 0.49}


 25%|██▍       | 29100/117610 [2:24:41<6:34:45,  3.74it/s]  

{'loss': 5.6811, 'learning_rate': 4.3031193770589596e-05, 'epoch': 0.49}


 25%|██▍       | 29200/117610 [2:25:08<6:33:02,  3.75it/s]

{'loss': 5.6869, 'learning_rate': 4.298463474268683e-05, 'epoch': 0.5}


 25%|██▍       | 29300/117610 [2:25:34<6:37:35,  3.70it/s]

{'loss': 5.7008, 'learning_rate': 4.2937946069970424e-05, 'epoch': 0.5}


 25%|██▍       | 29400/117610 [2:26:00<6:26:41,  3.80it/s]

{'loss': 5.6381, 'learning_rate': 4.289112808900235e-05, 'epoch': 0.5}


 25%|██▌       | 29500/117610 [2:26:41<111:53:02,  4.57s/it]

{'loss': 5.6979, 'learning_rate': 4.284418113727674e-05, 'epoch': 0.5}


 25%|██▌       | 29600/117610 [2:27:07<6:29:50,  3.76it/s]  

{'loss': 5.6715, 'learning_rate': 4.2797105553217395e-05, 'epoch': 0.5}


 25%|██▌       | 29700/117610 [2:27:34<6:16:07,  3.90it/s]

{'loss': 5.6687, 'learning_rate': 4.2749901676175416e-05, 'epoch': 0.51}


 25%|██▌       | 29800/117610 [2:28:00<6:34:28,  3.71it/s]

{'loss': 5.6578, 'learning_rate': 4.270256984642669e-05, 'epoch': 0.51}


 25%|██▌       | 29900/117610 [2:28:26<6:24:12,  3.80it/s]

{'loss': 5.6843, 'learning_rate': 4.265511040516951e-05, 'epoch': 0.51}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.6599, 'learning_rate': 4.260752369452202e-05, 'epoch': 0.51}


 26%|██▌       | 30100/117610 [2:29:32<6:17:00,  3.87it/s] 

{'loss': 5.6561, 'learning_rate': 4.2559810057519866e-05, 'epoch': 0.51}


 26%|██▌       | 30200/117610 [2:29:59<6:21:04,  3.82it/s]

{'loss': 5.5996, 'learning_rate': 4.2511969838113606e-05, 'epoch': 0.51}


 26%|██▌       | 30300/117610 [2:30:25<6:27:22,  3.76it/s]

{'loss': 5.6772, 'learning_rate': 4.246400338116633e-05, 'epoch': 0.52}


 26%|██▌       | 30400/117610 [2:30:52<6:27:02,  3.76it/s]

{'loss': 5.6692, 'learning_rate': 4.2415911032451084e-05, 'epoch': 0.52}


 26%|██▌       | 30500/117610 [2:31:32<106:24:58,  4.40s/it]

{'loss': 5.6509, 'learning_rate': 4.236769313864847e-05, 'epoch': 0.52}


 26%|██▌       | 30600/117610 [2:31:58<6:33:39,  3.68it/s]  

{'loss': 5.5657, 'learning_rate': 4.2319350047344056e-05, 'epoch': 0.52}


 26%|██▌       | 30700/117610 [2:32:25<6:10:10,  3.91it/s]

{'loss': 5.6558, 'learning_rate': 4.227088210702595e-05, 'epoch': 0.52}


 26%|██▌       | 30800/117610 [2:32:51<6:07:34,  3.94it/s]

{'loss': 5.5895, 'learning_rate': 4.2222289667082235e-05, 'epoch': 0.52}


 26%|██▋       | 30900/117610 [2:33:17<6:09:48,  3.91it/s]

{'loss': 5.6222, 'learning_rate': 4.217357307779847e-05, 'epoch': 0.53}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.6449, 'learning_rate': 4.212473269035515e-05, 'epoch': 0.53}


 26%|██▋       | 31100/117610 [2:34:26<6:38:31,  3.62it/s]  

{'loss': 5.6508, 'learning_rate': 4.2075768856825204e-05, 'epoch': 0.53}


 27%|██▋       | 31200/117610 [2:34:52<6:23:57,  3.75it/s]

{'loss': 5.5529, 'learning_rate': 4.2026681930171443e-05, 'epoch': 0.53}


 27%|██▋       | 31300/117610 [2:35:18<6:05:51,  3.93it/s]

{'loss': 5.5623, 'learning_rate': 4.1977472264244e-05, 'epoch': 0.53}


 27%|██▋       | 31400/117610 [2:35:45<6:13:09,  3.85it/s]

{'loss': 5.5636, 'learning_rate': 4.1928140213777786e-05, 'epoch': 0.53}


 27%|██▋       | 31501/117610 [2:36:25<78:39:42,  3.29s/it] 

{'loss': 5.5818, 'learning_rate': 4.1878686134389954e-05, 'epoch': 0.54}


 27%|██▋       | 31600/117610 [2:36:52<6:00:25,  3.98it/s] 

{'loss': 5.6386, 'learning_rate': 4.182911038257732e-05, 'epoch': 0.54}


 27%|██▋       | 31700/117610 [2:37:18<6:36:00,  3.62it/s]

{'loss': 5.6205, 'learning_rate': 4.177941331571377e-05, 'epoch': 0.54}


 27%|██▋       | 31800/117610 [2:37:44<6:21:51,  3.75it/s]

{'loss': 5.6086, 'learning_rate': 4.172959529204774e-05, 'epoch': 0.54}


 27%|██▋       | 31900/117610 [2:38:11<6:17:57,  3.78it/s]

{'loss': 5.6202, 'learning_rate': 4.167965667069957e-05, 'epoch': 0.54}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.5945, 'learning_rate': 4.162959781165897e-05, 'epoch': 0.54}


 27%|██▋       | 32100/117610 [2:39:19<6:15:10,  3.80it/s]  

{'loss': 5.5704, 'learning_rate': 4.1579419075782394e-05, 'epoch': 0.55}


 27%|██▋       | 32200/117610 [2:39:45<6:08:16,  3.87it/s]

{'loss': 5.576, 'learning_rate': 4.152912082479042e-05, 'epoch': 0.55}


 27%|██▋       | 32300/117610 [2:40:11<6:16:28,  3.78it/s]

{'loss': 5.5716, 'learning_rate': 4.147870342126522e-05, 'epoch': 0.55}


 28%|██▊       | 32400/117610 [2:40:37<6:27:56,  3.66it/s]

{'loss': 5.612, 'learning_rate': 4.1428167228647844e-05, 'epoch': 0.55}


 28%|██▊       | 32500/117610 [2:41:18<110:44:42,  4.68s/it]

{'loss': 5.5447, 'learning_rate': 4.1377512611235666e-05, 'epoch': 0.55}


 28%|██▊       | 32600/117610 [2:41:45<6:20:27,  3.72it/s]  

{'loss': 5.5755, 'learning_rate': 4.1326739934179754e-05, 'epoch': 0.55}


 28%|██▊       | 32700/117610 [2:42:11<6:09:56,  3.83it/s]

{'loss': 5.5752, 'learning_rate': 4.127584956348221e-05, 'epoch': 0.56}


 28%|██▊       | 32800/117610 [2:42:37<6:16:28,  3.75it/s]

{'loss': 5.5698, 'learning_rate': 4.122484186599358e-05, 'epoch': 0.56}


 28%|██▊       | 32900/117610 [2:43:04<6:10:22,  3.81it/s]

{'loss': 5.5874, 'learning_rate': 4.117371720941012e-05, 'epoch': 0.56}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.5508, 'learning_rate': 4.112247596227125e-05, 'epoch': 0.56}


 28%|██▊       | 33100/117610 [2:44:12<6:04:42,  3.86it/s]  

{'loss': 5.6388, 'learning_rate': 4.107111849395683e-05, 'epoch': 0.56}


 28%|██▊       | 33200/117610 [2:44:39<6:17:44,  3.72it/s]

{'loss': 5.5401, 'learning_rate': 4.101964517468453e-05, 'epoch': 0.56}


 28%|██▊       | 33300/117610 [2:45:05<6:17:36,  3.72it/s]

{'loss': 5.5235, 'learning_rate': 4.096805637550712e-05, 'epoch': 0.57}


 28%|██▊       | 33400/117610 [2:45:31<6:12:46,  3.76it/s]

{'loss': 5.5233, 'learning_rate': 4.091635246830986e-05, 'epoch': 0.57}


 28%|██▊       | 33500/117610 [2:46:12<106:33:06,  4.56s/it]

{'loss': 5.5657, 'learning_rate': 4.0864533825807764e-05, 'epoch': 0.57}


 29%|██▊       | 33600/117610 [2:46:39<6:12:22,  3.76it/s]  

{'loss': 5.5432, 'learning_rate': 4.081260082154292e-05, 'epoch': 0.57}


 29%|██▊       | 33700/117610 [2:47:05<5:57:13,  3.91it/s]

{'loss': 5.5296, 'learning_rate': 4.076055382988184e-05, 'epoch': 0.57}


 29%|██▊       | 33800/117610 [2:47:31<6:15:47,  3.72it/s]

{'loss': 5.5505, 'learning_rate': 4.0708393226012696e-05, 'epoch': 0.57}


 29%|██▉       | 33900/117610 [2:47:58<5:57:28,  3.90it/s]

{'loss': 5.5308, 'learning_rate': 4.065611938594268e-05, 'epoch': 0.58}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.5222, 'learning_rate': 4.0603732686495247e-05, 'epoch': 0.58}


 29%|██▉       | 34100/117610 [2:49:06<5:55:54,  3.91it/s]  

{'loss': 5.5617, 'learning_rate': 4.055123350530741e-05, 'epoch': 0.58}


 29%|██▉       | 34200/117610 [2:49:32<6:04:42,  3.81it/s]

{'loss': 5.4989, 'learning_rate': 4.049862222082703e-05, 'epoch': 0.58}


 29%|██▉       | 34300/117610 [2:49:59<6:11:06,  3.74it/s]

{'loss': 5.5125, 'learning_rate': 4.04458992123101e-05, 'epoch': 0.58}


 29%|██▉       | 34400/117610 [2:50:25<6:12:46,  3.72it/s]

{'loss': 5.5269, 'learning_rate': 4.0393064859817954e-05, 'epoch': 0.58}


 29%|██▉       | 34500/117610 [2:50:58<48:58:30,  2.12s/it]

{'loss': 5.5024, 'learning_rate': 4.034011954421459e-05, 'epoch': 0.59}


 29%|██▉       | 34600/117610 [2:51:25<6:00:06,  3.84it/s] 

{'loss': 5.5296, 'learning_rate': 4.0287063647163903e-05, 'epoch': 0.59}


 30%|██▉       | 34700/117610 [2:51:51<5:50:23,  3.94it/s]

{'loss': 5.5076, 'learning_rate': 4.0233897551126906e-05, 'epoch': 0.59}


 30%|██▉       | 34800/117610 [2:52:18<6:05:21,  3.78it/s]

{'loss': 5.5035, 'learning_rate': 4.018062163935902e-05, 'epoch': 0.59}


 30%|██▉       | 34900/117610 [2:52:44<5:52:03,  3.92it/s]

{'loss': 5.5156, 'learning_rate': 4.012723629590727e-05, 'epoch': 0.59}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.473, 'learning_rate': 4.0073741905607553e-05, 'epoch': 0.6}


 30%|██▉       | 35100/117610 [2:53:51<5:55:26,  3.87it/s] 

{'loss': 5.5071, 'learning_rate': 4.0020138854081844e-05, 'epoch': 0.6}


 30%|██▉       | 35200/117610 [2:54:18<6:06:45,  3.74it/s]

{'loss': 5.4684, 'learning_rate': 3.9966427527735405e-05, 'epoch': 0.6}


 30%|███       | 35300/117610 [2:54:44<5:51:29,  3.90it/s]

{'loss': 5.4805, 'learning_rate': 3.9912608313754024e-05, 'epoch': 0.6}


 30%|███       | 35400/117610 [2:55:11<5:57:00,  3.84it/s]

{'loss': 5.536, 'learning_rate': 3.98586816001012e-05, 'epoch': 0.6}


 30%|███       | 35500/117610 [2:55:48<80:18:02,  3.52s/it]

{'loss': 5.5161, 'learning_rate': 3.980464777551538e-05, 'epoch': 0.6}


 30%|███       | 35600/117610 [2:56:15<5:39:04,  4.03it/s] 

{'loss': 5.4427, 'learning_rate': 3.97505072295071e-05, 'epoch': 0.61}


 30%|███       | 35700/117610 [2:56:41<6:09:11,  3.70it/s]

{'loss': 5.5044, 'learning_rate': 3.969626035235625e-05, 'epoch': 0.61}


 30%|███       | 35800/117610 [2:57:08<6:03:53,  3.75it/s]

{'loss': 5.5242, 'learning_rate': 3.964190753510921e-05, 'epoch': 0.61}


 31%|███       | 35900/117610 [2:57:34<6:03:04,  3.75it/s]

{'loss': 5.4776, 'learning_rate': 3.9587449169576044e-05, 'epoch': 0.61}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.4387, 'learning_rate': 3.9532885648327666e-05, 'epoch': 0.61}


 31%|███       | 36100/117610 [2:58:43<6:08:51,  3.68it/s]  

{'loss': 5.4588, 'learning_rate': 3.9478217364693035e-05, 'epoch': 0.61}


 31%|███       | 36200/117610 [2:59:09<6:04:52,  3.72it/s]

{'loss': 5.474, 'learning_rate': 3.942344471275629e-05, 'epoch': 0.62}


 31%|███       | 36300/117610 [2:59:36<5:43:28,  3.95it/s]

{'loss': 5.4838, 'learning_rate': 3.936856808735394e-05, 'epoch': 0.62}


 31%|███       | 36400/117610 [3:00:02<6:14:22,  3.62it/s]

{'loss': 5.4998, 'learning_rate': 3.931358788407199e-05, 'epoch': 0.62}


 31%|███       | 36500/117610 [3:00:43<105:13:03,  4.67s/it]

{'loss': 5.492, 'learning_rate': 3.925850449924311e-05, 'epoch': 0.62}


 31%|███       | 36600/117610 [3:01:10<5:55:08,  3.80it/s]  

{'loss': 5.4842, 'learning_rate': 3.920331832994375e-05, 'epoch': 0.62}


 31%|███       | 36700/117610 [3:01:36<6:01:03,  3.73it/s]

{'loss': 5.4293, 'learning_rate': 3.914802977399132e-05, 'epoch': 0.62}


 31%|███▏      | 36800/117610 [3:02:02<5:58:51,  3.75it/s]

{'loss': 5.4914, 'learning_rate': 3.9092639229941296e-05, 'epoch': 0.63}


 31%|███▏      | 36900/117610 [3:02:29<5:58:24,  3.75it/s]

{'loss': 5.4549, 'learning_rate': 3.9037147097084336e-05, 'epoch': 0.63}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.4167, 'learning_rate': 3.8981553775443414e-05, 'epoch': 0.63}


 32%|███▏      | 37100/117610 [3:03:40<5:50:33,  3.83it/s]  

{'loss': 5.4456, 'learning_rate': 3.8925859665770947e-05, 'epoch': 0.63}


 32%|███▏      | 37200/117610 [3:04:06<6:11:18,  3.61it/s]

{'loss': 5.4733, 'learning_rate': 3.887006516954591e-05, 'epoch': 0.63}


 32%|███▏      | 37301/117610 [3:04:33<5:39:57,  3.94it/s]

{'loss': 5.4669, 'learning_rate': 3.881417068897091e-05, 'epoch': 0.63}


 32%|███▏      | 37400/117610 [3:04:59<5:45:22,  3.87it/s]

{'loss': 5.4452, 'learning_rate': 3.87581766269693e-05, 'epoch': 0.64}


 32%|███▏      | 37500/117610 [3:05:40<102:26:51,  4.60s/it]

{'loss': 5.4369, 'learning_rate': 3.8702083387182294e-05, 'epoch': 0.64}


 32%|███▏      | 37600/117610 [3:06:07<5:57:42,  3.73it/s]  

{'loss': 5.4654, 'learning_rate': 3.864589137396606e-05, 'epoch': 0.64}


 32%|███▏      | 37701/117610 [3:06:33<5:30:46,  4.03it/s]

{'loss': 5.415, 'learning_rate': 3.858960099238873e-05, 'epoch': 0.64}


 32%|███▏      | 37800/117610 [3:06:59<5:54:04,  3.76it/s]

{'loss': 5.3896, 'learning_rate': 3.8533212648227604e-05, 'epoch': 0.64}


 32%|███▏      | 37900/117610 [3:07:25<5:59:32,  3.70it/s]

{'loss': 5.4862, 'learning_rate': 3.847672674796613e-05, 'epoch': 0.64}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.4067, 'learning_rate': 3.8420143698790986e-05, 'epoch': 0.65}


 32%|███▏      | 38100/117610 [3:08:34<5:50:45,  3.78it/s] 

{'loss': 5.4313, 'learning_rate': 3.83634639085892e-05, 'epoch': 0.65}


 32%|███▏      | 38200/117610 [3:09:00<5:49:17,  3.79it/s]

{'loss': 5.4407, 'learning_rate': 3.830668778594515e-05, 'epoch': 0.65}


 33%|███▎      | 38300/117610 [3:09:27<5:49:49,  3.78it/s]

{'loss': 5.4431, 'learning_rate': 3.824981574013764e-05, 'epoch': 0.65}


 33%|███▎      | 38400/117610 [3:09:53<5:47:43,  3.80it/s]

{'loss': 5.4057, 'learning_rate': 3.819284818113694e-05, 'epoch': 0.65}


 33%|███▎      | 38500/117610 [3:10:34<101:25:34,  4.62s/it]

{'loss': 5.4243, 'learning_rate': 3.813578551960187e-05, 'epoch': 0.65}


 33%|███▎      | 38600/117610 [3:11:00<5:41:32,  3.86it/s]  

{'loss': 5.3748, 'learning_rate': 3.8078628166876795e-05, 'epoch': 0.66}


 33%|███▎      | 38700/117610 [3:11:27<5:48:01,  3.78it/s]

{'loss': 5.4192, 'learning_rate': 3.802137653498865e-05, 'epoch': 0.66}


 33%|███▎      | 38800/117610 [3:11:53<5:30:54,  3.97it/s]

{'loss': 5.4395, 'learning_rate': 3.796403103664404e-05, 'epoch': 0.66}


 33%|███▎      | 38900/117610 [3:12:19<5:51:11,  3.74it/s]

{'loss': 5.3999, 'learning_rate': 3.790659208522618e-05, 'epoch': 0.66}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.4196, 'learning_rate': 3.7849060094791986e-05, 'epoch': 0.66}


 33%|███▎      | 39100/117610 [3:13:28<5:47:03,  3.77it/s] 

{'loss': 5.3015, 'learning_rate': 3.7791435480069036e-05, 'epoch': 0.66}


 33%|███▎      | 39200/117610 [3:13:55<5:42:16,  3.82it/s]

{'loss': 5.3957, 'learning_rate': 3.773371865645263e-05, 'epoch': 0.67}


 33%|███▎      | 39300/117610 [3:14:21<5:42:07,  3.81it/s]

{'loss': 5.3902, 'learning_rate': 3.7675910040002734e-05, 'epoch': 0.67}


 34%|███▎      | 39400/117610 [3:14:48<5:38:20,  3.85it/s]

{'loss': 5.4363, 'learning_rate': 3.761801004744103e-05, 'epoch': 0.67}


 34%|███▎      | 39500/117610 [3:15:29<98:21:57,  4.53s/it]

{'loss': 5.3724, 'learning_rate': 3.756001909614793e-05, 'epoch': 0.67}


 34%|███▎      | 39600/117610 [3:15:55<5:45:40,  3.76it/s] 

{'loss': 5.4136, 'learning_rate': 3.750193760415948e-05, 'epoch': 0.67}


 34%|███▍      | 39700/117610 [3:16:21<5:37:52,  3.84it/s]

{'loss': 5.3984, 'learning_rate': 3.744376599016443e-05, 'epoch': 0.68}


 34%|███▍      | 39800/117610 [3:16:48<5:36:47,  3.85it/s]

{'loss': 5.3606, 'learning_rate': 3.738550467350119e-05, 'epoch': 0.68}


 34%|███▍      | 39900/117610 [3:17:14<5:43:26,  3.77it/s]

{'loss': 5.4176, 'learning_rate': 3.7327154074154805e-05, 'epoch': 0.68}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.3671, 'learning_rate': 3.7268714612753916e-05, 'epoch': 0.68}


 34%|███▍      | 40100/117610 [3:18:14<5:47:26,  3.72it/s] 

{'loss': 5.3575, 'learning_rate': 3.7210186710567754e-05, 'epoch': 0.68}


 34%|███▍      | 40200/117610 [3:18:41<5:41:55,  3.77it/s]

{'loss': 5.3634, 'learning_rate': 3.715157078950307e-05, 'epoch': 0.68}


 34%|███▍      | 40300/117610 [3:19:07<5:39:56,  3.79it/s]

{'loss': 5.3259, 'learning_rate': 3.7092867272101116e-05, 'epoch': 0.69}


 34%|███▍      | 40400/117610 [3:19:34<5:31:29,  3.88it/s]

{'loss': 5.3709, 'learning_rate': 3.7034076581534616e-05, 'epoch': 0.69}


 34%|███▍      | 40500/117610 [3:20:14<97:03:37,  4.53s/it]

{'loss': 5.3445, 'learning_rate': 3.697519914160465e-05, 'epoch': 0.69}


 35%|███▍      | 40600/117610 [3:20:41<5:44:24,  3.73it/s] 

{'loss': 5.4288, 'learning_rate': 3.691623537673771e-05, 'epoch': 0.69}


 35%|███▍      | 40700/117610 [3:21:07<5:18:26,  4.03it/s]

{'loss': 5.3401, 'learning_rate': 3.68571857119825e-05, 'epoch': 0.69}


 35%|███▍      | 40800/117610 [3:21:33<5:37:25,  3.79it/s]

{'loss': 5.3942, 'learning_rate': 3.6798050573006987e-05, 'epoch': 0.69}


 35%|███▍      | 40900/117610 [3:22:00<5:36:52,  3.80it/s]

{'loss': 5.3928, 'learning_rate': 3.673883038609529e-05, 'epoch': 0.7}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.3763, 'learning_rate': 3.667952557814462e-05, 'epoch': 0.7}


 35%|███▍      | 41100/117610 [3:23:07<5:39:35,  3.75it/s] 

{'loss': 5.373, 'learning_rate': 3.6620136576662135e-05, 'epoch': 0.7}


 35%|███▌      | 41200/117610 [3:23:34<5:35:49,  3.79it/s]

{'loss': 5.3515, 'learning_rate': 3.656066380976198e-05, 'epoch': 0.7}


 35%|███▌      | 41300/117610 [3:24:00<5:44:54,  3.69it/s]

{'loss': 5.3322, 'learning_rate': 3.6501107706162106e-05, 'epoch': 0.7}


 35%|███▌      | 41400/117610 [3:24:27<5:38:55,  3.75it/s]

{'loss': 5.3878, 'learning_rate': 3.6441468695181196e-05, 'epoch': 0.7}


 35%|███▌      | 41500/117610 [3:25:04<78:22:42,  3.71s/it]

{'loss': 5.3554, 'learning_rate': 3.638174720673561e-05, 'epoch': 0.71}


 35%|███▌      | 41600/117610 [3:25:31<5:24:57,  3.90it/s] 

{'loss': 5.3468, 'learning_rate': 3.6321943671336237e-05, 'epoch': 0.71}


 35%|███▌      | 41700/117610 [3:25:57<5:37:38,  3.75it/s]

{'loss': 5.3447, 'learning_rate': 3.626205852008542e-05, 'epoch': 0.71}


 36%|███▌      | 41800/117610 [3:26:24<5:44:18,  3.67it/s]

{'loss': 5.3471, 'learning_rate': 3.620209218467385e-05, 'epoch': 0.71}


 36%|███▌      | 41900/117610 [3:26:50<5:12:59,  4.03it/s]

{'loss': 5.334, 'learning_rate': 3.614204509737744e-05, 'epoch': 0.71}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.3808, 'learning_rate': 3.6081917691054206e-05, 'epoch': 0.71}


 36%|███▌      | 42100/117610 [3:27:59<5:27:24,  3.84it/s] 

{'loss': 5.3443, 'learning_rate': 3.6021710399141164e-05, 'epoch': 0.72}


 36%|███▌      | 42200/117610 [3:28:25<5:33:54,  3.76it/s]

{'loss': 5.2425, 'learning_rate': 3.596142365565121e-05, 'epoch': 0.72}


 36%|███▌      | 42300/117610 [3:28:52<5:27:33,  3.83it/s]

{'loss': 5.3438, 'learning_rate': 3.5901057895169935e-05, 'epoch': 0.72}


 36%|███▌      | 42400/117610 [3:29:18<5:29:34,  3.80it/s]

{'loss': 5.2541, 'learning_rate': 3.584061355285258e-05, 'epoch': 0.72}


 36%|███▌      | 42500/117610 [3:29:59<93:48:37,  4.50s/it]

{'loss': 5.3434, 'learning_rate': 3.578009106442084e-05, 'epoch': 0.72}


 36%|███▌      | 42600/117610 [3:30:25<5:39:56,  3.68it/s] 

{'loss': 5.3454, 'learning_rate': 3.571949086615972e-05, 'epoch': 0.72}


 36%|███▋      | 42700/117610 [3:30:51<5:23:13,  3.86it/s]

{'loss': 5.3005, 'learning_rate': 3.565881339491445e-05, 'epoch': 0.73}


 36%|███▋      | 42800/117610 [3:31:18<5:35:43,  3.71it/s]

{'loss': 5.3768, 'learning_rate': 3.559805908808724e-05, 'epoch': 0.73}


 36%|███▋      | 42900/117610 [3:31:45<5:25:59,  3.82it/s]

{'loss': 5.321, 'learning_rate': 3.5537228383634225e-05, 'epoch': 0.73}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.3194, 'learning_rate': 3.547632172006223e-05, 'epoch': 0.73}


 37%|███▋      | 43100/117610 [3:32:53<5:33:42,  3.72it/s] 

{'loss': 5.2883, 'learning_rate': 3.541533953642567e-05, 'epoch': 0.73}


 37%|███▋      | 43200/117610 [3:33:19<5:26:46,  3.80it/s]

{'loss': 5.3168, 'learning_rate': 3.535428227232333e-05, 'epoch': 0.73}


 37%|███▋      | 43300/117610 [3:33:46<5:31:55,  3.73it/s]

{'loss': 5.296, 'learning_rate': 3.529315036789524e-05, 'epoch': 0.74}


 37%|███▋      | 43400/117610 [3:34:13<5:20:09,  3.86it/s]

{'loss': 5.334, 'learning_rate': 3.5231944263819495e-05, 'epoch': 0.74}


 37%|███▋      | 43500/117610 [3:34:48<61:37:15,  2.99s/it]

{'loss': 5.3393, 'learning_rate': 3.517066440130905e-05, 'epoch': 0.74}


 37%|███▋      | 43600/117610 [3:35:15<5:23:00,  3.82it/s] 

{'loss': 5.3137, 'learning_rate': 3.510931122210856e-05, 'epoch': 0.74}


 37%|███▋      | 43700/117610 [3:35:41<5:38:43,  3.64it/s]

{'loss': 5.317, 'learning_rate': 3.5047885168491204e-05, 'epoch': 0.74}


 37%|███▋      | 43800/117610 [3:36:08<5:25:38,  3.78it/s]

{'loss': 5.2892, 'learning_rate': 3.498638668325548e-05, 'epoch': 0.74}


 37%|███▋      | 43900/117610 [3:36:34<5:24:44,  3.78it/s]

{'loss': 5.3051, 'learning_rate': 3.4924816209722016e-05, 'epoch': 0.75}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.2857, 'learning_rate': 3.486317419173039e-05, 'epoch': 0.75}


 37%|███▋      | 44100/117610 [3:37:42<5:24:00,  3.78it/s] 

{'loss': 5.3432, 'learning_rate': 3.480146107363591e-05, 'epoch': 0.75}


 38%|███▊      | 44200/117610 [3:38:09<5:30:24,  3.70it/s]

{'loss': 5.2932, 'learning_rate': 3.4739677300306417e-05, 'epoch': 0.75}


 38%|███▊      | 44300/117610 [3:38:36<5:21:29,  3.80it/s]

{'loss': 5.3077, 'learning_rate': 3.4677823317119084e-05, 'epoch': 0.75}


 38%|███▊      | 44400/117610 [3:39:02<5:16:44,  3.85it/s]

{'loss': 5.2891, 'learning_rate': 3.461589956995721e-05, 'epoch': 0.76}


 38%|███▊      | 44500/117610 [3:39:43<89:16:38,  4.40s/it]

{'loss': 5.2572, 'learning_rate': 3.455390650520698e-05, 'epoch': 0.76}


 38%|███▊      | 44600/117610 [3:40:09<5:24:41,  3.75it/s] 

{'loss': 5.2377, 'learning_rate': 3.449184456975428e-05, 'epoch': 0.76}


 38%|███▊      | 44700/117610 [3:40:35<5:23:21,  3.76it/s]

{'loss': 5.2768, 'learning_rate': 3.4429714210981445e-05, 'epoch': 0.76}


 38%|███▊      | 44800/117610 [3:41:01<5:15:33,  3.85it/s]

{'loss': 5.2868, 'learning_rate': 3.4367515876764075e-05, 'epoch': 0.76}


 38%|███▊      | 44900/117610 [3:41:28<5:29:45,  3.67it/s]

{'loss': 5.2844, 'learning_rate': 3.430525001546775e-05, 'epoch': 0.76}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.254, 'learning_rate': 3.4242917075944856e-05, 'epoch': 0.77}


 38%|███▊      | 45100/117610 [3:42:37<5:24:48,  3.72it/s] 

{'loss': 5.267, 'learning_rate': 3.418051750753129e-05, 'epoch': 0.77}


 38%|███▊      | 45200/117610 [3:43:04<5:20:36,  3.76it/s]

{'loss': 5.2854, 'learning_rate': 3.411805176004329e-05, 'epoch': 0.77}


 39%|███▊      | 45300/117610 [3:43:30<5:11:30,  3.87it/s]

{'loss': 5.2902, 'learning_rate': 3.405552028377413e-05, 'epoch': 0.77}


 39%|███▊      | 45400/117610 [3:43:56<5:12:46,  3.85it/s]

{'loss': 5.265, 'learning_rate': 3.399292352949091e-05, 'epoch': 0.77}


 39%|███▊      | 45500/117610 [3:44:34<75:06:04,  3.75s/it]

{'loss': 5.2459, 'learning_rate': 3.3930261948431284e-05, 'epoch': 0.77}


 39%|███▉      | 45600/117610 [3:45:00<5:17:56,  3.77it/s] 

{'loss': 5.2248, 'learning_rate': 3.386753599230024e-05, 'epoch': 0.78}


 39%|███▉      | 45700/117610 [3:45:27<5:15:16,  3.80it/s]

{'loss': 5.2218, 'learning_rate': 3.380474611326681e-05, 'epoch': 0.78}


 39%|███▉      | 45800/117610 [3:45:53<5:13:36,  3.82it/s]

{'loss': 5.2202, 'learning_rate': 3.37418927639608e-05, 'epoch': 0.78}


 39%|███▉      | 45900/117610 [3:46:20<5:13:56,  3.81it/s]

{'loss': 5.2722, 'learning_rate': 3.3678976397469616e-05, 'epoch': 0.78}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.2467, 'learning_rate': 3.361599746733485e-05, 'epoch': 0.78}


 39%|███▉      | 46100/117610 [3:47:28<5:20:17,  3.72it/s] 

{'loss': 5.2418, 'learning_rate': 3.3552956427549165e-05, 'epoch': 0.78}


 39%|███▉      | 46200/117610 [3:47:54<5:14:35,  3.78it/s]

{'loss': 5.2534, 'learning_rate': 3.34898537325529e-05, 'epoch': 0.79}


 39%|███▉      | 46300/117610 [3:48:21<5:19:33,  3.72it/s]

{'loss': 5.2335, 'learning_rate': 3.3426689837230865e-05, 'epoch': 0.79}


 39%|███▉      | 46400/117610 [3:48:47<5:12:43,  3.80it/s]

{'loss': 5.2643, 'learning_rate': 3.336346519690905e-05, 'epoch': 0.79}


 40%|███▉      | 46500/117610 [3:49:27<89:44:48,  4.54s/it]

{'loss': 5.2387, 'learning_rate': 3.330018026735132e-05, 'epoch': 0.79}


 40%|███▉      | 46600/117610 [3:49:53<5:19:09,  3.71it/s] 

{'loss': 5.2304, 'learning_rate': 3.3236835504756145e-05, 'epoch': 0.79}


 40%|███▉      | 46700/117610 [3:50:20<5:18:13,  3.71it/s]

{'loss': 5.2446, 'learning_rate': 3.3173431365753314e-05, 'epoch': 0.79}


 40%|███▉      | 46800/117610 [3:50:46<5:12:52,  3.77it/s]

{'loss': 5.206, 'learning_rate': 3.310996830740064e-05, 'epoch': 0.8}


 40%|███▉      | 46900/117610 [3:51:13<5:07:04,  3.84it/s]

{'loss': 5.2337, 'learning_rate': 3.3046446787180666e-05, 'epoch': 0.8}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.223, 'learning_rate': 3.298286726299734e-05, 'epoch': 0.8}


 40%|████      | 47100/117610 [3:52:22<5:11:35,  3.77it/s] 

{'loss': 5.2416, 'learning_rate': 3.2919230193172786e-05, 'epoch': 0.8}


 40%|████      | 47200/117610 [3:52:48<5:04:47,  3.85it/s]

{'loss': 5.2189, 'learning_rate': 3.285553603644392e-05, 'epoch': 0.8}


 40%|████      | 47300/117610 [3:53:15<5:09:42,  3.78it/s]

{'loss': 5.2304, 'learning_rate': 3.279178525195919e-05, 'epoch': 0.8}


 40%|████      | 47400/117610 [3:53:41<5:25:12,  3.60it/s]

{'loss': 5.1883, 'learning_rate': 3.2727978299275255e-05, 'epoch': 0.81}


 40%|████      | 47500/117610 [3:54:16<55:12:20,  2.83s/it]

{'loss': 5.2174, 'learning_rate': 3.266411563835367e-05, 'epoch': 0.81}


 40%|████      | 47600/117610 [3:54:42<5:10:21,  3.76it/s] 

{'loss': 5.2205, 'learning_rate': 3.2600197729557546e-05, 'epoch': 0.81}


 41%|████      | 47700/117610 [3:55:09<5:20:18,  3.64it/s]

{'loss': 5.1738, 'learning_rate': 3.253622503364829e-05, 'epoch': 0.81}


 41%|████      | 47800/117610 [3:55:35<5:05:50,  3.80it/s]

{'loss': 5.1756, 'learning_rate': 3.247219801178224e-05, 'epoch': 0.81}


 41%|████      | 47900/117610 [3:56:01<5:05:41,  3.80it/s]

{'loss': 5.232, 'learning_rate': 3.240811712550733e-05, 'epoch': 0.81}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1995, 'learning_rate': 3.2343982836759825e-05, 'epoch': 0.82}


 41%|████      | 48100/117610 [3:57:10<5:06:13,  3.78it/s] 

{'loss': 5.2158, 'learning_rate': 3.2279795607860904e-05, 'epoch': 0.82}


 41%|████      | 48201/117610 [3:57:36<4:39:12,  4.14it/s]

{'loss': 5.1961, 'learning_rate': 3.2215555901513416e-05, 'epoch': 0.82}


 41%|████      | 48300/117610 [3:58:03<5:12:38,  3.69it/s]

{'loss': 5.1725, 'learning_rate': 3.215126418079846e-05, 'epoch': 0.82}


 41%|████      | 48400/117610 [3:58:29<5:07:40,  3.75it/s]

{'loss': 5.1969, 'learning_rate': 3.208692090917211e-05, 'epoch': 0.82}


 41%|████      | 48500/117610 [3:59:10<87:20:16,  4.55s/it]

{'loss': 5.2205, 'learning_rate': 3.202252655046205e-05, 'epoch': 0.82}


 41%|████▏     | 48600/117610 [3:59:37<5:04:42,  3.77it/s] 

{'loss': 5.1997, 'learning_rate': 3.1958081568864234e-05, 'epoch': 0.83}


 41%|████▏     | 48700/117610 [4:00:03<4:56:46,  3.87it/s]

{'loss': 5.2218, 'learning_rate': 3.189358642893953e-05, 'epoch': 0.83}


 41%|████▏     | 48800/117610 [4:00:29<5:04:06,  3.77it/s]

{'loss': 5.194, 'learning_rate': 3.1829041595610396e-05, 'epoch': 0.83}


 42%|████▏     | 48900/117610 [4:00:56<4:57:09,  3.85it/s]

{'loss': 5.2234, 'learning_rate': 3.1764447534157495e-05, 'epoch': 0.83}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.2453, 'learning_rate': 3.169980471021635e-05, 'epoch': 0.83}


 42%|████▏     | 49100/117610 [4:02:09<5:05:38,  3.74it/s] 

{'loss': 5.21, 'learning_rate': 3.163511358977404e-05, 'epoch': 0.83}


 42%|████▏     | 49200/117610 [4:02:35<4:58:04,  3.83it/s]

{'loss': 5.187, 'learning_rate': 3.157037463916574e-05, 'epoch': 0.84}


 42%|████▏     | 49300/117610 [4:03:01<5:01:24,  3.78it/s]

{'loss': 5.1929, 'learning_rate': 3.150558832507146e-05, 'epoch': 0.84}


 42%|████▏     | 49400/117610 [4:03:28<4:48:55,  3.93it/s]

{'loss': 5.1824, 'learning_rate': 3.1440755114512596e-05, 'epoch': 0.84}


 42%|████▏     | 49500/117610 [4:04:08<87:06:47,  4.60s/it]

{'loss': 5.2167, 'learning_rate': 3.1375875474848646e-05, 'epoch': 0.84}


 42%|████▏     | 49600/117610 [4:04:34<4:45:38,  3.97it/s] 

{'loss': 5.155, 'learning_rate': 3.1310949873773756e-05, 'epoch': 0.84}


 42%|████▏     | 49700/117610 [4:05:01<4:58:13,  3.80it/s]

{'loss': 5.1555, 'learning_rate': 3.124597877931345e-05, 'epoch': 0.85}


 42%|████▏     | 49800/117610 [4:05:27<4:56:36,  3.81it/s]

{'loss': 5.1978, 'learning_rate': 3.118096265982112e-05, 'epoch': 0.85}


 42%|████▏     | 49900/117610 [4:05:53<5:06:28,  3.68it/s]

{'loss': 5.1481, 'learning_rate': 3.1115901983974804e-05, 'epoch': 0.85}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1525, 'learning_rate': 3.105079722077367e-05, 'epoch': 0.85}


 43%|████▎     | 50100/117610 [4:07:01<5:08:05,  3.65it/s] 

{'loss': 5.1825, 'learning_rate': 3.098564883953474e-05, 'epoch': 0.85}


 43%|████▎     | 50200/117610 [4:07:27<4:57:57,  3.77it/s]

{'loss': 5.0936, 'learning_rate': 3.092045730988944e-05, 'epoch': 0.85}


 43%|████▎     | 50300/117610 [4:07:54<4:57:58,  3.76it/s]

{'loss': 5.1159, 'learning_rate': 3.0855223101780236e-05, 'epoch': 0.86}


 43%|████▎     | 50400/117610 [4:08:20<4:48:49,  3.88it/s]

{'loss': 5.1748, 'learning_rate': 3.078994668545726e-05, 'epoch': 0.86}


 43%|████▎     | 50500/117610 [4:09:01<86:12:04,  4.62s/it]

{'loss': 5.1503, 'learning_rate': 3.072462853147489e-05, 'epoch': 0.86}


 43%|████▎     | 50600/117610 [4:09:28<4:46:06,  3.90it/s] 

{'loss': 5.1521, 'learning_rate': 3.0659269110688404e-05, 'epoch': 0.86}


 43%|████▎     | 50700/117610 [4:09:54<5:01:17,  3.70it/s]

{'loss': 5.1377, 'learning_rate': 3.0593868894250546e-05, 'epoch': 0.86}


 43%|████▎     | 50800/117610 [4:10:20<4:43:41,  3.92it/s]

{'loss': 5.1881, 'learning_rate': 3.052842835360811e-05, 'epoch': 0.86}


 43%|████▎     | 50900/117610 [4:10:47<4:57:13,  3.74it/s]

{'loss': 5.1219, 'learning_rate': 3.0462947960498616e-05, 'epoch': 0.87}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.13, 'learning_rate': 3.0397428186946858e-05, 'epoch': 0.87}


 43%|████▎     | 51100/117610 [4:11:55<4:54:54,  3.76it/s] 

{'loss': 5.1273, 'learning_rate': 3.0331869505261484e-05, 'epoch': 0.87}


 44%|████▎     | 51200/117610 [4:12:22<4:50:26,  3.81it/s]

{'loss': 5.122, 'learning_rate': 3.0266272388031653e-05, 'epoch': 0.87}


 44%|████▎     | 51300/117610 [4:12:48<4:46:24,  3.86it/s]

{'loss': 5.1426, 'learning_rate': 3.0200637308123554e-05, 'epoch': 0.87}


 44%|████▎     | 51400/117610 [4:13:14<4:48:54,  3.82it/s]

{'loss': 5.1328, 'learning_rate': 3.0134964738677064e-05, 'epoch': 0.87}


 44%|████▍     | 51500/117610 [4:13:55<84:08:14,  4.58s/it]

{'loss': 5.1988, 'learning_rate': 3.00692551531023e-05, 'epoch': 0.88}


 44%|████▍     | 51600/117610 [4:14:22<4:50:54,  3.78it/s] 

{'loss': 5.1204, 'learning_rate': 3.000350902507621e-05, 'epoch': 0.88}


 44%|████▍     | 51700/117610 [4:14:48<4:47:17,  3.82it/s]

{'loss': 5.1726, 'learning_rate': 2.993772682853918e-05, 'epoch': 0.88}


 44%|████▍     | 51800/117610 [4:15:15<4:46:04,  3.83it/s]

{'loss': 5.1683, 'learning_rate': 2.9871909037691576e-05, 'epoch': 0.88}


 44%|████▍     | 51900/117610 [4:15:41<4:48:17,  3.80it/s]

{'loss': 5.1071, 'learning_rate': 2.9806056126990373e-05, 'epoch': 0.88}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1238, 'learning_rate': 2.9740168571145693e-05, 'epoch': 0.88}


 44%|████▍     | 52100/117610 [4:16:47<4:55:01,  3.70it/s] 

{'loss': 5.1636, 'learning_rate': 2.967424684511742e-05, 'epoch': 0.89}


 44%|████▍     | 52200/117610 [4:17:13<4:52:54,  3.72it/s]

{'loss': 5.0929, 'learning_rate': 2.9608291424111746e-05, 'epoch': 0.89}


 44%|████▍     | 52300/117610 [4:17:40<4:51:48,  3.73it/s]

{'loss': 5.1322, 'learning_rate': 2.9542302783577768e-05, 'epoch': 0.89}


 45%|████▍     | 52400/117610 [4:18:06<4:53:12,  3.71it/s]

{'loss': 5.0885, 'learning_rate': 2.9476281399204042e-05, 'epoch': 0.89}


 45%|████▍     | 52500/117610 [4:18:40<44:42:00,  2.47s/it]

{'loss': 5.1239, 'learning_rate': 2.9410227746915176e-05, 'epoch': 0.89}


 45%|████▍     | 52601/117610 [4:19:07<4:35:07,  3.94it/s] 

{'loss': 5.1072, 'learning_rate': 2.934414230286837e-05, 'epoch': 0.89}


 45%|████▍     | 52700/117610 [4:19:33<4:50:51,  3.72it/s]

{'loss': 5.0533, 'learning_rate': 2.9278025543450005e-05, 'epoch': 0.9}


 45%|████▍     | 52800/117610 [4:20:00<4:43:54,  3.80it/s]

{'loss': 5.0758, 'learning_rate': 2.9211877945272202e-05, 'epoch': 0.9}


 45%|████▍     | 52900/117610 [4:20:26<4:50:27,  3.71it/s]

{'loss': 5.1019, 'learning_rate': 2.91456999851694e-05, 'epoch': 0.9}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1051, 'learning_rate': 2.9079492140194886e-05, 'epoch': 0.9}


 45%|████▌     | 53100/117610 [4:21:34<4:43:01,  3.80it/s] 

{'loss': 5.0992, 'learning_rate': 2.9013254887617385e-05, 'epoch': 0.9}


 45%|████▌     | 53200/117610 [4:22:01<4:35:00,  3.90it/s]

{'loss': 5.1182, 'learning_rate': 2.894698870491762e-05, 'epoch': 0.9}


 45%|████▌     | 53300/117610 [4:22:27<4:50:37,  3.69it/s]

{'loss': 5.1025, 'learning_rate': 2.8880694069784862e-05, 'epoch': 0.91}


 45%|████▌     | 53400/117610 [4:22:53<4:42:16,  3.79it/s]

{'loss': 5.1052, 'learning_rate': 2.881437146011346e-05, 'epoch': 0.91}


 45%|████▌     | 53500/117610 [4:23:34<82:34:24,  4.64s/it]

{'loss': 5.1823, 'learning_rate': 2.874802135399945e-05, 'epoch': 0.91}


 46%|████▌     | 53600/117610 [4:24:00<4:37:38,  3.84it/s] 

{'loss': 5.1284, 'learning_rate': 2.868164422973706e-05, 'epoch': 0.91}


 46%|████▌     | 53700/117610 [4:24:27<4:37:17,  3.84it/s]

{'loss': 5.0808, 'learning_rate': 2.8615240565815298e-05, 'epoch': 0.91}


 46%|████▌     | 53800/117610 [4:24:53<4:46:40,  3.71it/s]

{'loss': 5.0966, 'learning_rate': 2.854881084091447e-05, 'epoch': 0.91}


 46%|████▌     | 53900/117610 [4:25:20<4:43:34,  3.74it/s]

{'loss': 5.0488, 'learning_rate': 2.848235553390276e-05, 'epoch': 0.92}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1108, 'learning_rate': 2.8415875123832763e-05, 'epoch': 0.92}


 46%|████▌     | 54100/117610 [4:26:24<4:34:31,  3.86it/s] 

{'loss': 5.0888, 'learning_rate': 2.834937008993803e-05, 'epoch': 0.92}


 46%|████▌     | 54200/117610 [4:26:51<4:38:39,  3.79it/s]

{'loss': 5.0118, 'learning_rate': 2.8282840911629627e-05, 'epoch': 0.92}


 46%|████▌     | 54300/117610 [4:27:17<4:43:26,  3.72it/s]

{'loss': 5.076, 'learning_rate': 2.8216288068492636e-05, 'epoch': 0.92}


 46%|████▋     | 54400/117610 [4:27:43<4:25:01,  3.98it/s]

{'loss': 5.1018, 'learning_rate': 2.8149712040282765e-05, 'epoch': 0.93}


 46%|████▋     | 54501/117610 [4:28:23<54:12:35,  3.09s/it]

{'loss': 5.0721, 'learning_rate': 2.8083113306922837e-05, 'epoch': 0.93}


 46%|████▋     | 54600/117610 [4:28:50<4:24:25,  3.97it/s] 

{'loss': 5.1149, 'learning_rate': 2.8016492348499356e-05, 'epoch': 0.93}


 47%|████▋     | 54700/117610 [4:29:16<4:37:53,  3.77it/s]

{'loss': 5.0906, 'learning_rate': 2.7949849645259046e-05, 'epoch': 0.93}


 47%|████▋     | 54800/117610 [4:29:42<4:38:06,  3.76it/s]

{'loss': 5.1153, 'learning_rate': 2.7883185677605355e-05, 'epoch': 0.93}


 47%|████▋     | 54900/117610 [4:30:09<4:38:48,  3.75it/s]

{'loss': 5.0754, 'learning_rate': 2.7816500926095047e-05, 'epoch': 0.93}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.1123, 'learning_rate': 2.7749795871434685e-05, 'epoch': 0.94}


 47%|████▋     | 55100/117610 [4:31:14<4:35:51,  3.78it/s] 

{'loss': 5.1116, 'learning_rate': 2.7683070994477223e-05, 'epoch': 0.94}


 47%|████▋     | 55200/117610 [4:31:40<4:32:51,  3.81it/s]

{'loss': 5.0821, 'learning_rate': 2.7616326776218472e-05, 'epoch': 0.94}


 47%|████▋     | 55300/117610 [4:32:07<4:40:57,  3.70it/s]

{'loss': 5.0804, 'learning_rate': 2.754956369779369e-05, 'epoch': 0.94}


 47%|████▋     | 55400/117610 [4:32:33<4:38:13,  3.73it/s]

{'loss': 5.0767, 'learning_rate': 2.748278224047407e-05, 'epoch': 0.94}


 47%|████▋     | 55500/117610 [4:33:10<57:50:16,  3.35s/it]

{'loss': 5.0226, 'learning_rate': 2.7415982885663315e-05, 'epoch': 0.94}


 47%|████▋     | 55600/117610 [4:33:37<4:30:36,  3.82it/s] 

{'loss': 5.1044, 'learning_rate': 2.7349166114894144e-05, 'epoch': 0.95}


 47%|████▋     | 55701/117610 [4:34:04<4:19:12,  3.98it/s]

{'loss': 5.0842, 'learning_rate': 2.728233240982479e-05, 'epoch': 0.95}


 47%|████▋     | 55800/117610 [4:34:30<4:43:37,  3.63it/s]

{'loss': 5.0258, 'learning_rate': 2.72154822522356e-05, 'epoch': 0.95}


 48%|████▊     | 55900/117610 [4:34:56<4:23:59,  3.90it/s]

{'loss': 5.05, 'learning_rate': 2.7148616124025495e-05, 'epoch': 0.95}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.0678, 'learning_rate': 2.7081734507208538e-05, 'epoch': 0.95}


 48%|████▊     | 56100/117610 [4:36:00<4:36:43,  3.70it/s] 

{'loss': 5.0927, 'learning_rate': 2.7014837883910433e-05, 'epoch': 0.95}


 48%|████▊     | 56200/117610 [4:36:26<4:30:27,  3.78it/s]

{'loss': 5.023, 'learning_rate': 2.6947926736365058e-05, 'epoch': 0.96}


 48%|████▊     | 56300/117610 [4:36:52<4:30:57,  3.77it/s]

{'loss': 5.0048, 'learning_rate': 2.6881001546911005e-05, 'epoch': 0.96}


 48%|████▊     | 56400/117610 [4:37:18<4:12:02,  4.05it/s]

{'loss': 5.0258, 'learning_rate': 2.6814062797988077e-05, 'epoch': 0.96}


 48%|████▊     | 56500/117610 [4:37:54<54:18:49,  3.20s/it]

{'loss': 5.0622, 'learning_rate': 2.6747110972133826e-05, 'epoch': 0.96}


 48%|████▊     | 56600/117610 [4:38:20<4:15:03,  3.99it/s] 

{'loss': 5.0451, 'learning_rate': 2.6680146551980074e-05, 'epoch': 0.96}


 48%|████▊     | 56700/117610 [4:38:47<4:19:07,  3.92it/s]

{'loss': 5.0086, 'learning_rate': 2.6613170020249416e-05, 'epoch': 0.96}


 48%|████▊     | 56800/117610 [4:39:13<4:32:02,  3.73it/s]

{'loss': 5.0048, 'learning_rate': 2.654618185975179e-05, 'epoch': 0.97}


 48%|████▊     | 56900/117610 [4:39:40<4:28:38,  3.77it/s]

{'loss': 4.9857, 'learning_rate': 2.6479182553380915e-05, 'epoch': 0.97}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.9956, 'learning_rate': 2.6412172584110906e-05, 'epoch': 0.97}


 49%|████▊     | 57100/117610 [4:40:45<4:13:57,  3.97it/s] 

{'loss': 5.0032, 'learning_rate': 2.634515243499269e-05, 'epoch': 0.97}


 49%|████▊     | 57200/117610 [4:41:12<4:22:10,  3.84it/s]

{'loss': 4.9385, 'learning_rate': 2.6278122589150634e-05, 'epoch': 0.97}


 49%|████▊     | 57300/117610 [4:41:38<4:26:40,  3.77it/s]

{'loss': 5.0092, 'learning_rate': 2.6211083529778946e-05, 'epoch': 0.97}


 49%|████▉     | 57400/117610 [4:42:05<4:33:13,  3.67it/s]

{'loss': 4.9841, 'learning_rate': 2.6144035740138302e-05, 'epoch': 0.98}


 49%|████▉     | 57500/117610 [4:42:42<59:40:01,  3.57s/it]

{'loss': 5.0197, 'learning_rate': 2.6076979703552275e-05, 'epoch': 0.98}


 49%|████▉     | 57600/117610 [4:43:08<4:21:02,  3.83it/s] 

{'loss': 5.021, 'learning_rate': 2.600991590340391e-05, 'epoch': 0.98}


 49%|████▉     | 57700/117610 [4:43:35<4:25:45,  3.76it/s]

{'loss': 4.9995, 'learning_rate': 2.5942844823132207e-05, 'epoch': 0.98}


 49%|████▉     | 57800/117610 [4:44:01<4:26:38,  3.74it/s]

{'loss': 5.0221, 'learning_rate': 2.5875766946228648e-05, 'epoch': 0.98}


 49%|████▉     | 57900/117610 [4:44:28<4:24:14,  3.77it/s]

{'loss': 5.0515, 'learning_rate': 2.5808682756233715e-05, 'epoch': 0.98}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 5.0428, 'learning_rate': 2.574159273673339e-05, 'epoch': 0.99}


 49%|████▉     | 58100/117610 [4:45:33<4:14:47,  3.89it/s] 

{'loss': 5.004, 'learning_rate': 2.5674497371355678e-05, 'epoch': 0.99}


 49%|████▉     | 58200/117610 [4:45:59<4:09:39,  3.97it/s]

{'loss': 4.9568, 'learning_rate': 2.5607397143767126e-05, 'epoch': 0.99}


 50%|████▉     | 58300/117610 [4:46:26<4:16:47,  3.85it/s]

{'loss': 4.9984, 'learning_rate': 2.554029253766933e-05, 'epoch': 0.99}


 50%|████▉     | 58401/117610 [4:46:53<4:00:17,  4.11it/s]

{'loss': 5.023, 'learning_rate': 2.5473184036795435e-05, 'epoch': 0.99}


 50%|████▉     | 58500/117610 [4:47:34<77:56:01,  4.75s/it]

{'loss': 4.994, 'learning_rate': 2.5406072124906688e-05, 'epoch': 0.99}


 50%|████▉     | 58600/117610 [4:48:00<4:23:56,  3.73it/s] 

{'loss': 4.9958, 'learning_rate': 2.5338957285788918e-05, 'epoch': 1.0}


 50%|████▉     | 58700/117610 [4:48:26<4:16:05,  3.83it/s]

{'loss': 4.9541, 'learning_rate': 2.5271840003249032e-05, 'epoch': 1.0}


 50%|████▉     | 58800/117610 [4:48:53<4:17:57,  3.80it/s]

{'loss': 5.0028, 'learning_rate': 2.520472076111158e-05, 'epoch': 1.0}


 50%|█████     | 58900/117610 [4:49:57<4:23:10,  3.72it/s]  

{'loss': 4.963, 'learning_rate': 2.5137600043215216e-05, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.9141, 'learning_rate': 2.5070478333409247e-05, 'epoch': 1.0}


 50%|█████     | 59100/117610 [4:51:01<4:16:23,  3.80it/s] 

{'loss': 4.9121, 'learning_rate': 2.500335611555012e-05, 'epoch': 1.01}


 50%|█████     | 59200/117610 [4:51:27<4:09:00,  3.91it/s]

{'loss': 4.9003, 'learning_rate': 2.4936233873497957e-05, 'epoch': 1.01}


 50%|█████     | 59300/117610 [4:51:53<4:08:38,  3.91it/s]

{'loss': 4.937, 'learning_rate': 2.4869112091113042e-05, 'epoch': 1.01}


 51%|█████     | 59400/117610 [4:52:20<4:13:26,  3.83it/s]

{'loss': 4.9474, 'learning_rate': 2.4801991252252357e-05, 'epoch': 1.01}


 51%|█████     | 59500/117610 [4:53:01<76:14:30,  4.72s/it]

{'loss': 4.9623, 'learning_rate': 2.4734871840766062e-05, 'epoch': 1.01}


 51%|█████     | 59600/117610 [4:53:27<4:13:57,  3.81it/s] 

{'loss': 4.9076, 'learning_rate': 2.466775434049406e-05, 'epoch': 1.01}


 51%|█████     | 59700/117610 [4:53:53<4:17:27,  3.75it/s]

{'loss': 4.9284, 'learning_rate': 2.4600639235262445e-05, 'epoch': 1.02}


 51%|█████     | 59800/117610 [4:54:19<4:13:53,  3.79it/s]

{'loss': 4.9523, 'learning_rate': 2.4533527008880065e-05, 'epoch': 1.02}


 51%|█████     | 59900/117610 [4:54:46<4:09:11,  3.86it/s]

{'loss': 4.8976, 'learning_rate': 2.4466418145134995e-05, 'epoch': 1.02}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8871, 'learning_rate': 2.439931312779111e-05, 'epoch': 1.02}


 51%|█████     | 60100/117610 [4:55:54<3:55:03,  4.08it/s] 

{'loss': 4.8916, 'learning_rate': 2.433221244058451e-05, 'epoch': 1.02}


 51%|█████     | 60200/117610 [4:56:20<4:09:34,  3.83it/s]

{'loss': 4.9454, 'learning_rate': 2.4265116567220112e-05, 'epoch': 1.02}


 51%|█████▏    | 60300/117610 [4:56:46<4:01:45,  3.95it/s]

{'loss': 4.8964, 'learning_rate': 2.419802599136812e-05, 'epoch': 1.03}


 51%|█████▏    | 60400/117610 [4:57:13<4:08:13,  3.84it/s]

{'loss': 4.9439, 'learning_rate': 2.413094119666056e-05, 'epoch': 1.03}


 51%|█████▏    | 60500/117610 [4:57:50<56:19:49,  3.55s/it]

{'loss': 4.9567, 'learning_rate': 2.406386266668777e-05, 'epoch': 1.03}


 52%|█████▏    | 60600/117610 [4:58:16<4:09:36,  3.81it/s] 

{'loss': 4.9351, 'learning_rate': 2.3996790884994924e-05, 'epoch': 1.03}


 52%|█████▏    | 60700/117610 [4:58:43<4:08:48,  3.81it/s]

{'loss': 4.9345, 'learning_rate': 2.3929726335078584e-05, 'epoch': 1.03}


 52%|█████▏    | 60800/117610 [4:59:09<4:08:26,  3.81it/s]

{'loss': 4.9457, 'learning_rate': 2.386266950038314e-05, 'epoch': 1.03}


 52%|█████▏    | 60900/117610 [4:59:36<4:10:06,  3.78it/s]

{'loss': 4.9586, 'learning_rate': 2.3795620864297377e-05, 'epoch': 1.04}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.9184, 'learning_rate': 2.3728580910151004e-05, 'epoch': 1.04}


 52%|█████▏    | 61100/117610 [5:00:42<4:02:17,  3.89it/s] 

{'loss': 4.895, 'learning_rate': 2.366155012121111e-05, 'epoch': 1.04}


 52%|█████▏    | 61200/117610 [5:01:09<3:51:35,  4.06it/s]

{'loss': 4.9086, 'learning_rate': 2.3594528980678743e-05, 'epoch': 1.04}


 52%|█████▏    | 61300/117610 [5:01:36<4:12:34,  3.72it/s]

{'loss': 4.9139, 'learning_rate': 2.352751797168537e-05, 'epoch': 1.04}


 52%|█████▏    | 61400/117610 [5:02:02<4:10:02,  3.75it/s]

{'loss': 4.8941, 'learning_rate': 2.346051757728946e-05, 'epoch': 1.04}


 52%|█████▏    | 61500/117610 [5:02:36<41:29:29,  2.66s/it]

{'loss': 4.9398, 'learning_rate': 2.339352828047294e-05, 'epoch': 1.05}


 52%|█████▏    | 61600/117610 [5:03:03<3:56:37,  3.95it/s] 

{'loss': 4.9033, 'learning_rate': 2.3326550564137727e-05, 'epoch': 1.05}


 52%|█████▏    | 61700/117610 [5:03:29<4:11:24,  3.71it/s]

{'loss': 4.8852, 'learning_rate': 2.3259584911102302e-05, 'epoch': 1.05}


 53%|█████▎    | 61800/117610 [5:03:56<4:19:35,  3.58it/s]

{'loss': 4.836, 'learning_rate': 2.319263180409815e-05, 'epoch': 1.05}


 53%|█████▎    | 61900/117610 [5:04:23<4:06:05,  3.77it/s]

{'loss': 4.8973, 'learning_rate': 2.312569172576631e-05, 'epoch': 1.05}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8515, 'learning_rate': 2.3058765158653938e-05, 'epoch': 1.05}


 53%|█████▎    | 62100/117610 [5:05:27<4:04:55,  3.78it/s] 

{'loss': 4.8972, 'learning_rate': 2.299185258521076e-05, 'epoch': 1.06}


 53%|█████▎    | 62200/117610 [5:05:54<3:57:28,  3.89it/s]

{'loss': 4.9356, 'learning_rate': 2.292495448778565e-05, 'epoch': 1.06}


 53%|█████▎    | 62300/117610 [5:06:20<3:58:21,  3.87it/s]

{'loss': 4.8725, 'learning_rate': 2.28580713486231e-05, 'epoch': 1.06}


 53%|█████▎    | 62400/117610 [5:06:46<4:02:12,  3.80it/s]

{'loss': 4.8946, 'learning_rate': 2.2791203649859813e-05, 'epoch': 1.06}


 53%|█████▎    | 62500/117610 [5:07:19<32:24:00,  2.12s/it]

{'loss': 4.8901, 'learning_rate': 2.2724351873521156e-05, 'epoch': 1.06}


 53%|█████▎    | 62600/117610 [5:07:46<4:01:43,  3.79it/s] 

{'loss': 4.9162, 'learning_rate': 2.265751650151773e-05, 'epoch': 1.06}


 53%|█████▎    | 62700/117610 [5:08:12<4:11:05,  3.64it/s]

{'loss': 4.8841, 'learning_rate': 2.2590698015641864e-05, 'epoch': 1.07}


 53%|█████▎    | 62800/117610 [5:08:39<3:56:24,  3.86it/s]

{'loss': 4.8948, 'learning_rate': 2.2523896897564204e-05, 'epoch': 1.07}


 53%|█████▎    | 62900/117610 [5:09:05<3:58:57,  3.82it/s]

{'loss': 4.936, 'learning_rate': 2.245711362883015e-05, 'epoch': 1.07}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.882, 'learning_rate': 2.2390348690856456e-05, 'epoch': 1.07}


 54%|█████▎    | 63100/117610 [5:10:13<4:05:28,  3.70it/s] 

{'loss': 4.8526, 'learning_rate': 2.2323602564927746e-05, 'epoch': 1.07}


 54%|█████▎    | 63200/117610 [5:10:40<4:07:01,  3.67it/s]

{'loss': 4.8985, 'learning_rate': 2.2256875732193015e-05, 'epoch': 1.07}


 54%|█████▍    | 63300/117610 [5:11:06<3:53:56,  3.87it/s]

{'loss': 4.8907, 'learning_rate': 2.2190168673662185e-05, 'epoch': 1.08}


 54%|█████▍    | 63400/117610 [5:11:32<3:47:16,  3.98it/s]

{'loss': 4.9241, 'learning_rate': 2.2123481870202626e-05, 'epoch': 1.08}


 54%|█████▍    | 63500/117610 [5:12:13<70:11:07,  4.67s/it]

{'loss': 4.8696, 'learning_rate': 2.2056815802535724e-05, 'epoch': 1.08}


 54%|█████▍    | 63600/117610 [5:12:39<3:49:18,  3.93it/s] 

{'loss': 4.8866, 'learning_rate': 2.1990170951233365e-05, 'epoch': 1.08}


 54%|█████▍    | 63700/117610 [5:13:06<3:53:46,  3.84it/s]

{'loss': 4.8688, 'learning_rate': 2.1923547796714487e-05, 'epoch': 1.08}


 54%|█████▍    | 63800/117610 [5:13:32<3:48:00,  3.93it/s]

{'loss': 4.894, 'learning_rate': 2.1856946819241634e-05, 'epoch': 1.08}


 54%|█████▍    | 63900/117610 [5:13:59<3:53:48,  3.83it/s]

{'loss': 4.818, 'learning_rate': 2.1790368498917507e-05, 'epoch': 1.09}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8674, 'learning_rate': 2.172381331568144e-05, 'epoch': 1.09}


 55%|█████▍    | 64100/117610 [5:15:04<4:03:54,  3.66it/s] 

{'loss': 4.8269, 'learning_rate': 2.165728174930599e-05, 'epoch': 1.09}


 55%|█████▍    | 64200/117610 [5:15:30<3:57:03,  3.76it/s]

{'loss': 4.8456, 'learning_rate': 2.1590774279393498e-05, 'epoch': 1.09}


 55%|█████▍    | 64300/117610 [5:15:57<3:56:04,  3.76it/s]

{'loss': 4.8492, 'learning_rate': 2.152429138537257e-05, 'epoch': 1.09}


 55%|█████▍    | 64400/117610 [5:16:23<3:48:30,  3.88it/s]

{'loss': 4.8372, 'learning_rate': 2.1457833546494653e-05, 'epoch': 1.1}


 55%|█████▍    | 64501/117610 [5:17:02<41:37:55,  2.82s/it]

{'loss': 4.8154, 'learning_rate': 2.13914012418306e-05, 'epoch': 1.1}


 55%|█████▍    | 64600/117610 [5:17:28<3:50:46,  3.83it/s] 

{'loss': 4.86, 'learning_rate': 2.1324994950267185e-05, 'epoch': 1.1}


 55%|█████▌    | 64700/117610 [5:17:54<3:51:09,  3.81it/s]

{'loss': 4.855, 'learning_rate': 2.1258615150503675e-05, 'epoch': 1.1}


 55%|█████▌    | 64800/117610 [5:18:20<3:48:07,  3.86it/s]

{'loss': 4.876, 'learning_rate': 2.1192262321048333e-05, 'epoch': 1.1}


 55%|█████▌    | 64900/117610 [5:18:47<3:55:55,  3.72it/s]

{'loss': 4.8548, 'learning_rate': 2.1125936940215054e-05, 'epoch': 1.1}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8524, 'learning_rate': 2.1059639486119833e-05, 'epoch': 1.11}


 55%|█████▌    | 65100/117610 [5:19:54<3:56:03,  3.71it/s] 

{'loss': 4.8167, 'learning_rate': 2.0993370436677346e-05, 'epoch': 1.11}


 55%|█████▌    | 65200/117610 [5:20:20<3:53:47,  3.74it/s]

{'loss': 4.8651, 'learning_rate': 2.0927130269597516e-05, 'epoch': 1.11}


 56%|█████▌    | 65300/117610 [5:20:47<3:47:01,  3.84it/s]

{'loss': 4.8508, 'learning_rate': 2.0860919462382093e-05, 'epoch': 1.11}


 56%|█████▌    | 65400/117610 [5:21:13<3:55:41,  3.69it/s]

{'loss': 4.8255, 'learning_rate': 2.0794738492321133e-05, 'epoch': 1.11}


 56%|█████▌    | 65500/117610 [5:21:53<65:27:45,  4.52s/it]

{'loss': 4.834, 'learning_rate': 2.0728587836489645e-05, 'epoch': 1.11}


 56%|█████▌    | 65600/117610 [5:22:20<3:50:36,  3.76it/s] 

{'loss': 4.8455, 'learning_rate': 2.0662467971744092e-05, 'epoch': 1.12}


 56%|█████▌    | 65700/117610 [5:22:46<4:21:16,  3.31it/s]

{'loss': 4.8868, 'learning_rate': 2.059637937471899e-05, 'epoch': 1.12}


 56%|█████▌    | 65800/117610 [5:23:12<3:45:59,  3.82it/s]

{'loss': 4.8635, 'learning_rate': 2.053032252182345e-05, 'epoch': 1.12}


 56%|█████▌    | 65900/117610 [5:23:39<3:42:05,  3.88it/s]

{'loss': 4.7969, 'learning_rate': 2.0464297889237734e-05, 'epoch': 1.12}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.7995, 'learning_rate': 2.0398305952909878e-05, 'epoch': 1.12}


 56%|█████▌    | 66100/117610 [5:24:47<3:36:12,  3.97it/s] 

{'loss': 4.8352, 'learning_rate': 2.0332347188552194e-05, 'epoch': 1.12}


 56%|█████▋    | 66200/117610 [5:25:14<3:44:46,  3.81it/s]

{'loss': 4.8469, 'learning_rate': 2.0266422071637862e-05, 'epoch': 1.13}


 56%|█████▋    | 66300/117610 [5:25:40<3:45:28,  3.79it/s]

{'loss': 4.8477, 'learning_rate': 2.020053107739754e-05, 'epoch': 1.13}


 56%|█████▋    | 66400/117610 [5:26:06<3:47:45,  3.75it/s]

{'loss': 4.8266, 'learning_rate': 2.013467468081588e-05, 'epoch': 1.13}


 57%|█████▋    | 66500/117610 [5:26:46<64:25:41,  4.54s/it]

{'loss': 4.7936, 'learning_rate': 2.0068853356628154e-05, 'epoch': 1.13}


 57%|█████▋    | 66600/117610 [5:27:13<3:46:23,  3.76it/s] 

{'loss': 4.8393, 'learning_rate': 2.0003067579316774e-05, 'epoch': 1.13}


 57%|█████▋    | 66700/117610 [5:27:39<3:41:54,  3.82it/s]

{'loss': 4.8497, 'learning_rate': 1.9937317823107966e-05, 'epoch': 1.13}


 57%|█████▋    | 66800/117610 [5:28:05<3:41:07,  3.83it/s]

{'loss': 4.824, 'learning_rate': 1.987160456196824e-05, 'epoch': 1.14}


 57%|█████▋    | 66900/117610 [5:28:31<3:42:27,  3.80it/s]

{'loss': 4.8169, 'learning_rate': 1.9805928269601032e-05, 'epoch': 1.14}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8348, 'learning_rate': 1.974028941944332e-05, 'epoch': 1.14}


 57%|█████▋    | 67100/117610 [5:29:36<3:46:50,  3.71it/s] 

{'loss': 4.842, 'learning_rate': 1.967468848466214e-05, 'epoch': 1.14}


 57%|█████▋    | 67200/117610 [5:30:02<3:42:58,  3.77it/s]

{'loss': 4.8691, 'learning_rate': 1.960912593815121e-05, 'epoch': 1.14}


 57%|█████▋    | 67300/117610 [5:30:28<3:42:09,  3.77it/s]

{'loss': 4.7828, 'learning_rate': 1.954360225252754e-05, 'epoch': 1.14}


 57%|█████▋    | 67400/117610 [5:30:55<3:37:29,  3.85it/s]

{'loss': 4.8128, 'learning_rate': 1.9478117900127997e-05, 'epoch': 1.15}


 57%|█████▋    | 67500/117610 [5:31:33<56:28:23,  4.06s/it]

{'loss': 4.8223, 'learning_rate': 1.9412673353005905e-05, 'epoch': 1.15}


 57%|█████▋    | 67600/117610 [5:32:00<3:27:34,  4.02it/s] 

{'loss': 4.8083, 'learning_rate': 1.9347269082927645e-05, 'epoch': 1.15}


 58%|█████▊    | 67700/117610 [5:32:26<3:42:14,  3.74it/s]

{'loss': 4.8496, 'learning_rate': 1.9281905561369273e-05, 'epoch': 1.15}


 58%|█████▊    | 67800/117610 [5:32:53<3:36:42,  3.83it/s]

{'loss': 4.7876, 'learning_rate': 1.921658325951308e-05, 'epoch': 1.15}


 58%|█████▊    | 67900/117610 [5:33:19<3:27:29,  3.99it/s]

{'loss': 4.7727, 'learning_rate': 1.9151302648244228e-05, 'epoch': 1.15}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.7986, 'learning_rate': 1.9086064198147334e-05, 'epoch': 1.16}


 58%|█████▊    | 68100/117610 [5:34:27<3:38:53,  3.77it/s] 

{'loss': 4.7909, 'learning_rate': 1.9020868379503133e-05, 'epoch': 1.16}


 58%|█████▊    | 68200/117610 [5:34:53<3:44:28,  3.67it/s]

{'loss': 4.8122, 'learning_rate': 1.895571566228499e-05, 'epoch': 1.16}


 58%|█████▊    | 68300/117610 [5:35:20<3:40:07,  3.73it/s]

{'loss': 4.8056, 'learning_rate': 1.8890606516155594e-05, 'epoch': 1.16}


 58%|█████▊    | 68400/117610 [5:35:46<3:38:19,  3.76it/s]

{'loss': 4.8611, 'learning_rate': 1.882554141046356e-05, 'epoch': 1.16}


 58%|█████▊    | 68500/117610 [5:36:27<62:37:02,  4.59s/it]

{'loss': 4.7991, 'learning_rate': 1.8760520814240008e-05, 'epoch': 1.16}


 58%|█████▊    | 68600/117610 [5:36:53<3:35:59,  3.78it/s] 

{'loss': 4.7801, 'learning_rate': 1.8695545196195206e-05, 'epoch': 1.17}


 58%|█████▊    | 68700/117610 [5:37:19<3:30:45,  3.87it/s]

{'loss': 4.7829, 'learning_rate': 1.8630615024715193e-05, 'epoch': 1.17}


 58%|█████▊    | 68800/117610 [5:37:46<3:33:34,  3.81it/s]

{'loss': 4.7942, 'learning_rate': 1.856573076785842e-05, 'epoch': 1.17}


 59%|█████▊    | 68900/117610 [5:38:12<3:25:55,  3.94it/s]

{'loss': 4.8064, 'learning_rate': 1.850089289335233e-05, 'epoch': 1.17}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8312, 'learning_rate': 1.8436101868590014e-05, 'epoch': 1.17}


 59%|█████▉    | 69100/117610 [5:39:17<3:41:13,  3.65it/s] 

{'loss': 4.7782, 'learning_rate': 1.837135816062685e-05, 'epoch': 1.18}


 59%|█████▉    | 69200/117610 [5:39:43<3:21:50,  4.00it/s]

{'loss': 4.7938, 'learning_rate': 1.830666223617714e-05, 'epoch': 1.18}


 59%|█████▉    | 69300/117610 [5:40:10<3:24:30,  3.94it/s]

{'loss': 4.8352, 'learning_rate': 1.8242014561610702e-05, 'epoch': 1.18}


 59%|█████▉    | 69400/117610 [5:40:36<3:30:03,  3.83it/s]

{'loss': 4.8013, 'learning_rate': 1.8177415602949542e-05, 'epoch': 1.18}


 59%|█████▉    | 69500/117610 [5:41:16<58:02:00,  4.34s/it]

{'loss': 4.7946, 'learning_rate': 1.8112865825864518e-05, 'epoch': 1.18}


 59%|█████▉    | 69600/117610 [5:41:42<3:26:27,  3.88it/s] 

{'loss': 4.7918, 'learning_rate': 1.8048365695671933e-05, 'epoch': 1.18}


 59%|█████▉    | 69700/117610 [5:42:09<3:35:15,  3.71it/s]

{'loss': 4.7423, 'learning_rate': 1.7983915677330195e-05, 'epoch': 1.19}


 59%|█████▉    | 69800/117610 [5:42:35<3:15:21,  4.08it/s]

{'loss': 4.7558, 'learning_rate': 1.791951623543649e-05, 'epoch': 1.19}


 59%|█████▉    | 69900/117610 [5:43:01<3:28:55,  3.81it/s]

{'loss': 4.7962, 'learning_rate': 1.7855167834223425e-05, 'epoch': 1.19}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8076, 'learning_rate': 1.7790870937555647e-05, 'epoch': 1.19}


 60%|█████▉    | 70100/117610 [5:44:10<3:20:09,  3.96it/s] 

{'loss': 4.7862, 'learning_rate': 1.772662600892654e-05, 'epoch': 1.19}


 60%|█████▉    | 70200/117610 [5:44:36<3:28:52,  3.78it/s]

{'loss': 4.7338, 'learning_rate': 1.7662433511454874e-05, 'epoch': 1.19}


 60%|█████▉    | 70300/117610 [5:45:03<3:16:33,  4.01it/s]

{'loss': 4.7637, 'learning_rate': 1.759829390788145e-05, 'epoch': 1.2}


 60%|█████▉    | 70400/117610 [5:45:29<3:33:28,  3.69it/s]

{'loss': 4.7333, 'learning_rate': 1.7534207660565784e-05, 'epoch': 1.2}


 60%|█████▉    | 70500/117610 [5:46:10<60:43:19,  4.64s/it]

{'loss': 4.7895, 'learning_rate': 1.747017523148275e-05, 'epoch': 1.2}


 60%|██████    | 70600/117610 [5:46:36<3:30:29,  3.72it/s] 

{'loss': 4.7407, 'learning_rate': 1.7406197082219294e-05, 'epoch': 1.2}


 60%|██████    | 70700/117610 [5:47:02<3:22:19,  3.86it/s]

{'loss': 4.7591, 'learning_rate': 1.7342273673971053e-05, 'epoch': 1.2}


 60%|██████    | 70800/117610 [5:47:28<3:30:41,  3.70it/s]

{'loss': 4.7694, 'learning_rate': 1.7278405467539064e-05, 'epoch': 1.2}


 60%|██████    | 70900/117610 [5:47:55<3:22:58,  3.84it/s]

{'loss': 4.791, 'learning_rate': 1.721459292332644e-05, 'epoch': 1.21}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8025, 'learning_rate': 1.7150836501335037e-05, 'epoch': 1.21}


 60%|██████    | 71100/117610 [5:49:02<3:21:19,  3.85it/s] 

{'loss': 4.8426, 'learning_rate': 1.708713666116215e-05, 'epoch': 1.21}


 61%|██████    | 71200/117610 [5:49:28<3:12:25,  4.02it/s]

{'loss': 4.7734, 'learning_rate': 1.702349386199718e-05, 'epoch': 1.21}


 61%|██████    | 71300/117610 [5:49:55<3:23:13,  3.80it/s]

{'loss': 4.7817, 'learning_rate': 1.6959908562618378e-05, 'epoch': 1.21}


 61%|██████    | 71400/117610 [5:50:21<3:23:54,  3.78it/s]

{'loss': 4.7483, 'learning_rate': 1.6896381221389463e-05, 'epoch': 1.21}


 61%|██████    | 71500/117610 [5:50:55<33:01:33,  2.58s/it]

{'loss': 4.7887, 'learning_rate': 1.6832912296256352e-05, 'epoch': 1.22}


 61%|██████    | 71600/117610 [5:51:21<3:23:04,  3.78it/s] 

{'loss': 4.7683, 'learning_rate': 1.6769502244743907e-05, 'epoch': 1.22}


 61%|██████    | 71700/117610 [5:51:48<3:15:41,  3.91it/s]

{'loss': 4.7419, 'learning_rate': 1.6706151523952537e-05, 'epoch': 1.22}


 61%|██████    | 71801/117610 [5:52:14<3:08:41,  4.05it/s]

{'loss': 4.7442, 'learning_rate': 1.6642860590554993e-05, 'epoch': 1.22}


 61%|██████    | 71900/117610 [5:52:40<3:28:11,  3.66it/s]

{'loss': 4.7346, 'learning_rate': 1.657962990079301e-05, 'epoch': 1.22}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.8053, 'learning_rate': 1.6516459910474083e-05, 'epoch': 1.22}


 61%|██████▏   | 72100/117610 [5:53:46<3:21:02,  3.77it/s] 

{'loss': 4.7126, 'learning_rate': 1.6453351074968125e-05, 'epoch': 1.23}


 61%|██████▏   | 72200/117610 [5:54:13<3:20:16,  3.78it/s]

{'loss': 4.7764, 'learning_rate': 1.639030384920419e-05, 'epoch': 1.23}


 61%|██████▏   | 72300/117610 [5:54:39<3:20:17,  3.77it/s]

{'loss': 4.7595, 'learning_rate': 1.6327318687667264e-05, 'epoch': 1.23}


 62%|██████▏   | 72400/117610 [5:55:06<3:10:24,  3.96it/s]

{'loss': 4.7089, 'learning_rate': 1.6264396044394876e-05, 'epoch': 1.23}


 62%|██████▏   | 72500/117610 [5:55:40<33:35:38,  2.68s/it]

{'loss': 4.7312, 'learning_rate': 1.62015363729739e-05, 'epoch': 1.23}


 62%|██████▏   | 72600/117610 [5:56:06<3:21:24,  3.72it/s] 

{'loss': 4.7375, 'learning_rate': 1.613874012653729e-05, 'epoch': 1.23}


 62%|██████▏   | 72700/117610 [5:56:32<3:12:47,  3.88it/s]

{'loss': 4.6921, 'learning_rate': 1.607600775776077e-05, 'epoch': 1.24}


 62%|██████▏   | 72800/117610 [5:56:59<3:15:16,  3.82it/s]

{'loss': 4.7568, 'learning_rate': 1.6013339718859605e-05, 'epoch': 1.24}


 62%|██████▏   | 72900/117610 [5:57:25<3:20:57,  3.71it/s]

{'loss': 4.7298, 'learning_rate': 1.5950736461585304e-05, 'epoch': 1.24}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.7118, 'learning_rate': 1.5888198437222435e-05, 'epoch': 1.24}


 62%|██████▏   | 73100/117610 [5:58:33<3:17:58,  3.75it/s] 

{'loss': 4.7771, 'learning_rate': 1.582572609658528e-05, 'epoch': 1.24}


 62%|██████▏   | 73200/117610 [5:58:59<3:16:49,  3.76it/s]

{'loss': 4.7201, 'learning_rate': 1.576331989001465e-05, 'epoch': 1.24}


 62%|██████▏   | 73300/117610 [5:59:26<3:07:43,  3.93it/s]

{'loss': 4.7265, 'learning_rate': 1.570098026737461e-05, 'epoch': 1.25}


 62%|██████▏   | 73400/117610 [5:59:52<3:03:32,  4.01it/s]

{'loss': 4.7312, 'learning_rate': 1.5638707678049274e-05, 'epoch': 1.25}


 62%|██████▏   | 73500/117610 [6:00:32<56:01:39,  4.57s/it]

{'loss': 4.7568, 'learning_rate': 1.5576502570939493e-05, 'epoch': 1.25}


 63%|██████▎   | 73600/117610 [6:00:59<3:13:55,  3.78it/s] 

{'loss': 4.7836, 'learning_rate': 1.551436539445969e-05, 'epoch': 1.25}


 63%|██████▎   | 73700/117610 [6:01:26<3:10:42,  3.84it/s]

{'loss': 4.771, 'learning_rate': 1.545229659653461e-05, 'epoch': 1.25}


 63%|██████▎   | 73800/117610 [6:01:52<3:12:25,  3.79it/s]

{'loss': 4.7121, 'learning_rate': 1.5390296624596057e-05, 'epoch': 1.25}


 63%|██████▎   | 73900/117610 [6:02:18<3:11:29,  3.80it/s]

{'loss': 4.7823, 'learning_rate': 1.5328365925579696e-05, 'epoch': 1.26}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.7176, 'learning_rate': 1.5266504945921833e-05, 'epoch': 1.26}


 63%|██████▎   | 74100/117610 [6:03:19<3:06:08,  3.90it/s] 

{'loss': 4.7015, 'learning_rate': 1.5204714131556214e-05, 'epoch': 1.26}


 63%|██████▎   | 74200/117610 [6:03:45<3:16:56,  3.67it/s]

{'loss': 4.7502, 'learning_rate': 1.5142993927910751e-05, 'epoch': 1.26}


 63%|██████▎   | 74300/117610 [6:04:11<3:04:46,  3.91it/s]

{'loss': 4.6835, 'learning_rate': 1.5081344779904366e-05, 'epoch': 1.26}


 63%|██████▎   | 74400/117610 [6:04:38<3:08:21,  3.82it/s]

{'loss': 4.7745, 'learning_rate': 1.5019767131943765e-05, 'epoch': 1.27}


 63%|██████▎   | 74500/117610 [6:05:10<24:06:07,  2.01s/it]

{'loss': 4.7387, 'learning_rate': 1.4958261427920256e-05, 'epoch': 1.27}


 63%|██████▎   | 74600/117610 [6:05:36<3:05:06,  3.87it/s] 

{'loss': 4.6965, 'learning_rate': 1.489682811120649e-05, 'epoch': 1.27}


 64%|██████▎   | 74700/117610 [6:06:03<3:05:03,  3.86it/s]

{'loss': 4.6814, 'learning_rate': 1.4835467624653324e-05, 'epoch': 1.27}


 64%|██████▎   | 74800/117610 [6:06:29<3:03:35,  3.89it/s]

{'loss': 4.7145, 'learning_rate': 1.477418041058662e-05, 'epoch': 1.27}


 64%|██████▎   | 74900/117610 [6:06:56<3:17:11,  3.61it/s]

{'loss': 4.7401, 'learning_rate': 1.4712966910804027e-05, 'epoch': 1.27}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.7051, 'learning_rate': 1.465182756657181e-05, 'epoch': 1.28}


 64%|██████▍   | 75100/117610 [6:07:54<3:01:07,  3.91it/s] 

{'loss': 4.7287, 'learning_rate': 1.4590762818621692e-05, 'epoch': 1.28}


 64%|██████▍   | 75201/117610 [6:08:20<2:52:31,  4.10it/s]

{'loss': 4.6729, 'learning_rate': 1.4529773107147654e-05, 'epoch': 1.28}


 64%|██████▍   | 75300/117610 [6:08:46<3:12:52,  3.66it/s]

{'loss': 4.7803, 'learning_rate': 1.4468858871802752e-05, 'epoch': 1.28}


 64%|██████▍   | 75400/117610 [6:09:12<3:07:20,  3.76it/s]

{'loss': 4.6909, 'learning_rate': 1.4408020551695962e-05, 'epoch': 1.28}


 64%|██████▍   | 75500/117610 [6:09:53<52:43:31,  4.51s/it]

{'loss': 4.7512, 'learning_rate': 1.434725858538904e-05, 'epoch': 1.28}


 64%|██████▍   | 75600/117610 [6:10:19<3:07:40,  3.73it/s] 

{'loss': 4.7353, 'learning_rate': 1.42865734108933e-05, 'epoch': 1.29}


 64%|██████▍   | 75700/117610 [6:10:46<3:08:39,  3.70it/s]

{'loss': 4.6755, 'learning_rate': 1.4225965465666527e-05, 'epoch': 1.29}


 64%|██████▍   | 75800/117610 [6:11:12<3:01:03,  3.85it/s]

{'loss': 4.6448, 'learning_rate': 1.416543518660975e-05, 'epoch': 1.29}


 65%|██████▍   | 75900/117610 [6:11:38<3:00:42,  3.85it/s]

{'loss': 4.7355, 'learning_rate': 1.4104983010064166e-05, 'epoch': 1.29}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6736, 'learning_rate': 1.4044609371807932e-05, 'epoch': 1.29}


 65%|██████▍   | 76100/117610 [6:12:46<2:59:05,  3.86it/s] 

{'loss': 4.7047, 'learning_rate': 1.3984314707053054e-05, 'epoch': 1.29}


 65%|██████▍   | 76200/117610 [6:13:12<3:03:19,  3.76it/s]

{'loss': 4.7183, 'learning_rate': 1.3924099450442269e-05, 'epoch': 1.3}


 65%|██████▍   | 76300/117610 [6:13:38<2:58:57,  3.85it/s]

{'loss': 4.7348, 'learning_rate': 1.3863964036045868e-05, 'epoch': 1.3}


 65%|██████▍   | 76401/117610 [6:14:05<2:51:36,  4.00it/s]

{'loss': 4.6798, 'learning_rate': 1.3803908897358587e-05, 'epoch': 1.3}


 65%|██████▌   | 76500/117610 [6:14:42<42:58:50,  3.76s/it]

{'loss': 4.7073, 'learning_rate': 1.3743934467296481e-05, 'epoch': 1.3}


 65%|██████▌   | 76600/117610 [6:15:09<3:02:15,  3.75it/s] 

{'loss': 4.7163, 'learning_rate': 1.3684041178193835e-05, 'epoch': 1.3}


 65%|██████▌   | 76700/117610 [6:15:35<2:58:14,  3.83it/s]

{'loss': 4.702, 'learning_rate': 1.3624229461799984e-05, 'epoch': 1.3}


 65%|██████▌   | 76800/117610 [6:16:01<2:58:28,  3.81it/s]

{'loss': 4.7202, 'learning_rate': 1.3564499749276239e-05, 'epoch': 1.31}


 65%|██████▌   | 76900/117610 [6:16:28<2:59:03,  3.79it/s]

{'loss': 4.6699, 'learning_rate': 1.3504852471192794e-05, 'epoch': 1.31}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.66, 'learning_rate': 1.3445288057525587e-05, 'epoch': 1.31}


 66%|██████▌   | 77100/117610 [6:17:37<2:58:52,  3.77it/s] 

{'loss': 4.6902, 'learning_rate': 1.3385806937653219e-05, 'epoch': 1.31}


 66%|██████▌   | 77200/117610 [6:18:03<2:56:05,  3.82it/s]

{'loss': 4.7017, 'learning_rate': 1.3326409540353847e-05, 'epoch': 1.31}


 66%|██████▌   | 77301/117610 [6:18:29<2:42:04,  4.15it/s]

{'loss': 4.6847, 'learning_rate': 1.3267096293802122e-05, 'epoch': 1.31}


 66%|██████▌   | 77400/117610 [6:18:55<2:55:54,  3.81it/s]

{'loss': 4.6986, 'learning_rate': 1.3207867625566064e-05, 'epoch': 1.32}


 66%|██████▌   | 77500/117610 [6:19:33<41:48:09,  3.75s/it]

{'loss': 4.6179, 'learning_rate': 1.3148723962604018e-05, 'epoch': 1.32}


 66%|██████▌   | 77600/117610 [6:19:59<2:58:07,  3.74it/s] 

{'loss': 4.7609, 'learning_rate': 1.3089665731261524e-05, 'epoch': 1.32}


 66%|██████▌   | 77700/117610 [6:20:26<2:58:13,  3.73it/s]

{'loss': 4.66, 'learning_rate': 1.3030693357268314e-05, 'epoch': 1.32}


 66%|██████▌   | 77800/117610 [6:20:52<2:53:52,  3.82it/s]

{'loss': 4.7001, 'learning_rate': 1.297180726573518e-05, 'epoch': 1.32}


 66%|██████▌   | 77900/117610 [6:21:18<2:58:03,  3.72it/s]

{'loss': 4.7107, 'learning_rate': 1.2913007881150929e-05, 'epoch': 1.32}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6873, 'learning_rate': 1.2854295627379365e-05, 'epoch': 1.33}


 66%|██████▋   | 78100/117610 [6:22:25<2:55:22,  3.75it/s] 

{'loss': 4.6655, 'learning_rate': 1.2795670927656153e-05, 'epoch': 1.33}


 66%|██████▋   | 78200/117610 [6:22:51<2:55:33,  3.74it/s]

{'loss': 4.6931, 'learning_rate': 1.2737134204585833e-05, 'epoch': 1.33}


 67%|██████▋   | 78300/117610 [6:23:17<2:50:57,  3.83it/s]

{'loss': 4.6836, 'learning_rate': 1.2678685880138762e-05, 'epoch': 1.33}


 67%|██████▋   | 78400/117610 [6:23:44<2:52:18,  3.79it/s]

{'loss': 4.6713, 'learning_rate': 1.2620326375648048e-05, 'epoch': 1.33}


 67%|██████▋   | 78500/117610 [6:24:23<45:32:08,  4.19s/it]

{'loss': 4.7178, 'learning_rate': 1.256205611180653e-05, 'epoch': 1.33}


 67%|██████▋   | 78600/117610 [6:24:49<2:52:13,  3.78it/s] 

{'loss': 4.6987, 'learning_rate': 1.2503875508663738e-05, 'epoch': 1.34}


 67%|██████▋   | 78701/117610 [6:25:16<2:40:35,  4.04it/s]

{'loss': 4.6236, 'learning_rate': 1.2445784985622894e-05, 'epoch': 1.34}


 67%|██████▋   | 78800/117610 [6:25:42<2:42:09,  3.99it/s]

{'loss': 4.6779, 'learning_rate': 1.2387784961437835e-05, 'epoch': 1.34}


 67%|██████▋   | 78900/117610 [6:26:08<2:51:18,  3.77it/s]

{'loss': 4.6697, 'learning_rate': 1.2329875854210044e-05, 'epoch': 1.34}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.66, 'learning_rate': 1.227205808138559e-05, 'epoch': 1.34}


 67%|██████▋   | 79100/117610 [6:27:11<2:52:00,  3.73it/s] 

{'loss': 4.7304, 'learning_rate': 1.2214332059752193e-05, 'epoch': 1.35}


 67%|██████▋   | 79200/117610 [6:27:38<2:48:19,  3.80it/s]

{'loss': 4.6323, 'learning_rate': 1.2156698205436115e-05, 'epoch': 1.35}


 67%|██████▋   | 79300/117610 [6:28:04<2:51:18,  3.73it/s]

{'loss': 4.6321, 'learning_rate': 1.2099156933899258e-05, 'epoch': 1.35}


 68%|██████▊   | 79400/117610 [6:28:30<2:48:19,  3.78it/s]

{'loss': 4.6846, 'learning_rate': 1.2041708659936124e-05, 'epoch': 1.35}


 68%|██████▊   | 79500/117610 [6:29:09<42:21:06,  4.00s/it]

{'loss': 4.6305, 'learning_rate': 1.1984353797670803e-05, 'epoch': 1.35}


 68%|██████▊   | 79600/117610 [6:29:36<2:42:00,  3.91it/s] 

{'loss': 4.6786, 'learning_rate': 1.1927092760554034e-05, 'epoch': 1.35}


 68%|██████▊   | 79700/117610 [6:30:02<2:49:32,  3.73it/s]

{'loss': 4.6421, 'learning_rate': 1.1869925961360187e-05, 'epoch': 1.36}


 68%|██████▊   | 79800/117610 [6:30:29<2:45:24,  3.81it/s]

{'loss': 4.6607, 'learning_rate': 1.1812853812184334e-05, 'epoch': 1.36}


 68%|██████▊   | 79900/117610 [6:30:55<2:48:26,  3.73it/s]

{'loss': 4.6696, 'learning_rate': 1.1755876724439227e-05, 'epoch': 1.36}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6217, 'learning_rate': 1.1698995108852342e-05, 'epoch': 1.36}


 68%|██████▊   | 80100/117610 [6:32:02<2:48:56,  3.70it/s] 

{'loss': 4.6346, 'learning_rate': 1.1642209375462968e-05, 'epoch': 1.36}


 68%|██████▊   | 80200/117610 [6:32:29<2:46:49,  3.74it/s]

{'loss': 4.7033, 'learning_rate': 1.1585519933619182e-05, 'epoch': 1.36}


 68%|██████▊   | 80300/117610 [6:32:55<2:49:42,  3.66it/s]

{'loss': 4.6313, 'learning_rate': 1.152892719197494e-05, 'epoch': 1.37}


 68%|██████▊   | 80400/117610 [6:33:22<2:37:30,  3.94it/s]

{'loss': 4.7039, 'learning_rate': 1.147243155848711e-05, 'epoch': 1.37}


 68%|██████▊   | 80501/117610 [6:34:02<32:19:17,  3.14s/it]

{'loss': 4.69, 'learning_rate': 1.141603344041257e-05, 'epoch': 1.37}


 69%|██████▊   | 80600/117610 [6:34:28<2:41:00,  3.83it/s] 

{'loss': 4.7042, 'learning_rate': 1.1359733244305218e-05, 'epoch': 1.37}


 69%|██████▊   | 80700/117610 [6:34:54<2:34:52,  3.97it/s]

{'loss': 4.6295, 'learning_rate': 1.1303531376013068e-05, 'epoch': 1.37}


 69%|██████▊   | 80800/117610 [6:35:21<2:42:14,  3.78it/s]

{'loss': 4.7099, 'learning_rate': 1.1247428240675347e-05, 'epoch': 1.37}


 69%|██████▉   | 80900/117610 [6:35:47<2:44:15,  3.72it/s]

{'loss': 4.6681, 'learning_rate': 1.119142424271952e-05, 'epoch': 1.38}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5995, 'learning_rate': 1.113551978585844e-05, 'epoch': 1.38}


 69%|██████▉   | 81100/117610 [6:36:56<2:37:04,  3.87it/s] 

{'loss': 4.6056, 'learning_rate': 1.1079715273087365e-05, 'epoch': 1.38}


 69%|██████▉   | 81200/117610 [6:37:23<2:35:56,  3.89it/s]

{'loss': 4.6317, 'learning_rate': 1.1024011106681135e-05, 'epoch': 1.38}


 69%|██████▉   | 81300/117610 [6:37:49<2:35:57,  3.88it/s]

{'loss': 4.6324, 'learning_rate': 1.0968407688191192e-05, 'epoch': 1.38}


 69%|██████▉   | 81400/117610 [6:38:15<2:40:25,  3.76it/s]

{'loss': 4.634, 'learning_rate': 1.0912905418442735e-05, 'epoch': 1.38}


 69%|██████▉   | 81500/117610 [6:38:52<34:26:57,  3.43s/it]

{'loss': 4.6813, 'learning_rate': 1.0857504697531828e-05, 'epoch': 1.39}


 69%|██████▉   | 81600/117610 [6:39:19<2:42:38,  3.69it/s] 

{'loss': 4.6633, 'learning_rate': 1.0802205924822498e-05, 'epoch': 1.39}


 69%|██████▉   | 81700/117610 [6:39:45<2:39:23,  3.76it/s]

{'loss': 4.6517, 'learning_rate': 1.0747009498943855e-05, 'epoch': 1.39}


 70%|██████▉   | 81801/117610 [6:40:12<2:33:33,  3.89it/s]

{'loss': 4.6795, 'learning_rate': 1.0691915817787228e-05, 'epoch': 1.39}


 70%|██████▉   | 81900/117610 [6:40:38<2:39:16,  3.74it/s]

{'loss': 4.6529, 'learning_rate': 1.0636925278503323e-05, 'epoch': 1.39}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6441, 'learning_rate': 1.0582038277499303e-05, 'epoch': 1.39}


 70%|██████▉   | 82100/117610 [6:41:45<2:38:14,  3.74it/s] 

{'loss': 4.6646, 'learning_rate': 1.0527255210435963e-05, 'epoch': 1.4}


 70%|██████▉   | 82200/117610 [6:42:11<2:41:36,  3.65it/s]

{'loss': 4.6631, 'learning_rate': 1.0472576472224898e-05, 'epoch': 1.4}


 70%|██████▉   | 82300/117610 [6:42:39<2:38:12,  3.72it/s]

{'loss': 4.6524, 'learning_rate': 1.0418002457025613e-05, 'epoch': 1.4}


 70%|███████   | 82400/117610 [6:43:05<2:38:03,  3.71it/s]

{'loss': 4.6084, 'learning_rate': 1.0363533558242705e-05, 'epoch': 1.4}


 70%|███████   | 82500/117610 [6:43:46<43:11:05,  4.43s/it]

{'loss': 4.6654, 'learning_rate': 1.0309170168523016e-05, 'epoch': 1.4}


 70%|███████   | 82600/117610 [6:44:13<2:37:31,  3.70it/s] 

{'loss': 4.6282, 'learning_rate': 1.0254912679752846e-05, 'epoch': 1.4}


 70%|███████   | 82700/117610 [6:44:40<2:39:00,  3.66it/s]

{'loss': 4.6866, 'learning_rate': 1.0200761483055044e-05, 'epoch': 1.41}


 70%|███████   | 82800/117610 [6:45:07<2:39:45,  3.63it/s]

{'loss': 4.6445, 'learning_rate': 1.0146716968786286e-05, 'epoch': 1.41}


 70%|███████   | 82900/117610 [6:45:33<2:27:28,  3.92it/s]

{'loss': 4.6305, 'learning_rate': 1.0092779526534162e-05, 'epoch': 1.41}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6301, 'learning_rate': 1.0038949545114472e-05, 'epoch': 1.41}


 71%|███████   | 83100/117610 [6:46:41<2:30:43,  3.82it/s] 

{'loss': 4.6553, 'learning_rate': 9.985227412568327e-06, 'epoch': 1.41}


 71%|███████   | 83200/117610 [6:47:07<2:32:49,  3.75it/s]

{'loss': 4.6151, 'learning_rate': 9.9316135161594e-06, 'epoch': 1.41}


 71%|███████   | 83301/117610 [6:47:34<2:15:05,  4.23it/s]

{'loss': 4.6397, 'learning_rate': 9.878108242371156e-06, 'epoch': 1.42}


 71%|███████   | 83400/117610 [6:48:00<2:33:59,  3.70it/s]

{'loss': 4.6799, 'learning_rate': 9.82471197690401e-06, 'epoch': 1.42}


 71%|███████   | 83500/117610 [6:48:33<22:23:10,  2.36s/it]

{'loss': 4.6467, 'learning_rate': 9.771425104672577e-06, 'epoch': 1.42}


 71%|███████   | 83600/117610 [6:49:00<2:34:25,  3.67it/s] 

{'loss': 4.6667, 'learning_rate': 9.718248009802916e-06, 'epoch': 1.42}


 71%|███████   | 83700/117610 [6:49:27<2:30:55,  3.74it/s]

{'loss': 4.6398, 'learning_rate': 9.66518107562972e-06, 'epoch': 1.42}


 71%|███████▏  | 83800/117610 [6:49:54<2:31:45,  3.71it/s]

{'loss': 4.6113, 'learning_rate': 9.61222468469358e-06, 'epoch': 1.43}


 71%|███████▏  | 83900/117610 [6:50:20<2:30:12,  3.74it/s]

{'loss': 4.6351, 'learning_rate': 9.559379218738206e-06, 'epoch': 1.43}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6132, 'learning_rate': 9.506645058707731e-06, 'epoch': 1.43}


 72%|███████▏  | 84100/117610 [6:51:29<2:32:20,  3.67it/s] 

{'loss': 4.6411, 'learning_rate': 9.454022584743874e-06, 'epoch': 1.43}


 72%|███████▏  | 84200/117610 [6:51:55<2:27:27,  3.78it/s]

{'loss': 4.6506, 'learning_rate': 9.40151217618328e-06, 'epoch': 1.43}


 72%|███████▏  | 84300/117610 [6:52:22<2:25:41,  3.81it/s]

{'loss': 4.6797, 'learning_rate': 9.349114211554732e-06, 'epoch': 1.43}


 72%|███████▏  | 84400/117610 [6:52:48<2:24:48,  3.82it/s]

{'loss': 4.6347, 'learning_rate': 9.296829068576473e-06, 'epoch': 1.44}


 72%|███████▏  | 84500/117610 [6:53:29<42:13:44,  4.59s/it]

{'loss': 4.6847, 'learning_rate': 9.244657124153428e-06, 'epoch': 1.44}


 72%|███████▏  | 84600/117610 [6:53:56<2:24:24,  3.81it/s] 

{'loss': 4.6165, 'learning_rate': 9.19259875437453e-06, 'epoch': 1.44}


 72%|███████▏  | 84700/117610 [6:54:22<2:18:43,  3.95it/s]

{'loss': 4.6268, 'learning_rate': 9.140654334509999e-06, 'epoch': 1.44}


 72%|███████▏  | 84800/117610 [6:54:48<2:23:35,  3.81it/s]

{'loss': 4.614, 'learning_rate': 9.088824239008609e-06, 'epoch': 1.44}


 72%|███████▏  | 84900/117610 [6:55:15<2:19:25,  3.91it/s]

{'loss': 4.6528, 'learning_rate': 9.037108841495023e-06, 'epoch': 1.44}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6278, 'learning_rate': 8.98550851476708e-06, 'epoch': 1.45}


 72%|███████▏  | 85100/117610 [6:56:24<2:27:31,  3.67it/s] 

{'loss': 4.6102, 'learning_rate': 8.934023630793134e-06, 'epoch': 1.45}


 72%|███████▏  | 85200/117610 [6:56:50<2:23:28,  3.77it/s]

{'loss': 4.6432, 'learning_rate': 8.882654560709336e-06, 'epoch': 1.45}


 73%|███████▎  | 85300/117610 [6:57:16<2:20:18,  3.84it/s]

{'loss': 4.5969, 'learning_rate': 8.831401674816967e-06, 'epoch': 1.45}


 73%|███████▎  | 85400/117610 [6:57:43<2:27:54,  3.63it/s]

{'loss': 4.6338, 'learning_rate': 8.780265342579808e-06, 'epoch': 1.45}


 73%|███████▎  | 85500/117610 [6:58:20<28:56:42,  3.25s/it]

{'loss': 4.6602, 'learning_rate': 8.729245932621421e-06, 'epoch': 1.45}


 73%|███████▎  | 85600/117610 [6:58:46<2:25:39,  3.66it/s] 

{'loss': 4.6293, 'learning_rate': 8.678343812722531e-06, 'epoch': 1.46}


 73%|███████▎  | 85700/117610 [6:59:13<2:17:46,  3.86it/s]

{'loss': 4.5968, 'learning_rate': 8.627559349818337e-06, 'epoch': 1.46}


 73%|███████▎  | 85800/117610 [6:59:40<2:21:09,  3.76it/s]

{'loss': 4.5955, 'learning_rate': 8.576892909995932e-06, 'epoch': 1.46}


 73%|███████▎  | 85900/117610 [7:00:06<2:25:22,  3.64it/s]

{'loss': 4.5799, 'learning_rate': 8.526344858491586e-06, 'epoch': 1.46}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.601, 'learning_rate': 8.475915559688157e-06, 'epoch': 1.46}


 73%|███████▎  | 86100/117610 [7:01:14<2:19:15,  3.77it/s] 

{'loss': 4.5894, 'learning_rate': 8.425605377112483e-06, 'epoch': 1.46}


 73%|███████▎  | 86200/117610 [7:01:41<2:17:50,  3.80it/s]

{'loss': 4.5795, 'learning_rate': 8.375414673432694e-06, 'epoch': 1.47}


 73%|███████▎  | 86300/117610 [7:02:07<2:22:35,  3.66it/s]

{'loss': 4.5811, 'learning_rate': 8.32534381045568e-06, 'epoch': 1.47}


 73%|███████▎  | 86400/117610 [7:02:34<2:19:11,  3.74it/s]

{'loss': 4.6601, 'learning_rate': 8.275393149124409e-06, 'epoch': 1.47}


 74%|███████▎  | 86500/117610 [7:03:15<39:12:11,  4.54s/it]

{'loss': 4.5697, 'learning_rate': 8.225563049515383e-06, 'epoch': 1.47}


 74%|███████▎  | 86600/117610 [7:03:42<2:14:24,  3.85it/s] 

{'loss': 4.5992, 'learning_rate': 8.175853870836007e-06, 'epoch': 1.47}


 74%|███████▎  | 86700/117610 [7:04:08<2:21:06,  3.65it/s]

{'loss': 4.6116, 'learning_rate': 8.126265971421995e-06, 'epoch': 1.47}


 74%|███████▍  | 86800/117610 [7:04:34<2:12:01,  3.89it/s]

{'loss': 4.6, 'learning_rate': 8.076799708734836e-06, 'epoch': 1.48}


 74%|███████▍  | 86900/117610 [7:05:02<2:21:40,  3.61it/s]

{'loss': 4.6144, 'learning_rate': 8.02745543935916e-06, 'epoch': 1.48}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6142, 'learning_rate': 7.978233519000188e-06, 'epoch': 1.48}


 74%|███████▍  | 87100/117610 [7:06:08<2:14:09,  3.79it/s] 

{'loss': 4.6006, 'learning_rate': 7.929134302481178e-06, 'epoch': 1.48}


 74%|███████▍  | 87200/117610 [7:06:34<2:09:43,  3.91it/s]

{'loss': 4.5999, 'learning_rate': 7.88015814374087e-06, 'epoch': 1.48}


 74%|███████▍  | 87300/117610 [7:07:01<2:12:18,  3.82it/s]

{'loss': 4.6263, 'learning_rate': 7.83130539583091e-06, 'epoch': 1.48}


 74%|███████▍  | 87400/117610 [7:07:27<2:13:10,  3.78it/s]

{'loss': 4.5865, 'learning_rate': 7.782576410913311e-06, 'epoch': 1.49}


 74%|███████▍  | 87500/117610 [7:08:07<37:58:58,  4.54s/it]

{'loss': 4.6226, 'learning_rate': 7.733971540257956e-06, 'epoch': 1.49}


 74%|███████▍  | 87600/117610 [7:08:34<2:11:08,  3.81it/s] 

{'loss': 4.6333, 'learning_rate': 7.68549113424e-06, 'epoch': 1.49}


 75%|███████▍  | 87700/117610 [7:09:00<2:12:52,  3.75it/s]

{'loss': 4.6485, 'learning_rate': 7.637135542337392e-06, 'epoch': 1.49}


 75%|███████▍  | 87800/117610 [7:09:27<2:15:40,  3.66it/s]

{'loss': 4.6011, 'learning_rate': 7.588905113128325e-06, 'epoch': 1.49}


 75%|███████▍  | 87900/117610 [7:09:54<2:13:52,  3.70it/s]

{'loss': 4.5792, 'learning_rate': 7.540800194288772e-06, 'epoch': 1.49}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5863, 'learning_rate': 7.492821132589906e-06, 'epoch': 1.5}


 75%|███████▍  | 88100/117610 [7:11:01<2:08:40,  3.82it/s] 

{'loss': 4.6107, 'learning_rate': 7.4449682738956775e-06, 'epoch': 1.5}


 75%|███████▍  | 88200/117610 [7:11:27<2:11:46,  3.72it/s]

{'loss': 4.6084, 'learning_rate': 7.3972419631602535e-06, 'epoch': 1.5}


 75%|███████▌  | 88300/117610 [7:11:54<2:13:38,  3.66it/s]

{'loss': 4.637, 'learning_rate': 7.349642544425589e-06, 'epoch': 1.5}


 75%|███████▌  | 88400/117610 [7:12:21<2:08:21,  3.79it/s]

{'loss': 4.6495, 'learning_rate': 7.3021703608189e-06, 'epoch': 1.5}


 75%|███████▌  | 88500/117610 [7:12:59<28:18:59,  3.50s/it]

{'loss': 4.5486, 'learning_rate': 7.254825754550207e-06, 'epoch': 1.5}


 75%|███████▌  | 88600/117610 [7:13:25<2:05:59,  3.84it/s] 

{'loss': 4.6693, 'learning_rate': 7.207609066909898e-06, 'epoch': 1.51}


 75%|███████▌  | 88700/117610 [7:13:52<2:07:50,  3.77it/s]

{'loss': 4.5623, 'learning_rate': 7.160520638266216e-06, 'epoch': 1.51}


 76%|███████▌  | 88800/117610 [7:14:18<2:03:02,  3.90it/s]

{'loss': 4.5743, 'learning_rate': 7.1135608080628305e-06, 'epoch': 1.51}


 76%|███████▌  | 88900/117610 [7:14:44<2:04:31,  3.84it/s]

{'loss': 4.5784, 'learning_rate': 7.066729914816414e-06, 'epoch': 1.51}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6119, 'learning_rate': 7.020028296114159e-06, 'epoch': 1.51}


 76%|███████▌  | 89100/117610 [7:15:48<2:11:18,  3.62it/s] 

{'loss': 4.5686, 'learning_rate': 6.973456288611363e-06, 'epoch': 1.52}


 76%|███████▌  | 89200/117610 [7:16:15<2:09:41,  3.65it/s]

{'loss': 4.5654, 'learning_rate': 6.927014228029003e-06, 'epoch': 1.52}


 76%|███████▌  | 89300/117610 [7:16:42<2:08:59,  3.66it/s]

{'loss': 4.588, 'learning_rate': 6.8807024491513325e-06, 'epoch': 1.52}


 76%|███████▌  | 89400/117610 [7:17:09<2:09:45,  3.62it/s]

{'loss': 4.63, 'learning_rate': 6.834521285823433e-06, 'epoch': 1.52}


 76%|███████▌  | 89500/117610 [7:17:47<30:49:00,  3.95s/it]

{'loss': 4.6072, 'learning_rate': 6.788471070948829e-06, 'epoch': 1.52}


 76%|███████▌  | 89600/117610 [7:18:14<1:59:46,  3.90it/s] 

{'loss': 4.58, 'learning_rate': 6.742552136487082e-06, 'epoch': 1.52}


 76%|███████▋  | 89700/117610 [7:18:40<2:03:34,  3.76it/s]

{'loss': 4.5958, 'learning_rate': 6.696764813451412e-06, 'epoch': 1.53}


 76%|███████▋  | 89800/117610 [7:19:07<2:01:49,  3.80it/s]

{'loss': 4.5875, 'learning_rate': 6.6511094319062776e-06, 'epoch': 1.53}


 76%|███████▋  | 89900/117610 [7:19:34<2:06:38,  3.65it/s]

{'loss': 4.5632, 'learning_rate': 6.60558632096504e-06, 'epoch': 1.53}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5396, 'learning_rate': 6.56019580878757e-06, 'epoch': 1.53}


 77%|███████▋  | 90100/117610 [7:20:39<1:56:11,  3.95it/s] 

{'loss': 4.5618, 'learning_rate': 6.51493822257786e-06, 'epoch': 1.53}


 77%|███████▋  | 90200/117610 [7:21:05<2:00:41,  3.79it/s]

{'loss': 4.5489, 'learning_rate': 6.469813888581696e-06, 'epoch': 1.53}


 77%|███████▋  | 90300/117610 [7:21:32<2:00:35,  3.77it/s]

{'loss': 4.6212, 'learning_rate': 6.424823132084295e-06, 'epoch': 1.54}


 77%|███████▋  | 90400/117610 [7:21:58<2:02:17,  3.71it/s]

{'loss': 4.5531, 'learning_rate': 6.379966277407978e-06, 'epoch': 1.54}


 77%|███████▋  | 90500/117610 [7:22:39<32:39:12,  4.34s/it]

{'loss': 4.6111, 'learning_rate': 6.335243647909794e-06, 'epoch': 1.54}


 77%|███████▋  | 90600/117610 [7:23:05<1:58:00,  3.81it/s] 

{'loss': 4.571, 'learning_rate': 6.290655565979212e-06, 'epoch': 1.54}


 77%|███████▋  | 90700/117610 [7:23:32<2:00:40,  3.72it/s]

{'loss': 4.5653, 'learning_rate': 6.2462023530358185e-06, 'epoch': 1.54}


 77%|███████▋  | 90800/117610 [7:23:59<1:57:56,  3.79it/s]

{'loss': 4.5827, 'learning_rate': 6.201884329526953e-06, 'epoch': 1.54}


 77%|███████▋  | 90901/117610 [7:24:25<1:53:13,  3.93it/s]

{'loss': 4.585, 'learning_rate': 6.157701814925429e-06, 'epoch': 1.55}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6466, 'learning_rate': 6.113655127727222e-06, 'epoch': 1.55}


 77%|███████▋  | 91100/117610 [7:25:33<2:01:04,  3.65it/s] 

{'loss': 4.5772, 'learning_rate': 6.069744585449194e-06, 'epoch': 1.55}


 78%|███████▊  | 91200/117610 [7:26:00<1:51:08,  3.96it/s]

{'loss': 4.5472, 'learning_rate': 6.0259705046267715e-06, 'epoch': 1.55}


 78%|███████▊  | 91300/117610 [7:26:27<1:57:47,  3.72it/s]

{'loss': 4.5716, 'learning_rate': 5.9823332008116655e-06, 'epoch': 1.55}


 78%|███████▊  | 91400/117610 [7:26:53<1:52:10,  3.89it/s]

{'loss': 4.5861, 'learning_rate': 5.938832988569648e-06, 'epoch': 1.55}


 78%|███████▊  | 91500/117610 [7:27:34<32:02:52,  4.42s/it]

{'loss': 4.5928, 'learning_rate': 5.8954701814782076e-06, 'epoch': 1.56}


 78%|███████▊  | 91600/117610 [7:28:00<1:51:30,  3.89it/s] 

{'loss': 4.5662, 'learning_rate': 5.852245092124359e-06, 'epoch': 1.56}


 78%|███████▊  | 91700/117610 [7:28:26<1:49:10,  3.96it/s]

{'loss': 4.5557, 'learning_rate': 5.809158032102336e-06, 'epoch': 1.56}


 78%|███████▊  | 91800/117610 [7:28:53<1:53:58,  3.77it/s]

{'loss': 4.5779, 'learning_rate': 5.766209312011384e-06, 'epoch': 1.56}


 78%|███████▊  | 91900/117610 [7:29:19<1:55:32,  3.71it/s]

{'loss': 4.576, 'learning_rate': 5.7233992414535e-06, 'epoch': 1.56}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.6166, 'learning_rate': 5.680728129031193e-06, 'epoch': 1.56}


 78%|███████▊  | 92100/117610 [7:30:28<1:54:37,  3.71it/s] 

{'loss': 4.5871, 'learning_rate': 5.638196282345298e-06, 'epoch': 1.57}


 78%|███████▊  | 92200/117610 [7:30:56<1:55:25,  3.67it/s]

{'loss': 4.6167, 'learning_rate': 5.595804007992716e-06, 'epoch': 1.57}


 78%|███████▊  | 92300/117610 [7:31:22<1:50:45,  3.81it/s]

{'loss': 4.535, 'learning_rate': 5.553551611564223e-06, 'epoch': 1.57}


 79%|███████▊  | 92400/117610 [7:31:48<1:52:19,  3.74it/s]

{'loss': 4.5943, 'learning_rate': 5.511439397642257e-06, 'epoch': 1.57}


 79%|███████▊  | 92500/117610 [7:32:28<30:04:51,  4.31s/it]

{'loss': 4.5438, 'learning_rate': 5.469467669798753e-06, 'epoch': 1.57}


 79%|███████▊  | 92600/117610 [7:32:55<1:52:38,  3.70it/s] 

{'loss': 4.5806, 'learning_rate': 5.427636730592917e-06, 'epoch': 1.57}


 79%|███████▉  | 92700/117610 [7:33:22<1:53:52,  3.65it/s]

{'loss': 4.5442, 'learning_rate': 5.385946881569048e-06, 'epoch': 1.58}


 79%|███████▉  | 92800/117610 [7:33:49<1:46:32,  3.88it/s]

{'loss': 4.5789, 'learning_rate': 5.344398423254407e-06, 'epoch': 1.58}


 79%|███████▉  | 92900/117610 [7:34:16<1:46:57,  3.85it/s]

{'loss': 4.5597, 'learning_rate': 5.302991655156994e-06, 'epoch': 1.58}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5652, 'learning_rate': 5.261726875763426e-06, 'epoch': 1.58}


 79%|███████▉  | 93100/117610 [7:35:21<1:49:35,  3.73it/s] 

{'loss': 4.5901, 'learning_rate': 5.220604382536762e-06, 'epoch': 1.58}


 79%|███████▉  | 93200/117610 [7:35:47<1:48:56,  3.73it/s]

{'loss': 4.5554, 'learning_rate': 5.179624471914396e-06, 'epoch': 1.58}


 79%|███████▉  | 93300/117610 [7:36:14<1:45:50,  3.83it/s]

{'loss': 4.5404, 'learning_rate': 5.138787439305862e-06, 'epoch': 1.59}


 79%|███████▉  | 93400/117610 [7:36:40<1:50:02,  3.67it/s]

{'loss': 4.5465, 'learning_rate': 5.098093579090771e-06, 'epoch': 1.59}


 80%|███████▉  | 93500/117610 [7:37:10<8:20:48,  1.25s/it]

{'loss': 4.6354, 'learning_rate': 5.0575431846166224e-06, 'epoch': 1.59}


 80%|███████▉  | 93600/117610 [7:37:37<1:46:26,  3.76it/s]

{'loss': 4.5486, 'learning_rate': 5.0171365481967545e-06, 'epoch': 1.59}


 80%|███████▉  | 93700/117610 [7:38:04<1:45:38,  3.77it/s]

{'loss': 4.5306, 'learning_rate': 4.976873961108186e-06, 'epoch': 1.59}


 80%|███████▉  | 93800/117610 [7:38:30<1:42:02,  3.89it/s]

{'loss': 4.5122, 'learning_rate': 4.936755713589524e-06, 'epoch': 1.6}


 80%|███████▉  | 93900/117610 [7:38:57<1:46:40,  3.70it/s]

{'loss': 4.5388, 'learning_rate': 4.8967820948389174e-06, 'epoch': 1.6}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5716, 'learning_rate': 4.856953393011907e-06, 'epoch': 1.6}


 80%|████████  | 94100/117610 [7:40:06<1:46:55,  3.66it/s] 

{'loss': 4.5593, 'learning_rate': 4.817269895219395e-06, 'epoch': 1.6}


 80%|████████  | 94200/117610 [7:40:32<1:48:13,  3.61it/s]

{'loss': 4.5503, 'learning_rate': 4.777731887525544e-06, 'epoch': 1.6}


 80%|████████  | 94301/117610 [7:40:59<1:37:39,  3.98it/s]

{'loss': 4.5123, 'learning_rate': 4.738339654945759e-06, 'epoch': 1.6}


 80%|████████  | 94400/117610 [7:41:25<1:42:49,  3.76it/s]

{'loss': 4.5794, 'learning_rate': 4.699093481444577e-06, 'epoch': 1.61}


 80%|████████  | 94500/117610 [7:42:05<28:19:30,  4.41s/it]

{'loss': 4.5284, 'learning_rate': 4.659993649933653e-06, 'epoch': 1.61}


 80%|████████  | 94600/117610 [7:42:32<1:41:36,  3.77it/s] 

{'loss': 4.5566, 'learning_rate': 4.62104044226973e-06, 'epoch': 1.61}


 81%|████████  | 94700/117610 [7:42:58<1:41:03,  3.78it/s]

{'loss': 4.5726, 'learning_rate': 4.582234139252573e-06, 'epoch': 1.61}


 81%|████████  | 94800/117610 [7:43:24<1:39:33,  3.82it/s]

{'loss': 4.5334, 'learning_rate': 4.543575020622975e-06, 'epoch': 1.61}


 81%|████████  | 94900/117610 [7:43:51<1:44:26,  3.62it/s]

{'loss': 4.5723, 'learning_rate': 4.5050633650607164e-06, 'epoch': 1.61}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5571, 'learning_rate': 4.46669945018259e-06, 'epoch': 1.62}


 81%|████████  | 95100/117610 [7:45:01<1:40:32,  3.73it/s] 

{'loss': 4.5822, 'learning_rate': 4.428483552540361e-06, 'epoch': 1.62}


 81%|████████  | 95200/117610 [7:45:27<1:40:47,  3.71it/s]

{'loss': 4.5135, 'learning_rate': 4.390415947618795e-06, 'epoch': 1.62}


 81%|████████  | 95300/117610 [7:45:53<1:37:26,  3.82it/s]

{'loss': 4.5513, 'learning_rate': 4.352496909833684e-06, 'epoch': 1.62}


 81%|████████  | 95400/117610 [7:46:20<1:41:39,  3.64it/s]

{'loss': 4.5494, 'learning_rate': 4.314726712529829e-06, 'epoch': 1.62}


 81%|████████  | 95500/117610 [7:47:01<27:26:50,  4.47s/it]

{'loss': 4.5488, 'learning_rate': 4.277105627979105e-06, 'epoch': 1.62}


 81%|████████▏ | 95600/117610 [7:47:28<1:38:28,  3.72it/s] 

{'loss': 4.5261, 'learning_rate': 4.2396339273784755e-06, 'epoch': 1.63}


 81%|████████▏ | 95700/117610 [7:47:54<1:34:55,  3.85it/s]

{'loss': 4.5685, 'learning_rate': 4.202311880848075e-06, 'epoch': 1.63}


 81%|████████▏ | 95800/117610 [7:48:20<1:36:27,  3.77it/s]

{'loss': 4.5619, 'learning_rate': 4.165139757429207e-06, 'epoch': 1.63}


 82%|████████▏ | 95900/117610 [7:48:46<1:32:08,  3.93it/s]

{'loss': 4.5741, 'learning_rate': 4.128117825082439e-06, 'epoch': 1.63}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5571, 'learning_rate': 4.091246350685682e-06, 'epoch': 1.63}


 82%|████████▏ | 96100/117610 [7:49:55<1:34:14,  3.80it/s] 

{'loss': 4.5817, 'learning_rate': 4.05452560003223e-06, 'epoch': 1.63}


 82%|████████▏ | 96200/117610 [7:50:22<1:34:21,  3.78it/s]

{'loss': 4.5318, 'learning_rate': 4.0179558378288684e-06, 'epoch': 1.64}


 82%|████████▏ | 96300/117610 [7:50:49<1:32:04,  3.86it/s]

{'loss': 4.5837, 'learning_rate': 3.98153732769396e-06, 'epoch': 1.64}


 82%|████████▏ | 96400/117610 [7:51:16<1:35:31,  3.70it/s]

{'loss': 4.5981, 'learning_rate': 3.945270332155554e-06, 'epoch': 1.64}


 82%|████████▏ | 96500/117610 [7:51:57<26:53:29,  4.59s/it]

{'loss': 4.5648, 'learning_rate': 3.9091551126494765e-06, 'epoch': 1.64}


 82%|████████▏ | 96600/117610 [7:52:23<1:34:01,  3.72it/s] 

{'loss': 4.5196, 'learning_rate': 3.87319192951745e-06, 'epoch': 1.64}


 82%|████████▏ | 96700/117610 [7:52:50<1:27:18,  3.99it/s]

{'loss': 4.56, 'learning_rate': 3.837381042005242e-06, 'epoch': 1.64}


 82%|████████▏ | 96800/117610 [7:53:16<1:33:34,  3.71it/s]

{'loss': 4.4777, 'learning_rate': 3.801722708260741e-06, 'epoch': 1.65}


 82%|████████▏ | 96900/117610 [7:53:43<1:31:42,  3.76it/s]

{'loss': 4.5225, 'learning_rate': 3.7662171853321686e-06, 'epoch': 1.65}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5282, 'learning_rate': 3.730864729166153e-06, 'epoch': 1.65}


 83%|████████▎ | 97101/117610 [7:54:48<1:23:04,  4.11it/s] 

{'loss': 4.5569, 'learning_rate': 3.6956655946059487e-06, 'epoch': 1.65}


 83%|████████▎ | 97200/117610 [7:55:14<1:30:30,  3.76it/s]

{'loss': 4.605, 'learning_rate': 3.6606200353895476e-06, 'epoch': 1.65}


 83%|████████▎ | 97300/117610 [7:55:41<1:29:12,  3.79it/s]

{'loss': 4.5605, 'learning_rate': 3.625728304147874e-06, 'epoch': 1.65}


 83%|████████▎ | 97400/117610 [7:56:07<1:29:54,  3.75it/s]

{'loss': 4.5232, 'learning_rate': 3.5909906524029784e-06, 'epoch': 1.66}


 83%|████████▎ | 97500/117610 [7:56:46<23:15:23,  4.16s/it]

{'loss': 4.5617, 'learning_rate': 3.5564073305661887e-06, 'epoch': 1.66}


 83%|████████▎ | 97600/117610 [7:57:13<1:28:27,  3.77it/s] 

{'loss': 4.5483, 'learning_rate': 3.521978587936331e-06, 'epoch': 1.66}


 83%|████████▎ | 97700/117610 [7:57:39<1:27:42,  3.78it/s]

{'loss': 4.5593, 'learning_rate': 3.4877046726979197e-06, 'epoch': 1.66}


 83%|████████▎ | 97800/117610 [7:58:06<1:32:23,  3.57it/s]

{'loss': 4.5482, 'learning_rate': 3.453585831919387e-06, 'epoch': 1.66}


 83%|████████▎ | 97900/117610 [7:58:33<1:29:31,  3.67it/s]

{'loss': 4.5335, 'learning_rate': 3.4196223115512766e-06, 'epoch': 1.66}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5482, 'learning_rate': 3.3858143564244773e-06, 'epoch': 1.67}


 83%|████████▎ | 98100/117610 [7:59:40<1:27:34,  3.71it/s] 

{'loss': 4.533, 'learning_rate': 3.3521622102484883e-06, 'epoch': 1.67}


 83%|████████▎ | 98200/117610 [8:00:07<1:25:08,  3.80it/s]

{'loss': 4.5627, 'learning_rate': 3.3186661156096114e-06, 'epoch': 1.67}


 84%|████████▎ | 98300/117610 [8:00:33<1:28:41,  3.63it/s]

{'loss': 4.569, 'learning_rate': 3.2853263139692447e-06, 'epoch': 1.67}


 84%|████████▎ | 98400/117610 [8:01:00<1:27:04,  3.68it/s]

{'loss': 4.5494, 'learning_rate': 3.2521430456621095e-06, 'epoch': 1.67}


 84%|████████▍ | 98500/117610 [8:01:41<23:48:57,  4.49s/it]

{'loss': 4.5416, 'learning_rate': 3.2191165498945574e-06, 'epoch': 1.68}


 84%|████████▍ | 98600/117610 [8:02:08<1:23:49,  3.78it/s] 

{'loss': 4.4949, 'learning_rate': 3.186247064742803e-06, 'epoch': 1.68}


 84%|████████▍ | 98700/117610 [8:02:34<1:23:29,  3.77it/s]

{'loss': 4.5329, 'learning_rate': 3.1535348271512388e-06, 'epoch': 1.68}


 84%|████████▍ | 98800/117610 [8:03:00<1:24:47,  3.70it/s]

{'loss': 4.5344, 'learning_rate': 3.120980072930704e-06, 'epoch': 1.68}


 84%|████████▍ | 98900/117610 [8:03:27<1:21:05,  3.85it/s]

{'loss': 4.515, 'learning_rate': 3.0885830367568174e-06, 'epoch': 1.68}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5801, 'learning_rate': 3.056343952168242e-06, 'epoch': 1.68}


 84%|████████▍ | 99100/117610 [8:04:34<1:24:16,  3.66it/s] 

{'loss': 4.5507, 'learning_rate': 3.0242630515650293e-06, 'epoch': 1.69}


 84%|████████▍ | 99200/117610 [8:05:01<1:24:59,  3.61it/s]

{'loss': 4.5548, 'learning_rate': 2.9923405662069527e-06, 'epoch': 1.69}


 84%|████████▍ | 99300/117610 [8:05:28<1:22:11,  3.71it/s]

{'loss': 4.5588, 'learning_rate': 2.960576726211814e-06, 'epoch': 1.69}


 85%|████████▍ | 99400/117610 [8:05:55<1:21:52,  3.71it/s]

{'loss': 4.5511, 'learning_rate': 2.9289717605537947e-06, 'epoch': 1.69}


 85%|████████▍ | 99500/117610 [8:06:31<16:50:00,  3.35s/it]

{'loss': 4.5875, 'learning_rate': 2.897525897061815e-06, 'epoch': 1.69}


 85%|████████▍ | 99600/117610 [8:06:58<1:17:17,  3.88it/s] 

{'loss': 4.5543, 'learning_rate': 2.8662393624178884e-06, 'epoch': 1.69}


 85%|████████▍ | 99700/117610 [8:07:25<1:20:43,  3.70it/s]

{'loss': 4.5337, 'learning_rate': 2.8351123821554775e-06, 'epoch': 1.7}


 85%|████████▍ | 99800/117610 [8:07:51<1:20:12,  3.70it/s]

{'loss': 4.5514, 'learning_rate': 2.8041451806578633e-06, 'epoch': 1.7}


 85%|████████▍ | 99900/117610 [8:08:19<1:20:37,  3.66it/s]

{'loss': 4.5363, 'learning_rate': 2.7733379811565624e-06, 'epoch': 1.7}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5237, 'learning_rate': 2.7426910057296777e-06, 'epoch': 1.7}


 85%|████████▌ | 100100/117610 [8:09:28<1:17:23,  3.77it/s] 

{'loss': 4.5357, 'learning_rate': 2.71220447530032e-06, 'epoch': 1.7}


 85%|████████▌ | 100200/117610 [8:09:55<1:16:38,  3.79it/s]

{'loss': 4.5982, 'learning_rate': 2.681878609634997e-06, 'epoch': 1.7}


 85%|████████▌ | 100300/117610 [8:10:21<1:15:39,  3.81it/s]

{'loss': 4.5275, 'learning_rate': 2.6517136273420674e-06, 'epoch': 1.71}


 85%|████████▌ | 100400/117610 [8:10:47<1:13:35,  3.90it/s]

{'loss': 4.5428, 'learning_rate': 2.6217097458701103e-06, 'epoch': 1.71}


 85%|████████▌ | 100500/117610 [8:11:26<19:46:58,  4.16s/it]

{'loss': 4.5904, 'learning_rate': 2.591867181506405e-06, 'epoch': 1.71}


 86%|████████▌ | 100600/117610 [8:11:54<1:16:52,  3.69it/s] 

{'loss': 4.5713, 'learning_rate': 2.5621861493753536e-06, 'epoch': 1.71}


 86%|████████▌ | 100700/117610 [8:12:21<1:14:02,  3.81it/s]

{'loss': 4.5036, 'learning_rate': 2.5326668634369196e-06, 'epoch': 1.71}


 86%|████████▌ | 100800/117610 [8:12:47<1:15:24,  3.72it/s]

{'loss': 4.509, 'learning_rate': 2.503309536485102e-06, 'epoch': 1.71}


 86%|████████▌ | 100900/117610 [8:13:14<1:14:01,  3.76it/s]

{'loss': 4.4918, 'learning_rate': 2.4741143801463836e-06, 'epoch': 1.72}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5403, 'learning_rate': 2.445081604878244e-06, 'epoch': 1.72}


 86%|████████▌ | 101100/117610 [8:14:21<1:15:01,  3.67it/s] 

{'loss': 4.491, 'learning_rate': 2.41621141996759e-06, 'epoch': 1.72}


 86%|████████▌ | 101200/117610 [8:14:48<1:12:26,  3.78it/s]

{'loss': 4.5269, 'learning_rate': 2.387504033529278e-06, 'epoch': 1.72}


 86%|████████▌ | 101300/117610 [8:15:15<1:12:34,  3.75it/s]

{'loss': 4.5401, 'learning_rate': 2.358959652504622e-06, 'epoch': 1.72}


 86%|████████▌ | 101400/117610 [8:15:41<1:11:54,  3.76it/s]

{'loss': 4.5207, 'learning_rate': 2.3305784826598732e-06, 'epoch': 1.72}


 86%|████████▋ | 101500/117610 [8:16:23<21:01:27,  4.70s/it]

{'loss': 4.5022, 'learning_rate': 2.302360728584757e-06, 'epoch': 1.73}


 86%|████████▋ | 101601/117610 [8:16:49<1:06:39,  4.00it/s] 

{'loss': 4.535, 'learning_rate': 2.2743065936909863e-06, 'epoch': 1.73}


 86%|████████▋ | 101700/117610 [8:17:16<1:09:17,  3.83it/s]

{'loss': 4.5349, 'learning_rate': 2.2464162802108197e-06, 'epoch': 1.73}


 87%|████████▋ | 101800/117610 [8:17:42<1:11:44,  3.67it/s]

{'loss': 4.5443, 'learning_rate': 2.218689989195566e-06, 'epoch': 1.73}


 87%|████████▋ | 101900/117610 [8:18:08<1:06:41,  3.93it/s]

{'loss': 4.5308, 'learning_rate': 2.1911279205141606e-06, 'epoch': 1.73}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.574, 'learning_rate': 2.163730272851727e-06, 'epoch': 1.73}


 87%|████████▋ | 102100/117610 [8:19:16<1:09:08,  3.74it/s] 

{'loss': 4.5, 'learning_rate': 2.1364972437081217e-06, 'epoch': 1.74}


 87%|████████▋ | 102200/117610 [8:19:43<1:10:15,  3.66it/s]

{'loss': 4.5665, 'learning_rate': 2.1094290293965425e-06, 'epoch': 1.74}


 87%|████████▋ | 102300/117610 [8:20:10<1:07:59,  3.75it/s]

{'loss': 4.5274, 'learning_rate': 2.082525825042078e-06, 'epoch': 1.74}


 87%|████████▋ | 102400/117610 [8:20:36<1:09:17,  3.66it/s]

{'loss': 4.5532, 'learning_rate': 2.055787824580335e-06, 'epoch': 1.74}


 87%|████████▋ | 102500/117610 [8:21:14<14:47:14,  3.52s/it]

{'loss': 4.5199, 'learning_rate': 2.0292152207560143e-06, 'epoch': 1.74}


 87%|████████▋ | 102600/117610 [8:21:40<59:41,  4.19it/s]   

{'loss': 4.5417, 'learning_rate': 2.0028082051215256e-06, 'epoch': 1.74}


 87%|████████▋ | 102700/117610 [8:22:03<59:10,  4.20it/s]  

{'loss': 4.554, 'learning_rate': 1.976566968035634e-06, 'epoch': 1.75}


 87%|████████▋ | 102800/117610 [8:22:27<56:35,  4.36it/s]  

{'loss': 4.5785, 'learning_rate': 1.9504916986620464e-06, 'epoch': 1.75}


 87%|████████▋ | 102900/117610 [8:22:52<1:01:32,  3.98it/s]

{'loss': 4.4952, 'learning_rate': 1.9245825849680736e-06, 'epoch': 1.75}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5845, 'learning_rate': 1.898839813723266e-06, 'epoch': 1.75}


 88%|████████▊ | 103100/117610 [8:24:03<1:03:46,  3.79it/s] 

{'loss': 4.5375, 'learning_rate': 1.873263570498085e-06, 'epoch': 1.75}


 88%|████████▊ | 103200/117610 [8:24:30<1:05:04,  3.69it/s]

{'loss': 4.5377, 'learning_rate': 1.8478540396625332e-06, 'epoch': 1.75}


 88%|████████▊ | 103300/117610 [8:24:57<1:05:59,  3.61it/s]

{'loss': 4.5613, 'learning_rate': 1.8226114043848496e-06, 'epoch': 1.76}


 88%|████████▊ | 103400/117610 [8:25:25<1:08:10,  3.47it/s]

{'loss': 4.5782, 'learning_rate': 1.7975358466301884e-06, 'epoch': 1.76}


 88%|████████▊ | 103500/117610 [8:26:11<21:31:54,  5.49s/it]

{'loss': 4.4669, 'learning_rate': 1.7726275471592973e-06, 'epoch': 1.76}


 88%|████████▊ | 103600/117610 [8:26:36<1:01:59,  3.77it/s] 

{'loss': 4.6037, 'learning_rate': 1.7478866855272114e-06, 'epoch': 1.76}


 88%|████████▊ | 103700/117610 [8:27:01<54:44,  4.23it/s]  

{'loss': 4.5202, 'learning_rate': 1.7233134400819751e-06, 'epoch': 1.76}


 88%|████████▊ | 103800/117610 [8:27:28<1:02:44,  3.67it/s]

{'loss': 4.5387, 'learning_rate': 1.6989079879633496e-06, 'epoch': 1.77}


 88%|████████▊ | 103900/117610 [8:27:55<1:04:12,  3.56it/s]

{'loss': 4.5194, 'learning_rate': 1.674670505101522e-06, 'epoch': 1.77}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5355, 'learning_rate': 1.6506011662158677e-06, 'epoch': 1.77}


 89%|████████▊ | 104100/117610 [8:29:07<1:01:05,  3.69it/s] 

{'loss': 4.5059, 'learning_rate': 1.6267001448136477e-06, 'epoch': 1.77}


 89%|████████▊ | 104200/117610 [8:29:34<1:01:08,  3.66it/s]

{'loss': 4.5102, 'learning_rate': 1.6029676131888078e-06, 'epoch': 1.77}


 89%|████████▊ | 104300/117610 [8:30:02<1:00:54,  3.64it/s]

{'loss': 4.5613, 'learning_rate': 1.579403742420696e-06, 'epoch': 1.77}


 89%|████████▉ | 104400/117610 [8:30:29<1:01:32,  3.58it/s]

{'loss': 4.4987, 'learning_rate': 1.556008702372838e-06, 'epoch': 1.78}


 89%|████████▉ | 104500/117610 [8:31:10<15:26:09,  4.24s/it]

{'loss': 4.5295, 'learning_rate': 1.5327826616917424e-06, 'epoch': 1.78}


 89%|████████▉ | 104600/117610 [8:31:37<59:43,  3.63it/s]   

{'loss': 4.5877, 'learning_rate': 1.509725787805641e-06, 'epoch': 1.78}


 89%|████████▉ | 104700/117610 [8:32:05<59:12,  3.63it/s]  

{'loss': 4.4706, 'learning_rate': 1.4868382469233054e-06, 'epoch': 1.78}


 89%|████████▉ | 104800/117610 [8:32:33<59:07,  3.61it/s]  

{'loss': 4.5251, 'learning_rate': 1.464120204032854e-06, 'epoch': 1.78}


 89%|████████▉ | 104900/117610 [8:33:01<58:26,  3.63it/s]  

{'loss': 4.491, 'learning_rate': 1.441571822900553e-06, 'epoch': 1.78}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5475, 'learning_rate': 1.4191932660696323e-06, 'epoch': 1.79}


 89%|████████▉ | 105100/117610 [8:34:13<56:03,  3.72it/s]   

{'loss': 4.5531, 'learning_rate': 1.3969846948591187e-06, 'epoch': 1.79}


 89%|████████▉ | 105200/117610 [8:34:41<57:09,  3.62it/s]

{'loss': 4.5621, 'learning_rate': 1.374946269362684e-06, 'epoch': 1.79}


 90%|████████▉ | 105300/117610 [8:35:09<56:48,  3.61it/s]

{'loss': 4.5358, 'learning_rate': 1.3530781484474675e-06, 'epoch': 1.79}


 90%|████████▉ | 105400/117610 [8:35:36<54:33,  3.73it/s]

{'loss': 4.5492, 'learning_rate': 1.331380489752948e-06, 'epoch': 1.79}


 90%|████████▉ | 105500/117610 [8:36:16<13:34:16,  4.03s/it]

{'loss': 4.4508, 'learning_rate': 1.3098534496898045e-06, 'epoch': 1.79}


 90%|████████▉ | 105600/117610 [8:36:44<54:55,  3.64it/s]   

{'loss': 4.5335, 'learning_rate': 1.288497183438786e-06, 'epoch': 1.8}


 90%|████████▉ | 105700/117610 [8:37:12<53:41,  3.70it/s]

{'loss': 4.5257, 'learning_rate': 1.2673118449496007e-06, 'epoch': 1.8}


 90%|████████▉ | 105800/117610 [8:37:39<53:08,  3.70it/s]

{'loss': 4.4622, 'learning_rate': 1.2462975869397847e-06, 'epoch': 1.8}


 90%|█████████ | 105900/117610 [8:38:07<52:27,  3.72it/s]

{'loss': 4.5095, 'learning_rate': 1.2254545608936296e-06, 'epoch': 1.8}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5219, 'learning_rate': 1.204782917061073e-06, 'epoch': 1.8}


 90%|█████████ | 106100/117610 [8:39:18<53:52,  3.56it/s]   

{'loss': 4.544, 'learning_rate': 1.184282804456613e-06, 'epoch': 1.8}


 90%|█████████ | 106200/117610 [8:39:46<53:04,  3.58it/s]

{'loss': 4.5348, 'learning_rate': 1.1639543708582473e-06, 'epoch': 1.81}


 90%|█████████ | 106300/117610 [8:40:14<52:00,  3.62it/s]

{'loss': 4.5707, 'learning_rate': 1.1437977628064007e-06, 'epoch': 1.81}


 90%|█████████ | 106400/117610 [8:40:39<44:28,  4.20it/s]

{'loss': 4.4988, 'learning_rate': 1.1238131256028683e-06, 'epoch': 1.81}


 91%|█████████ | 106500/117610 [8:41:16<12:52:17,  4.17s/it]

{'loss': 4.5622, 'learning_rate': 1.1040006033097628e-06, 'epoch': 1.81}


 91%|█████████ | 106600/117610 [8:41:40<44:12,  4.15it/s]   

{'loss': 4.5616, 'learning_rate': 1.084360338748494e-06, 'epoch': 1.81}


 91%|█████████ | 106700/117610 [8:42:03<43:33,  4.17it/s]

{'loss': 4.5805, 'learning_rate': 1.064892473498716e-06, 'epoch': 1.81}


 91%|█████████ | 106800/117610 [8:42:27<43:27,  4.15it/s]

{'loss': 4.5553, 'learning_rate': 1.0455971478973204e-06, 'epoch': 1.82}


 91%|█████████ | 106900/117610 [8:42:51<42:49,  4.17it/s]

{'loss': 4.5176, 'learning_rate': 1.026474501037422e-06, 'epoch': 1.82}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5443, 'learning_rate': 1.0075246707673653e-06, 'epoch': 1.82}


 91%|█████████ | 107101/117610 [8:43:55<40:34,  4.32it/s]   

{'loss': 4.5184, 'learning_rate': 9.887477936897065e-07, 'epoch': 1.82}


 91%|█████████ | 107201/117610 [8:44:18<40:11,  4.32it/s]

{'loss': 4.5642, 'learning_rate': 9.701440051602534e-07, 'epoch': 1.82}


 91%|█████████ | 107300/117610 [8:44:42<41:45,  4.11it/s]

{'loss': 4.5221, 'learning_rate': 9.51713439287083e-07, 'epoch': 1.82}


 91%|█████████▏| 107400/117610 [8:45:05<41:12,  4.13it/s]

{'loss': 4.4932, 'learning_rate': 9.334562289295606e-07, 'epoch': 1.83}


 91%|█████████▏| 107501/117610 [8:45:44<8:57:23,  3.19s/it] 

{'loss': 4.5375, 'learning_rate': 9.153725056974039e-07, 'epoch': 1.83}


 91%|█████████▏| 107600/117610 [8:46:07<39:20,  4.24it/s]  

{'loss': 4.5134, 'learning_rate': 8.974623999497195e-07, 'epoch': 1.83}


 92%|█████████▏| 107700/117610 [8:46:31<39:04,  4.23it/s]

{'loss': 4.5275, 'learning_rate': 8.797260407940711e-07, 'epoch': 1.83}


 92%|█████████▏| 107800/117610 [8:46:55<38:34,  4.24it/s]

{'loss': 4.53, 'learning_rate': 8.621635560855379e-07, 'epoch': 1.83}


 92%|█████████▏| 107900/117610 [8:47:18<38:12,  4.24it/s]

{'loss': 4.5025, 'learning_rate': 8.447750724258019e-07, 'epoch': 1.83}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4823, 'learning_rate': 8.275607151622455e-07, 'epoch': 1.84}


 92%|█████████▏| 108100/117610 [8:48:20<37:45,  4.20it/s]   

{'loss': 4.5387, 'learning_rate': 8.105206083870192e-07, 'epoch': 1.84}


 92%|█████████▏| 108200/117610 [8:48:44<36:51,  4.25it/s]

{'loss': 4.5, 'learning_rate': 7.936548749361756e-07, 'epoch': 1.84}


 92%|█████████▏| 108300/117610 [8:49:08<36:56,  4.20it/s]

{'loss': 4.5507, 'learning_rate': 7.769636363887667e-07, 'epoch': 1.84}


 92%|█████████▏| 108400/117610 [8:49:32<35:11,  4.36it/s]

{'loss': 4.5599, 'learning_rate': 7.604470130659735e-07, 'epoch': 1.84}


 92%|█████████▏| 108500/117610 [8:50:06<8:28:25,  3.35s/it]

{'loss': 4.4982, 'learning_rate': 7.441051240302421e-07, 'epoch': 1.85}


 92%|█████████▏| 108601/117610 [8:50:30<34:10,  4.39it/s]  

{'loss': 4.5044, 'learning_rate': 7.279380870844149e-07, 'epoch': 1.85}


 92%|█████████▏| 108700/117610 [8:50:53<34:22,  4.32it/s]

{'loss': 4.5037, 'learning_rate': 7.119460187708926e-07, 'epoch': 1.85}


 93%|█████████▎| 108800/117610 [8:51:17<35:35,  4.12it/s]

{'loss': 4.5584, 'learning_rate': 6.961290343707933e-07, 'epoch': 1.85}


 93%|█████████▎| 108900/117610 [8:51:41<35:02,  4.14it/s]

{'loss': 4.5334, 'learning_rate': 6.804872479031082e-07, 'epoch': 1.85}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4666, 'learning_rate': 6.650207721238922e-07, 'epoch': 1.85}


 93%|█████████▎| 109100/117610 [8:52:39<33:45,  4.20it/s]  

{'loss': 4.5239, 'learning_rate': 6.497297185254547e-07, 'epoch': 1.86}


 93%|█████████▎| 109200/117610 [8:53:03<33:11,  4.22it/s]

{'loss': 4.5601, 'learning_rate': 6.346141973355396e-07, 'epoch': 1.86}


 93%|█████████▎| 109300/117610 [8:53:27<32:59,  4.20it/s]

{'loss': 4.5693, 'learning_rate': 6.196743175165498e-07, 'epoch': 1.86}


 93%|█████████▎| 109400/117610 [8:53:50<33:08,  4.13it/s]

{'loss': 4.5472, 'learning_rate': 6.0491018676474e-07, 'epoch': 1.86}


 93%|█████████▎| 109500/117610 [8:54:27<9:19:57,  4.14s/it]

{'loss': 4.5658, 'learning_rate': 5.903219115094672e-07, 'epoch': 1.86}


 93%|█████████▎| 109600/117610 [8:54:51<30:32,  4.37it/s]  

{'loss': 4.5437, 'learning_rate': 5.759095969123996e-07, 'epoch': 1.86}


 93%|█████████▎| 109701/117610 [8:55:15<30:49,  4.28it/s]

{'loss': 4.514, 'learning_rate': 5.616733468667673e-07, 'epoch': 1.87}


 93%|█████████▎| 109800/117610 [8:55:39<29:59,  4.34it/s]

{'loss': 4.5024, 'learning_rate': 5.476132639966186e-07, 'epoch': 1.87}


 93%|█████████▎| 109900/117610 [8:56:02<30:09,  4.26it/s]

{'loss': 4.5455, 'learning_rate': 5.337294496560729e-07, 'epoch': 1.87}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4923, 'learning_rate': 5.200220039285942e-07, 'epoch': 1.87}


 94%|█████████▎| 110101/117610 [8:57:02<29:32,  4.24it/s]  

{'loss': 4.4792, 'learning_rate': 5.0649102562626e-07, 'epoch': 1.87}


 94%|█████████▎| 110200/117610 [8:57:26<29:47,  4.14it/s]

{'loss': 4.4788, 'learning_rate': 4.931366122890746e-07, 'epoch': 1.87}


 94%|█████████▍| 110300/117610 [8:57:50<28:13,  4.32it/s]

{'loss': 4.4711, 'learning_rate': 4.79958860184232e-07, 'epoch': 1.88}


 94%|█████████▍| 110400/117610 [8:58:13<27:07,  4.43it/s]

{'loss': 4.5353, 'learning_rate': 4.669578643054423e-07, 'epoch': 1.88}


 94%|█████████▍| 110500/117610 [8:58:49<7:29:49,  3.80s/it]

{'loss': 4.4719, 'learning_rate': 4.541337183722516e-07, 'epoch': 1.88}


 94%|█████████▍| 110600/117610 [8:59:13<28:23,  4.11it/s]  

{'loss': 4.5217, 'learning_rate': 4.414865148293451e-07, 'epoch': 1.88}


 94%|█████████▍| 110700/117610 [8:59:37<26:25,  4.36it/s]

{'loss': 4.5797, 'learning_rate': 4.290163448459061e-07, 'epoch': 1.88}


 94%|█████████▍| 110800/117610 [9:00:01<27:25,  4.14it/s]

{'loss': 4.559, 'learning_rate': 4.167232983149305e-07, 'epoch': 1.88}


 94%|█████████▍| 110900/117610 [9:00:25<26:36,  4.20it/s]

{'loss': 4.5213, 'learning_rate': 4.0460746385261037e-07, 'epoch': 1.89}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4716, 'learning_rate': 3.926689287976737e-07, 'epoch': 1.89}


 94%|█████████▍| 111100/117610 [9:01:27<25:53,  4.19it/s]  

{'loss': 4.5335, 'learning_rate': 3.8090777921075406e-07, 'epoch': 1.89}


 95%|█████████▍| 111201/117610 [9:01:51<24:46,  4.31it/s]

{'loss': 4.5268, 'learning_rate': 3.6932409987378846e-07, 'epoch': 1.89}


 95%|█████████▍| 111300/117610 [9:02:14<23:35,  4.46it/s]

{'loss': 4.5343, 'learning_rate': 3.579179742893818e-07, 'epoch': 1.89}


 95%|█████████▍| 111400/117610 [9:02:38<23:46,  4.35it/s]

{'loss': 4.5309, 'learning_rate': 3.4668948468022376e-07, 'epoch': 1.89}


 95%|█████████▍| 111501/117610 [9:03:15<5:01:14,  2.96s/it]

{'loss': 4.5122, 'learning_rate': 3.3563871198848686e-07, 'epoch': 1.9}


 95%|█████████▍| 111600/117610 [9:03:39<23:47,  4.21it/s]  

{'loss': 4.5016, 'learning_rate': 3.247657358752543e-07, 'epoch': 1.9}


 95%|█████████▍| 111700/117610 [9:04:02<22:11,  4.44it/s]

{'loss': 4.5265, 'learning_rate': 3.140706347199235e-07, 'epoch': 1.9}


 95%|█████████▌| 111800/117610 [9:04:26<23:17,  4.16it/s]

{'loss': 4.5383, 'learning_rate': 3.0355348561966477e-07, 'epoch': 1.9}


 95%|█████████▌| 111900/117610 [9:04:50<22:39,  4.20it/s]

{'loss': 4.5278, 'learning_rate': 2.932143643888552e-07, 'epoch': 1.9}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5308, 'learning_rate': 2.83053345558526e-07, 'epoch': 1.9}


 95%|█████████▌| 112100/117610 [9:05:47<21:59,  4.18it/s]  

{'loss': 4.526, 'learning_rate': 2.730705023758412e-07, 'epoch': 1.91}


 95%|█████████▌| 112200/117610 [9:06:10<21:20,  4.22it/s]

{'loss': 4.5373, 'learning_rate': 2.6326590680354477e-07, 'epoch': 1.91}


 95%|█████████▌| 112300/117610 [9:06:34<21:07,  4.19it/s]

{'loss': 4.5356, 'learning_rate': 2.536396295194782e-07, 'epoch': 1.91}


 96%|█████████▌| 112401/117610 [9:06:58<20:27,  4.24it/s]

{'loss': 4.5049, 'learning_rate': 2.441917399160332e-07, 'epoch': 1.91}


 96%|█████████▌| 112500/117610 [9:07:33<5:08:51,  3.63s/it]

{'loss': 4.5214, 'learning_rate': 2.3492230609967192e-07, 'epoch': 1.91}


 96%|█████████▌| 112600/117610 [9:07:57<20:07,  4.15it/s]  

{'loss': 4.5503, 'learning_rate': 2.2583139489044114e-07, 'epoch': 1.91}


 96%|█████████▌| 112701/117610 [9:08:21<18:23,  4.45it/s]

{'loss': 4.5361, 'learning_rate': 2.1691907182146976e-07, 'epoch': 1.92}


 96%|█████████▌| 112800/117610 [9:08:45<18:25,  4.35it/s]

{'loss': 4.5091, 'learning_rate': 2.0818540113852203e-07, 'epoch': 1.92}


 96%|█████████▌| 112900/117610 [9:09:08<18:02,  4.35it/s]

{'loss': 4.5325, 'learning_rate': 1.9963044579950351e-07, 'epoch': 1.92}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4892, 'learning_rate': 1.9125426747403917e-07, 'epoch': 1.92}


 96%|█████████▌| 113100/117610 [9:10:10<18:10,  4.14it/s]  

{'loss': 4.5318, 'learning_rate': 1.8305692654300987e-07, 'epoch': 1.92}


 96%|█████████▋| 113200/117610 [9:10:34<17:31,  4.19it/s]

{'loss': 4.5012, 'learning_rate': 1.7503848209811935e-07, 'epoch': 1.93}


 96%|█████████▋| 113300/117610 [9:10:57<17:22,  4.14it/s]

{'loss': 4.5168, 'learning_rate': 1.6719899194147236e-07, 'epoch': 1.93}


 96%|█████████▋| 113401/117610 [9:11:21<15:53,  4.42it/s]

{'loss': 4.5119, 'learning_rate': 1.595385125851584e-07, 'epoch': 1.93}


 97%|█████████▋| 113500/117610 [9:11:57<4:28:44,  3.92s/it]

{'loss': 4.4955, 'learning_rate': 1.520570992508352e-07, 'epoch': 1.93}


 97%|█████████▋| 113600/117610 [9:12:21<15:59,  4.18it/s]  

{'loss': 4.4664, 'learning_rate': 1.4475480586934042e-07, 'epoch': 1.93}


 97%|█████████▋| 113700/117610 [9:12:45<15:46,  4.13it/s]

{'loss': 4.4708, 'learning_rate': 1.3763168508030833e-07, 'epoch': 1.93}


 97%|█████████▋| 113800/117610 [9:13:09<14:40,  4.33it/s]

{'loss': 4.4728, 'learning_rate': 1.3068778823176752e-07, 'epoch': 1.94}


 97%|█████████▋| 113900/117610 [9:13:32<14:15,  4.34it/s]

{'loss': 4.5222, 'learning_rate': 1.2392316537979677e-07, 'epoch': 1.94}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5171, 'learning_rate': 1.1733786528814739e-07, 'epoch': 1.94}


 97%|█████████▋| 114100/117610 [9:14:35<13:47,  4.24it/s]  

{'loss': 4.5168, 'learning_rate': 1.1093193542789371e-07, 'epoch': 1.94}


 97%|█████████▋| 114200/117610 [9:14:58<13:37,  4.17it/s]

{'loss': 4.499, 'learning_rate': 1.0470542197710264e-07, 'epoch': 1.94}


 97%|█████████▋| 114300/117610 [9:15:22<13:18,  4.14it/s]

{'loss': 4.531, 'learning_rate': 9.865836982048405e-08, 'epoch': 1.94}


 97%|█████████▋| 114400/117610 [9:15:46<12:50,  4.17it/s]

{'loss': 4.4905, 'learning_rate': 9.279082254908256e-08, 'epoch': 1.95}


 97%|█████████▋| 114500/117610 [9:16:24<3:53:54,  4.51s/it]

{'loss': 4.5041, 'learning_rate': 8.710282245995017e-08, 'epoch': 1.95}


 97%|█████████▋| 114600/117610 [9:16:48<11:45,  4.27it/s]  

{'loss': 4.5192, 'learning_rate': 8.159441055584916e-08, 'epoch': 1.95}


 98%|█████████▊| 114700/117610 [9:17:11<11:23,  4.25it/s]

{'loss': 4.4631, 'learning_rate': 7.626562654495795e-08, 'epoch': 1.95}


 98%|█████████▊| 114800/117610 [9:17:35<10:55,  4.29it/s]

{'loss': 4.5571, 'learning_rate': 7.111650884057685e-08, 'epoch': 1.95}


 98%|█████████▊| 114900/117610 [9:17:59<10:55,  4.13it/s]

{'loss': 4.5213, 'learning_rate': 6.61470945608561e-08, 'epoch': 1.95}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5444, 'learning_rate': 6.135741952853213e-08, 'epoch': 1.96}


 98%|█████████▊| 115100/117610 [9:18:57<09:48,  4.27it/s]  

{'loss': 4.5106, 'learning_rate': 5.674751827066394e-08, 'epoch': 1.96}


 98%|█████████▊| 115200/117610 [9:19:20<09:22,  4.29it/s]

{'loss': 4.5484, 'learning_rate': 5.231742401838047e-08, 'epoch': 1.96}


 98%|█████████▊| 115300/117610 [9:19:45<09:14,  4.16it/s]

{'loss': 4.5131, 'learning_rate': 4.806716870665584e-08, 'epoch': 1.96}


 98%|█████████▊| 115401/117610 [9:20:08<08:37,  4.27it/s]

{'loss': 4.5041, 'learning_rate': 4.399678297405674e-08, 'epoch': 1.96}


 98%|█████████▊| 115500/117610 [9:20:45<2:21:33,  4.03s/it]

{'loss': 4.4679, 'learning_rate': 4.010629616254813e-08, 'epoch': 1.96}


 98%|█████████▊| 115600/117610 [9:21:08<07:58,  4.20it/s]  

{'loss': 4.5522, 'learning_rate': 3.639573631725735e-08, 'epoch': 1.97}


 98%|█████████▊| 115700/117610 [9:21:32<07:41,  4.14it/s]

{'loss': 4.4683, 'learning_rate': 3.286513018628812e-08, 'epoch': 1.97}


 98%|█████████▊| 115800/117610 [9:21:56<07:20,  4.11it/s]

{'loss': 4.5857, 'learning_rate': 2.9514503220517964e-08, 'epoch': 1.97}


 99%|█████████▊| 115900/117610 [9:22:20<06:51,  4.15it/s]

{'loss': 4.4981, 'learning_rate': 2.6343879573420548e-08, 'epoch': 1.97}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.4997, 'learning_rate': 2.3353282100893603e-08, 'epoch': 1.97}


 99%|█████████▊| 116100/117610 [9:23:19<05:46,  4.36it/s]  

{'loss': 4.5084, 'learning_rate': 2.0542732361084062e-08, 'epoch': 1.97}


 99%|█████████▉| 116200/117610 [9:23:43<05:42,  4.12it/s]

{'loss': 4.522, 'learning_rate': 1.7912250614240955e-08, 'epoch': 1.98}


 99%|█████████▉| 116300/117610 [9:24:07<05:01,  4.34it/s]

{'loss': 4.4704, 'learning_rate': 1.5461855822573868e-08, 'epoch': 1.98}


 99%|█████████▉| 116400/117610 [9:24:30<04:48,  4.19it/s]

{'loss': 4.5328, 'learning_rate': 1.3191565650097493e-08, 'epoch': 1.98}


 99%|█████████▉| 116500/117610 [9:25:00<40:34,  2.19s/it]

{'loss': 4.5081, 'learning_rate': 1.1101396462528945e-08, 'epoch': 1.98}


 99%|█████████▉| 116600/117610 [9:25:24<04:00,  4.20it/s]

{'loss': 4.5157, 'learning_rate': 9.191363327151758e-09, 'epoch': 1.98}


 99%|█████████▉| 116700/117610 [9:25:48<03:38,  4.17it/s]

{'loss': 4.5168, 'learning_rate': 7.461480012707634e-09, 'epoch': 1.98}


 99%|█████████▉| 116800/117610 [9:26:11<03:03,  4.41it/s]

{'loss': 4.4829, 'learning_rate': 5.911758989313177e-09, 'epoch': 1.99}


 99%|█████████▉| 116900/117610 [9:26:35<02:51,  4.14it/s]

{'loss': 4.5238, 'learning_rate': 4.542211428354426e-09, 'epoch': 1.99}


Non-default generation parameters: {'max_length': 1024, 'do_sample': True}


{'loss': 4.5408, 'learning_rate': 3.3528472024091375e-09, 'epoch': 1.99}


100%|█████████▉| 117100/117610 [9:27:34<02:03,  4.12it/s]

{'loss': 4.5357, 'learning_rate': 2.3436748851773936e-09, 'epoch': 1.99}


100%|█████████▉| 117200/117610 [9:27:58<01:37,  4.19it/s]

{'loss': 4.5249, 'learning_rate': 1.5147017514260952e-09, 'epoch': 1.99}


100%|█████████▉| 117300/117610 [9:28:22<01:11,  4.35it/s]

{'loss': 4.4845, 'learning_rate': 8.659337769251208e-10, 'epoch': 1.99}


100%|█████████▉| 117400/117610 [9:28:45<00:50,  4.14it/s]

{'loss': 4.589, 'learning_rate': 3.973756384112459e-10, 'epoch': 2.0}


100%|█████████▉| 117501/117610 [9:29:19<04:20,  2.39s/it]

{'loss': 4.5317, 'learning_rate': 1.090307135548363e-10, 'epoch': 2.0}


100%|█████████▉| 117600/117610 [9:29:43<00:02,  4.14it/s]

{'loss': 4.5157, 'learning_rate': 9.010809265408427e-13, 'epoch': 2.0}


100%|██████████| 117610/117610 [9:29:45<00:00,  3.44it/s]

{'train_runtime': 34188.792, 'train_samples_per_second': 13.76, 'train_steps_per_second': 3.44, 'train_loss': 5.281797347430844, 'epoch': 2.0}





TrainOutput(global_step=117610, training_loss=5.281797347430844, metrics={'train_runtime': 34188.792, 'train_samples_per_second': 13.76, 'train_steps_per_second': 3.44, 'train_loss': 5.281797347430844, 'epoch': 2.0})

In [None]:
model.save_pretrained('./new_model', tokenizer=tokenizer)

In [18]:
tokenizer.save_pretrained('./new_tokenizer')

In [None]:
wandb.finish()