In [1]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
import pandas as pd
import torch
import re
import os
from convokit import Corpus, download
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Local save path for corpus & models
save_path = '/Users/halladaykinsey/Desktop/Conversational_Chatbot/conversational_data'
if not os.path.exists(save_path):
    os.makedirs(save_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Downloading & loading the corpus
corpus = Corpus(filename=download("movie-corpus"))

# Extracting conversations into a list
conversations = list(corpus.iter_conversations())

# Saving conversations to a file for later use
pd.DataFrame(conversations).to_csv(os.path.join(save_path, 'conversations.csv'), index=False)

# Printing a sample conversation for verification
for utt_id in conversations[0].get_utterance_ids():
    print(corpus.get_utterance(utt_id).text)

Downloading movie-corpus to /Users/halladaykinsey/.convokit/downloads/movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done
They do not!
They do to!


In [4]:
# Function to clean text by removing brackets, punctuation, and extra spaces
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)  
    text = re.sub(r'[^a-zA-Z\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip() 
    return text.lower() 

# Cleaning text for all utterances
total_convos = len(conversations)
for i, convo in enumerate(conversations):
    for utt_id in convo.get_utterance_ids():
        utt = corpus.get_utterance(utt_id)
        utt.text = clean_text(utt.text)
    if i % 100 == 0:  # Printing progress every 100 conversations
        print(f"Cleaned {i}/{total_convos} conversations")

Cleaned 0/83097 conversations
Cleaned 100/83097 conversations
Cleaned 200/83097 conversations
Cleaned 300/83097 conversations
Cleaned 400/83097 conversations
Cleaned 500/83097 conversations
Cleaned 600/83097 conversations
Cleaned 700/83097 conversations
Cleaned 800/83097 conversations
Cleaned 900/83097 conversations
Cleaned 1000/83097 conversations
Cleaned 1100/83097 conversations
Cleaned 1200/83097 conversations
Cleaned 1300/83097 conversations
Cleaned 1400/83097 conversations
Cleaned 1500/83097 conversations
Cleaned 1600/83097 conversations
Cleaned 1700/83097 conversations
Cleaned 1800/83097 conversations
Cleaned 1900/83097 conversations
Cleaned 2000/83097 conversations
Cleaned 2100/83097 conversations
Cleaned 2200/83097 conversations
Cleaned 2300/83097 conversations
Cleaned 2400/83097 conversations
Cleaned 2500/83097 conversations
Cleaned 2600/83097 conversations
Cleaned 2700/83097 conversations
Cleaned 2800/83097 conversations
Cleaned 2900/83097 conversations
Cleaned 3000/83097 con

In [5]:
# Initializing an empty list to store conversation data
conversation_data = []

# Looping through processed conversations and collecting text
for convo in conversations:
    for utt_id in convo.get_utterance_ids():
        utt = corpus.get_utterance(utt_id)
        processed_text = utt.text
        conversation_data.append({'text': processed_text})

# Converting conversation data to a DataFrame
df = pd.DataFrame(conversation_data)

# Displaying first few rows of the DataFrame
print(df.head())

          text
0  they do not
1   they do to
2    i hope so
3     she okay
4      lets go


In [6]:
# Creating dictionary to store conversation data
conversation_dfs = {}
conversation_dfs['all_conversations'] = df

# Taking a random sample of 10,000 rows 
subset_size = 10000 
df_subset = df.sample(n=subset_size, random_state=42)

# Verifying size and first few rows of subset
print(df_subset.shape)
print(df_subset.head())

# Splitting the dataset into training and evaluation sets (90% train, 10% eval)
train_df, eval_df = train_test_split(df_subset, test_size=0.1, random_state=42)

(10000, 1)
                                                     text
63127                                 my mom wasnt a goat
134803  i must protect my interests ms kyle and intere...
143713  you said bad things hurt places so maybe good ...
216087                             she hates all freshmen
165209                              what are you gonna do


In [7]:
# Initializing tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Define a function to tokenize input and response texts
def tokenize_text(text, max_length=128):
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=max_length)

# Tokenizing input and response text for training set
train_df['input_ids'] = train_df['text'].apply(lambda x: tokenize_text(x))
train_df['response_ids'] = train_df['text'].shift(-1).apply(lambda x: tokenize_text(x) if pd.notna(x) else [tokenizer.eos_token_id])

# Tokenizing input and response text for evaluation set
eval_df['input_ids'] = eval_df['text'].apply(lambda x: tokenize_text(x))
eval_df['response_ids'] = eval_df['text'].shift(-1).apply(lambda x: tokenize_text(x) if pd.notna(x) else [tokenizer.eos_token_id])



In [8]:
class MovieDialoguesDataset(Dataset):
    def __init__(self, dataframe):
        self.input_ids = dataframe['input_ids'].tolist()
        self.labels = dataframe['response_ids'].tolist()

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [9]:
# Filtering out rows where input_ids or response_ids are not of length 128
train_df = train_df[(train_df['input_ids'].apply(len) == 128) & (train_df['response_ids'].apply(len) == 128)]
eval_df = eval_df[(eval_df['input_ids'].apply(len) == 128) & (eval_df['response_ids'].apply(len) == 128)]

In [10]:
# Create dataset instances
train_dataset = MovieDialoguesDataset(train_df)
eval_dataset = MovieDialoguesDataset(eval_df)

# Checking dataset instances
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Train dataset size: 8981
Eval dataset size: 995


In [11]:
# Check for input_ids and response_ids lengths
print("Input IDs lengths:", train_df['input_ids'].apply(len).value_counts())
print("Response IDs lengths:", train_df['response_ids'].apply(len).value_counts())

Input IDs lengths: input_ids
128    8981
Name: count, dtype: int64
Response IDs lengths: response_ids
128    8981
Name: count, dtype: int64


In [12]:
# Global variable to store the trainer instance
trainer = None

def train_model_for_conversation(dataset):
    global trainer  # Declare that we want to use the global trainer variable

    # Loading GPT-2
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Defining training arguments for conversation
    training_args = TrainingArguments(
        output_dir=os.path.join(save_path, 'model_results'), 
        num_train_epochs=3, 
        per_device_train_batch_size=2,  
        per_device_eval_batch_size=2,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=os.path.join(save_path, 'logs'),  
        logging_steps=10,
        evaluation_strategy="steps",  # Change to "steps"
        save_strategy="steps",         # Ensure saving strategy is also "steps"
        load_best_model_at_end=True,  # Add this line
    )

    # Creating trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=eval_dataset  # Ensure you pass the eval dataset
    )

    # Training model
    trainer.train()

    # Saving trained model to local path
    model.save_pretrained(os.path.join(save_path, 'trained_conversational_model'))
    tokenizer.save_pretrained(os.path.join(save_path, 'trained_conversational_model'))  # Save tokenizer

    return trainer


In [13]:
# Training model
train_model_for_conversation(train_dataset)

  0%|          | 10/13473 [00:12<4:12:40,  1.13s/it]

{'loss': 12.7561, 'grad_norm': 225.73989868164062, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                                    
  0%|          | 10/13473 [01:41<4:12:40,  1.13s/it]

{'eval_loss': 13.287164688110352, 'eval_runtime': 88.4631, 'eval_samples_per_second': 11.248, 'eval_steps_per_second': 5.629, 'epoch': 0.0}


  0%|          | 20/13473 [01:51<7:25:12,  1.99s/it]  

{'loss': 11.004, 'grad_norm': 226.78887939453125, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                                    
  0%|          | 20/13473 [03:26<7:25:12,  1.99s/it]

{'eval_loss': 10.524092674255371, 'eval_runtime': 95.6633, 'eval_samples_per_second': 10.401, 'eval_steps_per_second': 5.206, 'epoch': 0.0}


  0%|          | 30/13473 [03:36<8:04:52,  2.16s/it]  

{'loss': 6.9438, 'grad_norm': 119.64160919189453, 'learning_rate': 3e-06, 'epoch': 0.01}


                                                    
  0%|          | 30/13473 [05:13<8:04:52,  2.16s/it]

{'eval_loss': 4.3112101554870605, 'eval_runtime': 96.9578, 'eval_samples_per_second': 10.262, 'eval_steps_per_second': 5.136, 'epoch': 0.01}


  0%|          | 40/13473 [05:23<7:54:54,  2.12s/it]  

{'loss': 2.8138, 'grad_norm': 38.805328369140625, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.01}


                                                    
  0%|          | 40/13473 [07:00<7:54:54,  2.12s/it]

{'eval_loss': 1.8046867847442627, 'eval_runtime': 96.2928, 'eval_samples_per_second': 10.333, 'eval_steps_per_second': 5.172, 'epoch': 0.01}


  0%|          | 50/13473 [07:10<7:56:47,  2.13s/it]  

{'loss': 1.598, 'grad_norm': 35.55680847167969, 'learning_rate': 5e-06, 'epoch': 0.01}


                                                    
  0%|          | 50/13473 [08:49<7:56:47,  2.13s/it]

{'eval_loss': 1.4474492073059082, 'eval_runtime': 98.6861, 'eval_samples_per_second': 10.082, 'eval_steps_per_second': 5.046, 'epoch': 0.01}


  0%|          | 60/13473 [08:59<7:59:28,  2.14s/it]  

{'loss': 1.0077, 'grad_norm': 30.959524154663086, 'learning_rate': 6e-06, 'epoch': 0.01}


                                                    
  0%|          | 60/13473 [10:36<7:59:28,  2.14s/it]

{'eval_loss': 1.2322136163711548, 'eval_runtime': 97.0473, 'eval_samples_per_second': 10.253, 'eval_steps_per_second': 5.132, 'epoch': 0.01}


  1%|          | 70/13473 [10:45<7:51:02,  2.11s/it]  

{'loss': 1.5614, 'grad_norm': 13.015396118164062, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.02}


                                                    
  1%|          | 70/13473 [12:21<7:51:02,  2.11s/it]

{'eval_loss': 1.154893398284912, 'eval_runtime': 95.5816, 'eval_samples_per_second': 10.41, 'eval_steps_per_second': 5.21, 'epoch': 0.02}


  1%|          | 80/13473 [12:31<7:52:39,  2.12s/it]  

{'loss': 0.9707, 'grad_norm': 13.588808059692383, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.02}


                                                    
  1%|          | 80/13473 [14:07<7:52:39,  2.12s/it]

{'eval_loss': 1.0686204433441162, 'eval_runtime': 96.2694, 'eval_samples_per_second': 10.336, 'eval_steps_per_second': 5.173, 'epoch': 0.02}


  1%|          | 90/13473 [14:17<7:50:16,  2.11s/it]  

{'loss': 1.1687, 'grad_norm': 25.365427017211914, 'learning_rate': 9e-06, 'epoch': 0.02}


                                                    
  1%|          | 90/13473 [28:57<7:50:16,  2.11s/it]

{'eval_loss': 0.9753501415252686, 'eval_runtime': 879.8865, 'eval_samples_per_second': 1.131, 'eval_steps_per_second': 0.566, 'epoch': 0.02}


  1%|          | 100/13473 [29:11<43:52:36, 11.81s/it] 

{'loss': 1.1953, 'grad_norm': 4.623264312744141, 'learning_rate': 1e-05, 'epoch': 0.02}


                                                      
  1%|          | 100/13473 [30:50<43:52:36, 11.81s/it]

{'eval_loss': 0.8940721154212952, 'eval_runtime': 98.2138, 'eval_samples_per_second': 10.131, 'eval_steps_per_second': 5.071, 'epoch': 0.02}


  1%|          | 110/13473 [31:00<8:59:13,  2.42s/it]  

{'loss': 1.0254, 'grad_norm': 15.71133804321289, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.02}


                                                     
  1%|          | 110/13473 [32:39<8:59:13,  2.42s/it]

{'eval_loss': 0.8520007133483887, 'eval_runtime': 99.6164, 'eval_samples_per_second': 9.988, 'eval_steps_per_second': 4.999, 'epoch': 0.02}


  1%|          | 120/13473 [32:49<8:03:42,  2.17s/it]  

{'loss': 0.6429, 'grad_norm': 12.167830467224121, 'learning_rate': 1.2e-05, 'epoch': 0.03}


                                                     
  1%|          | 120/13473 [34:27<8:03:42,  2.17s/it]

{'eval_loss': 0.7836592793464661, 'eval_runtime': 97.2371, 'eval_samples_per_second': 10.233, 'eval_steps_per_second': 5.122, 'epoch': 0.03}


  1%|          | 130/13473 [34:36<7:40:51,  2.07s/it]  

{'loss': 0.5077, 'grad_norm': 7.217987060546875, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.03}


                                                     
  1%|          | 130/13473 [36:09<7:40:51,  2.07s/it]

{'eval_loss': 0.768967866897583, 'eval_runtime': 92.8376, 'eval_samples_per_second': 10.718, 'eval_steps_per_second': 5.364, 'epoch': 0.03}


  1%|          | 140/13473 [36:19<7:39:20,  2.07s/it]  

{'loss': 0.8115, 'grad_norm': 3.7317817211151123, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.03}


                                                     
  1%|          | 140/13473 [37:55<7:39:20,  2.07s/it]

{'eval_loss': 0.765421986579895, 'eval_runtime': 95.7301, 'eval_samples_per_second': 10.394, 'eval_steps_per_second': 5.202, 'epoch': 0.03}


  1%|          | 150/13473 [38:06<7:57:36,  2.15s/it]  

{'loss': 0.9785, 'grad_norm': 32.19070816040039, 'learning_rate': 1.5e-05, 'epoch': 0.03}


                                                     
  1%|          | 150/13473 [39:39<7:57:36,  2.15s/it]

{'eval_loss': 0.7573814988136292, 'eval_runtime': 92.8321, 'eval_samples_per_second': 10.718, 'eval_steps_per_second': 5.365, 'epoch': 0.03}


  1%|          | 160/13473 [39:49<8:03:14,  2.18s/it]  

{'loss': 0.7549, 'grad_norm': 4.792784690856934, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.04}


                                                     
  1%|          | 160/13473 [41:25<8:03:14,  2.18s/it]

{'eval_loss': 0.7633912563323975, 'eval_runtime': 95.7387, 'eval_samples_per_second': 10.393, 'eval_steps_per_second': 5.202, 'epoch': 0.04}


  1%|▏         | 170/13473 [41:35<7:49:29,  2.12s/it]  

{'loss': 0.6052, 'grad_norm': 11.91805648803711, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 170/13473 [43:10<7:49:29,  2.12s/it]

{'eval_loss': 0.782227098941803, 'eval_runtime': 95.4366, 'eval_samples_per_second': 10.426, 'eval_steps_per_second': 5.218, 'epoch': 0.04}


  1%|▏         | 180/13473 [43:20<7:46:22,  2.11s/it]  

{'loss': 0.8482, 'grad_norm': 15.787534713745117, 'learning_rate': 1.8e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 180/13473 [44:55<7:46:22,  2.11s/it]

{'eval_loss': 0.7484285831451416, 'eval_runtime': 94.6509, 'eval_samples_per_second': 10.512, 'eval_steps_per_second': 5.261, 'epoch': 0.04}


  1%|▏         | 190/13473 [45:05<8:06:57,  2.20s/it]  

{'loss': 0.6797, 'grad_norm': 9.558030128479004, 'learning_rate': 1.9e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 190/13473 [46:39<8:06:57,  2.20s/it]

{'eval_loss': 0.7376377582550049, 'eval_runtime': 94.1262, 'eval_samples_per_second': 10.571, 'eval_steps_per_second': 5.291, 'epoch': 0.04}


  1%|▏         | 200/13473 [46:49<7:39:38,  2.08s/it]  

{'loss': 0.5914, 'grad_norm': 6.9453301429748535, 'learning_rate': 2e-05, 'epoch': 0.04}


                                                     
  1%|▏         | 200/13473 [48:23<7:39:38,  2.08s/it]

{'eval_loss': 0.7730563282966614, 'eval_runtime': 93.9948, 'eval_samples_per_second': 10.586, 'eval_steps_per_second': 5.298, 'epoch': 0.04}


  2%|▏         | 210/13473 [48:33<7:39:35,  2.08s/it]  

{'loss': 0.7969, 'grad_norm': 16.71982192993164, 'learning_rate': 2.1e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 210/13473 [50:07<7:39:35,  2.08s/it]

{'eval_loss': 0.7590102553367615, 'eval_runtime': 94.5546, 'eval_samples_per_second': 10.523, 'eval_steps_per_second': 5.267, 'epoch': 0.05}


  2%|▏         | 220/13473 [50:17<7:37:40,  2.07s/it]  

{'loss': 0.6488, 'grad_norm': 20.064926147460938, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 220/13473 [51:55<7:37:40,  2.07s/it]

{'eval_loss': 0.7452864646911621, 'eval_runtime': 97.8464, 'eval_samples_per_second': 10.169, 'eval_steps_per_second': 5.09, 'epoch': 0.05}


  2%|▏         | 230/13473 [52:05<8:17:58,  2.26s/it]  

{'loss': 1.3537, 'grad_norm': 10.655451774597168, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 230/13473 [53:44<8:17:58,  2.26s/it]

{'eval_loss': 0.7939386963844299, 'eval_runtime': 98.2299, 'eval_samples_per_second': 10.129, 'eval_steps_per_second': 5.07, 'epoch': 0.05}


  2%|▏         | 240/13473 [53:54<7:42:55,  2.10s/it]  

{'loss': 0.6137, 'grad_norm': 3.812127113342285, 'learning_rate': 2.4e-05, 'epoch': 0.05}


                                                     
  2%|▏         | 240/13473 [55:21<7:42:55,  2.10s/it]

{'eval_loss': 0.7432315349578857, 'eval_runtime': 87.7968, 'eval_samples_per_second': 11.333, 'eval_steps_per_second': 5.672, 'epoch': 0.05}


  2%|▏         | 250/13473 [55:31<7:12:37,  1.96s/it]  

{'loss': 0.7728, 'grad_norm': 10.535128593444824, 'learning_rate': 2.5e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 250/13473 [56:59<7:12:37,  1.96s/it]

{'eval_loss': 0.7861974835395813, 'eval_runtime': 88.2291, 'eval_samples_per_second': 11.277, 'eval_steps_per_second': 5.644, 'epoch': 0.06}


  2%|▏         | 260/13473 [57:09<7:15:59,  1.98s/it]  

{'loss': 0.8077, 'grad_norm': 9.04298210144043, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 260/13473 [58:38<7:15:59,  1.98s/it]

{'eval_loss': 0.7289537191390991, 'eval_runtime': 88.3536, 'eval_samples_per_second': 11.262, 'eval_steps_per_second': 5.636, 'epoch': 0.06}


  2%|▏         | 270/13473 [58:47<7:15:03,  1.98s/it]  

{'loss': 0.7384, 'grad_norm': 10.024827003479004, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.06}


                                                     
  2%|▏         | 270/13473 [1:00:16<7:15:03,  1.98s/it]

{'eval_loss': 0.7296831607818604, 'eval_runtime': 88.1858, 'eval_samples_per_second': 11.283, 'eval_steps_per_second': 5.647, 'epoch': 0.06}


  2%|▏         | 280/13473 [1:00:25<7:30:47,  2.05s/it]  

{'loss': 0.6311, 'grad_norm': 43.645442962646484, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.06}


                                                       
  2%|▏         | 280/13473 [1:01:53<7:30:47,  2.05s/it]

{'eval_loss': 0.7777179479598999, 'eval_runtime': 87.1289, 'eval_samples_per_second': 11.42, 'eval_steps_per_second': 5.716, 'epoch': 0.06}


  2%|▏         | 290/13473 [1:02:02<7:10:32,  1.96s/it]  

{'loss': 0.4039, 'grad_norm': 5.0722784996032715, 'learning_rate': 2.9e-05, 'epoch': 0.06}


                                                       
  2%|▏         | 290/13473 [1:03:31<7:10:32,  1.96s/it]

{'eval_loss': 0.7596169114112854, 'eval_runtime': 88.2776, 'eval_samples_per_second': 11.271, 'eval_steps_per_second': 5.641, 'epoch': 0.06}


  2%|▏         | 300/13473 [1:03:40<7:22:01,  2.01s/it]  

{'loss': 1.1128, 'grad_norm': 11.951732635498047, 'learning_rate': 3e-05, 'epoch': 0.07}


                                                       
  2%|▏         | 300/13473 [1:05:10<7:22:01,  2.01s/it]

{'eval_loss': 0.7700353860855103, 'eval_runtime': 89.1565, 'eval_samples_per_second': 11.16, 'eval_steps_per_second': 5.586, 'epoch': 0.07}


  2%|▏         | 310/13473 [1:05:20<7:49:45,  2.14s/it]  

{'loss': 1.4994, 'grad_norm': 31.20751190185547, 'learning_rate': 3.1e-05, 'epoch': 0.07}


                                                       
  2%|▏         | 310/13473 [1:06:49<7:49:45,  2.14s/it]

{'eval_loss': 0.76714026927948, 'eval_runtime': 89.1353, 'eval_samples_per_second': 11.163, 'eval_steps_per_second': 5.587, 'epoch': 0.07}


  2%|▏         | 320/13473 [1:07:00<7:48:40,  2.14s/it]  

{'loss': 0.79, 'grad_norm': 3.9634838104248047, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.07}


                                                       
  2%|▏         | 320/13473 [1:08:25<7:48:40,  2.14s/it]

{'eval_loss': 0.7585521936416626, 'eval_runtime': 84.8779, 'eval_samples_per_second': 11.723, 'eval_steps_per_second': 5.867, 'epoch': 0.07}


  2%|▏         | 330/13473 [1:08:34<7:04:23,  1.94s/it]  

{'loss': 0.6143, 'grad_norm': 5.89286994934082, 'learning_rate': 3.3e-05, 'epoch': 0.07}


                                                       
  2%|▏         | 330/13473 [1:09:59<7:04:23,  1.94s/it]

{'eval_loss': 0.7334747910499573, 'eval_runtime': 84.5822, 'eval_samples_per_second': 11.764, 'eval_steps_per_second': 5.888, 'epoch': 0.07}


  3%|▎         | 340/13473 [1:10:09<7:15:42,  1.99s/it] 

{'loss': 0.8197, 'grad_norm': 4.248692989349365, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.08}


                                                       
  3%|▎         | 340/13473 [1:11:35<7:15:42,  1.99s/it]

{'eval_loss': 0.7188882231712341, 'eval_runtime': 85.3059, 'eval_samples_per_second': 11.664, 'eval_steps_per_second': 5.838, 'epoch': 0.08}


  3%|▎         | 350/13473 [1:11:45<7:04:58,  1.94s/it]  

{'loss': 0.7969, 'grad_norm': 10.747626304626465, 'learning_rate': 3.5e-05, 'epoch': 0.08}


                                                       
  3%|▎         | 350/13473 [1:13:09<7:04:58,  1.94s/it]

{'eval_loss': 0.7218369841575623, 'eval_runtime': 84.0601, 'eval_samples_per_second': 11.837, 'eval_steps_per_second': 5.924, 'epoch': 0.08}


  3%|▎         | 360/13473 [1:13:18<6:54:30,  1.90s/it] 

{'loss': 0.7067, 'grad_norm': 13.693099021911621, 'learning_rate': 3.6e-05, 'epoch': 0.08}


                                                       
  3%|▎         | 360/13473 [1:14:43<6:54:30,  1.90s/it]

{'eval_loss': 0.715442419052124, 'eval_runtime': 85.2271, 'eval_samples_per_second': 11.675, 'eval_steps_per_second': 5.843, 'epoch': 0.08}


  3%|▎         | 370/13473 [1:14:53<7:12:49,  1.98s/it] 

{'loss': 0.4952, 'grad_norm': 4.46554708480835, 'learning_rate': 3.7e-05, 'epoch': 0.08}


                                                       
  3%|▎         | 370/13473 [1:16:16<7:12:49,  1.98s/it]

{'eval_loss': 0.8134557008743286, 'eval_runtime': 82.7925, 'eval_samples_per_second': 12.018, 'eval_steps_per_second': 6.015, 'epoch': 0.08}


  3%|▎         | 380/13473 [1:16:25<6:55:20,  1.90s/it] 

{'loss': 0.824, 'grad_norm': 12.475844383239746, 'learning_rate': 3.8e-05, 'epoch': 0.08}


                                                       
  3%|▎         | 380/13473 [1:17:52<6:55:20,  1.90s/it]

{'eval_loss': 0.7219618558883667, 'eval_runtime': 87.1292, 'eval_samples_per_second': 11.42, 'eval_steps_per_second': 5.716, 'epoch': 0.08}


  3%|▎         | 390/13473 [1:18:02<7:30:53,  2.07s/it]  

{'loss': 1.0781, 'grad_norm': 19.894636154174805, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.09}


                                                       
  3%|▎         | 390/13473 [1:19:28<7:30:53,  2.07s/it]

{'eval_loss': 0.741669774055481, 'eval_runtime': 85.2644, 'eval_samples_per_second': 11.67, 'eval_steps_per_second': 5.841, 'epoch': 0.09}


  3%|▎         | 400/13473 [1:19:37<6:57:46,  1.92s/it]  

{'loss': 0.6871, 'grad_norm': 7.530923843383789, 'learning_rate': 4e-05, 'epoch': 0.09}


                                                       
  3%|▎         | 400/13473 [1:21:03<6:57:46,  1.92s/it]

{'eval_loss': 0.7725932598114014, 'eval_runtime': 85.8512, 'eval_samples_per_second': 11.59, 'eval_steps_per_second': 5.801, 'epoch': 0.09}


  3%|▎         | 410/13473 [1:21:13<7:06:17,  1.96s/it]  

{'loss': 0.6749, 'grad_norm': 12.001233100891113, 'learning_rate': 4.1e-05, 'epoch': 0.09}


                                                       
  3%|▎         | 410/13473 [1:22:43<7:06:17,  1.96s/it]

{'eval_loss': 0.7639809250831604, 'eval_runtime': 89.5343, 'eval_samples_per_second': 11.113, 'eval_steps_per_second': 5.562, 'epoch': 0.09}


  3%|▎         | 420/13473 [1:22:53<7:44:49,  2.14s/it]  

{'loss': 0.5663, 'grad_norm': 19.694570541381836, 'learning_rate': 4.2e-05, 'epoch': 0.09}


                                                       
  3%|▎         | 420/13473 [1:24:22<7:44:49,  2.14s/it]

{'eval_loss': 0.7200566530227661, 'eval_runtime': 88.8746, 'eval_samples_per_second': 11.196, 'eval_steps_per_second': 5.603, 'epoch': 0.09}


  3%|▎         | 430/13473 [1:24:32<7:09:14,  1.97s/it]  

{'loss': 0.718, 'grad_norm': 7.53505802154541, 'learning_rate': 4.3e-05, 'epoch': 0.1}


                                                       
  3%|▎         | 430/13473 [1:26:04<7:09:14,  1.97s/it]

{'eval_loss': 0.7132872343063354, 'eval_runtime': 92.6568, 'eval_samples_per_second': 10.739, 'eval_steps_per_second': 5.375, 'epoch': 0.1}


  3%|▎         | 440/13473 [1:26:14<7:25:36,  2.05s/it]  

{'loss': 1.0859, 'grad_norm': 3.3442037105560303, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.1}


                                                       
  3%|▎         | 440/13473 [1:27:50<7:25:36,  2.05s/it]

{'eval_loss': 0.7192938327789307, 'eval_runtime': 96.3706, 'eval_samples_per_second': 10.325, 'eval_steps_per_second': 5.168, 'epoch': 0.1}


  3%|▎         | 450/13473 [1:27:59<7:22:41,  2.04s/it]  

{'loss': 0.6907, 'grad_norm': 4.642205715179443, 'learning_rate': 4.5e-05, 'epoch': 0.1}


                                                       
  3%|▎         | 450/13473 [1:29:38<7:22:41,  2.04s/it]

{'eval_loss': 0.7263298630714417, 'eval_runtime': 98.5235, 'eval_samples_per_second': 10.099, 'eval_steps_per_second': 5.055, 'epoch': 0.1}


  3%|▎         | 460/13473 [1:29:49<7:56:57,  2.20s/it]  

{'loss': 0.6052, 'grad_norm': 7.136635780334473, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.1}


                                                       
  3%|▎         | 460/13473 [1:31:23<7:56:57,  2.20s/it]

{'eval_loss': 0.8042763471603394, 'eval_runtime': 94.4429, 'eval_samples_per_second': 10.535, 'eval_steps_per_second': 5.273, 'epoch': 0.1}


  3%|▎         | 470/13473 [1:31:33<7:30:39,  2.08s/it]  

{'loss': 0.9245, 'grad_norm': 3.0325064659118652, 'learning_rate': 4.7e-05, 'epoch': 0.1}


                                                       
  3%|▎         | 470/13473 [1:33:07<7:30:39,  2.08s/it]

{'eval_loss': 0.7295767664909363, 'eval_runtime': 93.6526, 'eval_samples_per_second': 10.624, 'eval_steps_per_second': 5.318, 'epoch': 0.1}


  4%|▎         | 480/13473 [1:33:16<7:25:34,  2.06s/it]  

{'loss': 0.5705, 'grad_norm': 6.795499324798584, 'learning_rate': 4.8e-05, 'epoch': 0.11}


                                                       
  4%|▎         | 480/13473 [1:34:51<7:25:34,  2.06s/it]

{'eval_loss': 0.7218373417854309, 'eval_runtime': 94.717, 'eval_samples_per_second': 10.505, 'eval_steps_per_second': 5.258, 'epoch': 0.11}


  4%|▎         | 490/13473 [1:35:00<7:28:15,  2.07s/it]  

{'loss': 0.8097, 'grad_norm': 7.625492095947266, 'learning_rate': 4.9e-05, 'epoch': 0.11}


                                                       
  4%|▎         | 490/13473 [1:36:36<7:28:15,  2.07s/it]

{'eval_loss': 0.7263543605804443, 'eval_runtime': 95.7314, 'eval_samples_per_second': 10.394, 'eval_steps_per_second': 5.202, 'epoch': 0.11}


  4%|▎         | 500/13473 [1:36:47<8:09:35,  2.26s/it]  

{'loss': 0.6295, 'grad_norm': 7.837323188781738, 'learning_rate': 5e-05, 'epoch': 0.11}


                                                       
  4%|▎         | 500/13473 [1:38:21<8:09:35,  2.26s/it]

{'eval_loss': 0.7308434247970581, 'eval_runtime': 94.7122, 'eval_samples_per_second': 10.506, 'eval_steps_per_second': 5.258, 'epoch': 0.11}


  4%|▍         | 510/13473 [1:38:34<7:43:18,  2.14s/it]  

{'loss': 0.882, 'grad_norm': 2.539280891418457, 'learning_rate': 4.996145841362831e-05, 'epoch': 0.11}


                                                       
  4%|▍         | 510/13473 [1:40:10<7:43:18,  2.14s/it]

{'eval_loss': 0.7174072861671448, 'eval_runtime': 95.7095, 'eval_samples_per_second': 10.396, 'eval_steps_per_second': 5.203, 'epoch': 0.11}


  4%|▍         | 520/13473 [1:40:19<7:33:07,  2.10s/it]  

{'loss': 0.6176, 'grad_norm': 2.437349319458008, 'learning_rate': 4.992291682725661e-05, 'epoch': 0.12}


                                                       
  4%|▍         | 520/13473 [1:41:56<7:33:07,  2.10s/it]

{'eval_loss': 0.7373660206794739, 'eval_runtime': 96.9875, 'eval_samples_per_second': 10.259, 'eval_steps_per_second': 5.135, 'epoch': 0.12}


  4%|▍         | 530/13473 [1:42:07<8:07:23,  2.26s/it]  

{'loss': 0.4594, 'grad_norm': 2.1899638175964355, 'learning_rate': 4.988437524088492e-05, 'epoch': 0.12}


                                                       
  4%|▍         | 530/13473 [1:43:42<8:07:23,  2.26s/it]

{'eval_loss': 0.7759943604469299, 'eval_runtime': 94.916, 'eval_samples_per_second': 10.483, 'eval_steps_per_second': 5.247, 'epoch': 0.12}


  4%|▍         | 540/13473 [1:43:52<7:37:10,  2.12s/it]  

{'loss': 0.5292, 'grad_norm': 4.19710636138916, 'learning_rate': 4.984583365451323e-05, 'epoch': 0.12}


                                                       
  4%|▍         | 540/13473 [1:45:25<7:37:10,  2.12s/it]

{'eval_loss': 0.7409554123878479, 'eval_runtime': 93.3181, 'eval_samples_per_second': 10.662, 'eval_steps_per_second': 5.337, 'epoch': 0.12}


  4%|▍         | 550/13473 [1:45:35<7:36:15,  2.12s/it]  

{'loss': 0.5971, 'grad_norm': 3.9198174476623535, 'learning_rate': 4.980729206814153e-05, 'epoch': 0.12}


                                                       
  4%|▍         | 550/13473 [1:47:07<7:36:15,  2.12s/it]

{'eval_loss': 0.7819506525993347, 'eval_runtime': 92.4649, 'eval_samples_per_second': 10.761, 'eval_steps_per_second': 5.386, 'epoch': 0.12}


  4%|▍         | 560/13473 [1:47:17<7:18:34,  2.04s/it]  

{'loss': 0.3935, 'grad_norm': 2.949068784713745, 'learning_rate': 4.976875048176983e-05, 'epoch': 0.12}


                                                       
  4%|▍         | 560/13473 [1:48:53<7:18:34,  2.04s/it]

{'eval_loss': 0.7575950026512146, 'eval_runtime': 96.5151, 'eval_samples_per_second': 10.309, 'eval_steps_per_second': 5.16, 'epoch': 0.12}


  4%|▍         | 570/13473 [1:49:03<7:26:53,  2.08s/it]  

{'loss': 0.6959, 'grad_norm': 9.689306259155273, 'learning_rate': 4.973020889539814e-05, 'epoch': 0.13}


                                                       
  4%|▍         | 570/13473 [1:50:38<7:26:53,  2.08s/it]

{'eval_loss': 0.7610098719596863, 'eval_runtime': 94.9483, 'eval_samples_per_second': 10.479, 'eval_steps_per_second': 5.245, 'epoch': 0.13}


  4%|▍         | 580/13473 [1:50:48<7:22:08,  2.06s/it]  

{'loss': 0.872, 'grad_norm': 6.518136978149414, 'learning_rate': 4.969166730902644e-05, 'epoch': 0.13}


                                                       
  4%|▍         | 580/13473 [1:52:22<7:22:08,  2.06s/it]

{'eval_loss': 0.7292805314064026, 'eval_runtime': 94.8391, 'eval_samples_per_second': 10.491, 'eval_steps_per_second': 5.251, 'epoch': 0.13}


  4%|▍         | 590/13473 [1:52:33<7:49:19,  2.19s/it]  

{'loss': 0.8603, 'grad_norm': 23.949779510498047, 'learning_rate': 4.965312572265475e-05, 'epoch': 0.13}


                                                       
  4%|▍         | 590/13473 [1:54:09<7:49:19,  2.19s/it]

{'eval_loss': 0.7582117915153503, 'eval_runtime': 95.315, 'eval_samples_per_second': 10.439, 'eval_steps_per_second': 5.225, 'epoch': 0.13}


  4%|▍         | 600/13473 [1:54:18<7:21:54,  2.06s/it]  

{'loss': 1.0698, 'grad_norm': 7.331009387969971, 'learning_rate': 4.9614584136283055e-05, 'epoch': 0.13}


                                                       
  4%|▍         | 600/13473 [1:55:49<7:21:54,  2.06s/it]

{'eval_loss': 0.7260338664054871, 'eval_runtime': 90.5965, 'eval_samples_per_second': 10.983, 'eval_steps_per_second': 5.497, 'epoch': 0.13}


  5%|▍         | 610/13473 [1:55:58<7:12:55,  2.02s/it]  

{'loss': 0.6389, 'grad_norm': 1.294986367225647, 'learning_rate': 4.9576042549911356e-05, 'epoch': 0.14}


                                                       
  5%|▍         | 610/13473 [1:57:29<7:12:55,  2.02s/it]

{'eval_loss': 0.7683898210525513, 'eval_runtime': 91.003, 'eval_samples_per_second': 10.934, 'eval_steps_per_second': 5.472, 'epoch': 0.14}


  5%|▍         | 620/13473 [1:57:39<7:17:45,  2.04s/it]  

{'loss': 1.0715, 'grad_norm': 2.8777897357940674, 'learning_rate': 4.9537500963539664e-05, 'epoch': 0.14}


                                                       
  5%|▍         | 620/13473 [1:59:11<7:17:45,  2.04s/it]

{'eval_loss': 0.7212613224983215, 'eval_runtime': 91.9611, 'eval_samples_per_second': 10.82, 'eval_steps_per_second': 5.415, 'epoch': 0.14}


  5%|▍         | 630/13473 [1:59:21<7:22:51,  2.07s/it]  

{'loss': 0.7605, 'grad_norm': 10.818059921264648, 'learning_rate': 4.9498959377167965e-05, 'epoch': 0.14}


                                                       
  5%|▍         | 630/13473 [2:00:55<7:22:51,  2.07s/it]

{'eval_loss': 0.7082850337028503, 'eval_runtime': 93.7114, 'eval_samples_per_second': 10.618, 'eval_steps_per_second': 5.314, 'epoch': 0.14}


  5%|▍         | 640/13473 [2:01:05<7:27:18,  2.09s/it]  

{'loss': 0.8074, 'grad_norm': 4.424010753631592, 'learning_rate': 4.946041779079627e-05, 'epoch': 0.14}


                                                       
  5%|▍         | 640/13473 [2:02:38<7:27:18,  2.09s/it]

{'eval_loss': 0.7118048667907715, 'eval_runtime': 92.9464, 'eval_samples_per_second': 10.705, 'eval_steps_per_second': 5.358, 'epoch': 0.14}


  5%|▍         | 650/13473 [2:02:47<7:17:48,  2.05s/it]  

{'loss': 1.0921, 'grad_norm': 3.74688982963562, 'learning_rate': 4.9421876204424574e-05, 'epoch': 0.14}


                                                       
  5%|▍         | 650/13473 [2:04:17<7:17:48,  2.05s/it]

{'eval_loss': 0.711936891078949, 'eval_runtime': 89.1323, 'eval_samples_per_second': 11.163, 'eval_steps_per_second': 5.587, 'epoch': 0.14}


  5%|▍         | 660/13473 [2:04:26<7:02:12,  1.98s/it]  

{'loss': 0.6395, 'grad_norm': 5.4742021560668945, 'learning_rate': 4.9383334618052876e-05, 'epoch': 0.15}


                                                       
  5%|▍         | 660/13473 [2:05:58<7:02:12,  1.98s/it]

{'eval_loss': 0.7179082632064819, 'eval_runtime': 91.9277, 'eval_samples_per_second': 10.824, 'eval_steps_per_second': 5.417, 'epoch': 0.15}


  5%|▍         | 670/13473 [2:06:08<7:11:32,  2.02s/it]  

{'loss': 0.8924, 'grad_norm': 3.238553762435913, 'learning_rate': 4.9344793031681183e-05, 'epoch': 0.15}


                                                       
  5%|▍         | 670/13473 [2:07:40<7:11:32,  2.02s/it]

{'eval_loss': 0.7252442240715027, 'eval_runtime': 91.7277, 'eval_samples_per_second': 10.847, 'eval_steps_per_second': 5.429, 'epoch': 0.15}


  5%|▌         | 680/13473 [2:07:49<7:13:50,  2.03s/it]  

{'loss': 1.0267, 'grad_norm': 4.061991214752197, 'learning_rate': 4.930625144530949e-05, 'epoch': 0.15}


                                                       
  5%|▌         | 680/13473 [2:09:22<7:13:50,  2.03s/it]

{'eval_loss': 0.7674462199211121, 'eval_runtime': 92.4945, 'eval_samples_per_second': 10.757, 'eval_steps_per_second': 5.384, 'epoch': 0.15}


  5%|▌         | 690/13473 [2:09:32<7:12:46,  2.03s/it]  

{'loss': 0.5789, 'grad_norm': 2.7130885124206543, 'learning_rate': 4.926770985893779e-05, 'epoch': 0.15}


                                                       
  5%|▌         | 690/13473 [2:11:06<7:12:46,  2.03s/it]

{'eval_loss': 0.7610204219818115, 'eval_runtime': 93.9183, 'eval_samples_per_second': 10.594, 'eval_steps_per_second': 5.302, 'epoch': 0.15}


  5%|▌         | 700/13473 [2:11:15<7:20:32,  2.07s/it]  

{'loss': 1.4544, 'grad_norm': 4.297976970672607, 'learning_rate': 4.92291682725661e-05, 'epoch': 0.16}


                                                       
  5%|▌         | 700/13473 [2:12:49<7:20:32,  2.07s/it]

{'eval_loss': 0.7120113372802734, 'eval_runtime': 93.3017, 'eval_samples_per_second': 10.664, 'eval_steps_per_second': 5.338, 'epoch': 0.16}


  5%|▌         | 710/13473 [2:13:10<12:02:37,  3.40s/it] 

{'loss': 0.6497, 'grad_norm': 1.2363989353179932, 'learning_rate': 4.919062668619441e-05, 'epoch': 0.16}


                                                        
  5%|▌         | 710/13473 [2:14:45<12:02:37,  3.40s/it]

{'eval_loss': 0.7100369334220886, 'eval_runtime': 94.9199, 'eval_samples_per_second': 10.483, 'eval_steps_per_second': 5.247, 'epoch': 0.16}


  5%|▌         | 720/13473 [2:14:55<7:40:08,  2.16s/it]  

{'loss': 0.5717, 'grad_norm': 2.2854833602905273, 'learning_rate': 4.915208509982271e-05, 'epoch': 0.16}


                                                       
  5%|▌         | 720/13473 [2:16:26<7:40:08,  2.16s/it]

{'eval_loss': 0.7251970171928406, 'eval_runtime': 91.0174, 'eval_samples_per_second': 10.932, 'eval_steps_per_second': 5.471, 'epoch': 0.16}


  5%|▌         | 730/13473 [2:16:36<7:11:01,  2.03s/it]  

{'loss': 0.5517, 'grad_norm': 1.8585456609725952, 'learning_rate': 4.911354351345102e-05, 'epoch': 0.16}


                                                       
  5%|▌         | 730/13473 [2:18:03<7:11:01,  2.03s/it]

{'eval_loss': 0.7292176485061646, 'eval_runtime': 87.0039, 'eval_samples_per_second': 11.436, 'eval_steps_per_second': 5.724, 'epoch': 0.16}


  5%|▌         | 740/13473 [2:18:12<6:58:37,  1.97s/it] 

{'loss': 0.754, 'grad_norm': 4.194638729095459, 'learning_rate': 4.9075001927079326e-05, 'epoch': 0.16}


                                                       
  5%|▌         | 740/13473 [2:19:41<6:58:37,  1.97s/it]

{'eval_loss': 0.7159916758537292, 'eval_runtime': 88.4022, 'eval_samples_per_second': 11.255, 'eval_steps_per_second': 5.633, 'epoch': 0.16}


  6%|▌         | 750/13473 [2:19:50<6:59:31,  1.98s/it]  

{'loss': 1.1148, 'grad_norm': 4.305179119110107, 'learning_rate': 4.903646034070763e-05, 'epoch': 0.17}


                                                       
  6%|▌         | 750/13473 [2:21:18<6:59:31,  1.98s/it]

{'eval_loss': 0.7319956421852112, 'eval_runtime': 88.2197, 'eval_samples_per_second': 11.279, 'eval_steps_per_second': 5.645, 'epoch': 0.17}


  6%|▌         | 760/13473 [2:21:28<6:48:56,  1.93s/it]  

{'loss': 0.6916, 'grad_norm': 0.8794369697570801, 'learning_rate': 4.899791875433593e-05, 'epoch': 0.17}


                                                       
  6%|▌         | 760/13473 [2:23:00<6:48:56,  1.93s/it]

{'eval_loss': 0.7851095199584961, 'eval_runtime': 92.6909, 'eval_samples_per_second': 10.735, 'eval_steps_per_second': 5.373, 'epoch': 0.17}


  6%|▌         | 770/13473 [2:23:17<8:20:21,  2.36s/it]  

{'loss': 0.8373, 'grad_norm': 1.721944808959961, 'learning_rate': 4.8959377167964236e-05, 'epoch': 0.17}


                                                       
  6%|▌         | 770/13473 [2:24:49<8:20:21,  2.36s/it]

{'eval_loss': 0.722147524356842, 'eval_runtime': 92.6019, 'eval_samples_per_second': 10.745, 'eval_steps_per_second': 5.378, 'epoch': 0.17}


  6%|▌         | 780/13473 [2:24:59<7:17:33,  2.07s/it]  

{'loss': 0.8105, 'grad_norm': 1.3992294073104858, 'learning_rate': 4.892083558159254e-05, 'epoch': 0.17}


                                                       
  6%|▌         | 780/13473 [2:26:32<7:17:33,  2.07s/it]

{'eval_loss': 0.724015474319458, 'eval_runtime': 93.1592, 'eval_samples_per_second': 10.681, 'eval_steps_per_second': 5.346, 'epoch': 0.17}


  6%|▌         | 790/13473 [2:26:41<7:00:48,  1.99s/it]  

{'loss': 0.53, 'grad_norm': 3.928380012512207, 'learning_rate': 4.8882293995220845e-05, 'epoch': 0.18}


                                                       
  6%|▌         | 790/13473 [2:28:06<7:00:48,  1.99s/it]

{'eval_loss': 0.7555516958236694, 'eval_runtime': 84.3414, 'eval_samples_per_second': 11.797, 'eval_steps_per_second': 5.905, 'epoch': 0.18}


  6%|▌         | 800/13473 [2:28:14<6:26:46,  1.83s/it] 

{'loss': 0.8185, 'grad_norm': 2.0838499069213867, 'learning_rate': 4.884375240884915e-05, 'epoch': 0.18}


                                                       
  6%|▌         | 800/13473 [2:29:39<6:26:46,  1.83s/it]

{'eval_loss': 0.7292628288269043, 'eval_runtime': 84.5121, 'eval_samples_per_second': 11.773, 'eval_steps_per_second': 5.893, 'epoch': 0.18}


  6%|▌         | 810/13473 [2:29:47<6:27:45,  1.84s/it] 

{'loss': 0.6927, 'grad_norm': 1.6489243507385254, 'learning_rate': 4.8805210822477454e-05, 'epoch': 0.18}


                                                       
  6%|▌         | 810/13473 [2:31:13<6:27:45,  1.84s/it]

{'eval_loss': 0.7278084754943848, 'eval_runtime': 85.3302, 'eval_samples_per_second': 11.661, 'eval_steps_per_second': 5.836, 'epoch': 0.18}


  6%|▌         | 820/13473 [2:31:21<6:28:00,  1.84s/it] 

{'loss': 0.9188, 'grad_norm': 0.880077600479126, 'learning_rate': 4.876666923610576e-05, 'epoch': 0.18}


                                                       
  6%|▌         | 820/13473 [2:32:54<6:28:00,  1.84s/it]

{'eval_loss': 0.7438677549362183, 'eval_runtime': 93.295, 'eval_samples_per_second': 10.665, 'eval_steps_per_second': 5.338, 'epoch': 0.18}


  6%|▌         | 830/13473 [2:33:03<6:49:15,  1.94s/it]  

{'loss': 0.6808, 'grad_norm': 1.3322120904922485, 'learning_rate': 4.872812764973407e-05, 'epoch': 0.18}


                                                       
  6%|▌         | 830/13473 [2:35:29<6:49:15,  1.94s/it]

{'eval_loss': 0.7135311365127563, 'eval_runtime': 146.4881, 'eval_samples_per_second': 6.792, 'eval_steps_per_second': 3.4, 'epoch': 0.18}


  6%|▌         | 840/13473 [2:35:38<8:51:33,  2.52s/it]  

{'loss': 0.8396, 'grad_norm': 2.4489147663116455, 'learning_rate': 4.868958606336237e-05, 'epoch': 0.19}


                                                       
  6%|▌         | 840/13473 [2:37:07<8:51:33,  2.52s/it]

{'eval_loss': 0.7200942635536194, 'eval_runtime': 88.725, 'eval_samples_per_second': 11.214, 'eval_steps_per_second': 5.613, 'epoch': 0.19}


  6%|▋         | 850/13473 [2:37:15<6:36:48,  1.89s/it]  

{'loss': 0.7305, 'grad_norm': 1.0077183246612549, 'learning_rate': 4.865104447699068e-05, 'epoch': 0.19}


                                                       
  6%|▋         | 850/13473 [2:38:43<6:36:48,  1.89s/it]

{'eval_loss': 0.7167109847068787, 'eval_runtime': 88.3656, 'eval_samples_per_second': 11.26, 'eval_steps_per_second': 5.636, 'epoch': 0.19}


  6%|▋         | 860/13473 [2:38:52<6:37:18,  1.89s/it] 

{'loss': 1.1066, 'grad_norm': 7.812808990478516, 'learning_rate': 4.861250289061898e-05, 'epoch': 0.19}


                                                       
  6%|▋         | 860/13473 [2:40:19<6:37:18,  1.89s/it]

{'eval_loss': 0.721595048904419, 'eval_runtime': 86.533, 'eval_samples_per_second': 11.499, 'eval_steps_per_second': 5.755, 'epoch': 0.19}


  6%|▋         | 870/13473 [2:40:27<6:28:39,  1.85s/it] 

{'loss': 1.0457, 'grad_norm': 5.049548149108887, 'learning_rate': 4.857396130424728e-05, 'epoch': 0.19}


                                                       
  6%|▋         | 870/13473 [2:41:50<6:28:39,  1.85s/it]

{'eval_loss': 0.7437730431556702, 'eval_runtime': 82.8283, 'eval_samples_per_second': 12.013, 'eval_steps_per_second': 6.012, 'epoch': 0.19}


  7%|▋         | 880/13473 [2:41:58<6:19:07,  1.81s/it] 

{'loss': 0.8232, 'grad_norm': 0.7144428491592407, 'learning_rate': 4.853541971787559e-05, 'epoch': 0.2}


                                                       
  7%|▋         | 880/13473 [2:43:26<6:19:07,  1.81s/it]

{'eval_loss': 0.758797824382782, 'eval_runtime': 88.0373, 'eval_samples_per_second': 11.302, 'eval_steps_per_second': 5.657, 'epoch': 0.2}


  7%|▋         | 890/13473 [2:43:34<6:27:55,  1.85s/it] 

{'loss': 0.5445, 'grad_norm': 3.9657552242279053, 'learning_rate': 4.849687813150389e-05, 'epoch': 0.2}


                                                       
  7%|▋         | 890/13473 [2:45:00<6:27:55,  1.85s/it]

{'eval_loss': 0.7753114700317383, 'eval_runtime': 85.3675, 'eval_samples_per_second': 11.655, 'eval_steps_per_second': 5.834, 'epoch': 0.2}


  7%|▋         | 900/13473 [2:45:08<6:25:04,  1.84s/it] 

{'loss': 1.495, 'grad_norm': 3.091947317123413, 'learning_rate': 4.84583365451322e-05, 'epoch': 0.2}


                                                       
  7%|▋         | 900/13473 [2:46:36<6:25:04,  1.84s/it]

{'eval_loss': 0.7498748898506165, 'eval_runtime': 87.5298, 'eval_samples_per_second': 11.368, 'eval_steps_per_second': 5.689, 'epoch': 0.2}


  7%|▋         | 910/13473 [2:46:44<6:28:51,  1.86s/it] 

{'loss': 0.9024, 'grad_norm': 1.2600520849227905, 'learning_rate': 4.841979495876051e-05, 'epoch': 0.2}


                                                       
  7%|▋         | 910/13473 [2:48:15<6:28:51,  1.86s/it]

{'eval_loss': 0.7094249129295349, 'eval_runtime': 90.9272, 'eval_samples_per_second': 10.943, 'eval_steps_per_second': 5.477, 'epoch': 0.2}


  7%|▋         | 920/13473 [2:48:26<6:48:26,  1.95s/it]  

{'loss': 1.0429, 'grad_norm': 2.2931437492370605, 'learning_rate': 4.838125337238881e-05, 'epoch': 0.2}


                                                       
  7%|▋         | 920/13473 [2:49:52<6:48:26,  1.95s/it]

{'eval_loss': 0.728596568107605, 'eval_runtime': 86.2633, 'eval_samples_per_second': 11.534, 'eval_steps_per_second': 5.773, 'epoch': 0.2}


  7%|▋         | 930/13473 [2:50:01<6:30:15,  1.87s/it] 

{'loss': 0.8121, 'grad_norm': 0.8550971746444702, 'learning_rate': 4.8342711786017116e-05, 'epoch': 0.21}


                                                       
  7%|▋         | 930/13473 [2:51:31<6:30:15,  1.87s/it]

{'eval_loss': 0.7305892109870911, 'eval_runtime': 89.4471, 'eval_samples_per_second': 11.124, 'eval_steps_per_second': 5.568, 'epoch': 0.21}


  7%|▋         | 940/13473 [2:51:39<6:37:29,  1.90s/it] 

{'loss': 0.5333, 'grad_norm': 0.8148048520088196, 'learning_rate': 4.8304170199645424e-05, 'epoch': 0.21}


                                                       
  7%|▋         | 940/13473 [2:53:07<6:37:29,  1.90s/it]

{'eval_loss': 0.8656340837478638, 'eval_runtime': 87.6336, 'eval_samples_per_second': 11.354, 'eval_steps_per_second': 5.683, 'epoch': 0.21}


  7%|▋         | 950/13473 [2:53:16<6:33:52,  1.89s/it] 

{'loss': 1.7056, 'grad_norm': 2.120008707046509, 'learning_rate': 4.8265628613273725e-05, 'epoch': 0.21}


                                                       
  7%|▋         | 950/13473 [2:54:38<6:33:52,  1.89s/it]

{'eval_loss': 0.7111411094665527, 'eval_runtime': 82.8374, 'eval_samples_per_second': 12.011, 'eval_steps_per_second': 6.012, 'epoch': 0.21}


  7%|▋         | 960/13473 [2:54:47<6:09:22,  1.77s/it] 

{'loss': 0.6287, 'grad_norm': 1.897599697113037, 'learning_rate': 4.8227087026902026e-05, 'epoch': 0.21}


                                                       
  7%|▋         | 960/13473 [2:56:12<6:09:22,  1.77s/it]

{'eval_loss': 0.7159776091575623, 'eval_runtime': 85.3502, 'eval_samples_per_second': 11.658, 'eval_steps_per_second': 5.835, 'epoch': 0.21}


  7%|▋         | 970/13473 [2:56:21<6:21:07,  1.83s/it] 

{'loss': 1.0025, 'grad_norm': 1.7923847436904907, 'learning_rate': 4.8188545440530334e-05, 'epoch': 0.22}


                                                       
  7%|▋         | 970/13473 [2:57:58<6:21:07,  1.83s/it]

{'eval_loss': 0.7256544232368469, 'eval_runtime': 97.0997, 'eval_samples_per_second': 10.247, 'eval_steps_per_second': 5.129, 'epoch': 0.22}


  7%|▋         | 980/13473 [2:58:06<7:00:39,  2.02s/it]  

{'loss': 0.6522, 'grad_norm': 1.9486404657363892, 'learning_rate': 4.8150003854158635e-05, 'epoch': 0.22}


                                                       
  7%|▋         | 980/13473 [2:59:35<7:00:39,  2.02s/it]

{'eval_loss': 0.71314936876297, 'eval_runtime': 88.9224, 'eval_samples_per_second': 11.19, 'eval_steps_per_second': 5.6, 'epoch': 0.22}


  7%|▋         | 990/13473 [2:59:44<6:44:41,  1.95s/it] 

{'loss': 0.7985, 'grad_norm': 0.802541196346283, 'learning_rate': 4.811146226778694e-05, 'epoch': 0.22}


                                                       
  7%|▋         | 990/13473 [3:01:18<6:44:41,  1.95s/it]

{'eval_loss': 0.7152471542358398, 'eval_runtime': 93.9581, 'eval_samples_per_second': 10.59, 'eval_steps_per_second': 5.3, 'epoch': 0.22}


  7%|▋         | 1000/13473 [3:01:27<6:52:00,  1.98s/it] 

{'loss': 0.7686, 'grad_norm': 3.20576810836792, 'learning_rate': 4.807292068141525e-05, 'epoch': 0.22}


                                                        
  7%|▋         | 1000/13473 [3:03:00<6:52:00,  1.98s/it]

{'eval_loss': 0.7395420074462891, 'eval_runtime': 93.4626, 'eval_samples_per_second': 10.646, 'eval_steps_per_second': 5.328, 'epoch': 0.22}


  7%|▋         | 1010/13473 [3:03:10<6:56:03,  2.00s/it]  

{'loss': 0.777, 'grad_norm': 3.6957592964172363, 'learning_rate': 4.803437909504355e-05, 'epoch': 0.22}


                                                        
  7%|▋         | 1010/13473 [3:04:39<6:56:03,  2.00s/it]

{'eval_loss': 0.7145246267318726, 'eval_runtime': 88.7262, 'eval_samples_per_second': 11.214, 'eval_steps_per_second': 5.613, 'epoch': 0.22}


  8%|▊         | 1020/13473 [3:04:48<6:41:19,  1.93s/it] 

{'loss': 0.87, 'grad_norm': 1.6647191047668457, 'learning_rate': 4.799583750867186e-05, 'epoch': 0.23}


                                                        
  8%|▊         | 1020/13473 [3:06:18<6:41:19,  1.93s/it]

{'eval_loss': 0.7141407132148743, 'eval_runtime': 90.7013, 'eval_samples_per_second': 10.97, 'eval_steps_per_second': 5.491, 'epoch': 0.23}


  8%|▊         | 1030/13473 [3:06:27<6:43:27,  1.95s/it]  

{'loss': 0.6097, 'grad_norm': 1.3284659385681152, 'learning_rate': 4.795729592230017e-05, 'epoch': 0.23}


                                                        
  8%|▊         | 1030/13473 [3:07:55<6:43:27,  1.95s/it]

{'eval_loss': 0.7902750968933105, 'eval_runtime': 87.5761, 'eval_samples_per_second': 11.362, 'eval_steps_per_second': 5.686, 'epoch': 0.23}


  8%|▊         | 1040/13473 [3:08:03<6:33:43,  1.90s/it] 

{'loss': 0.7995, 'grad_norm': 0.5131694078445435, 'learning_rate': 4.791875433592847e-05, 'epoch': 0.23}


                                                        
  8%|▊         | 1040/13473 [3:09:34<6:33:43,  1.90s/it]

{'eval_loss': 0.7171759009361267, 'eval_runtime': 90.2286, 'eval_samples_per_second': 11.028, 'eval_steps_per_second': 5.519, 'epoch': 0.23}


  8%|▊         | 1050/13473 [3:09:42<6:38:33,  1.92s/it] 

{'loss': 0.5395, 'grad_norm': 0.6878935694694519, 'learning_rate': 4.788021274955678e-05, 'epoch': 0.23}


                                                        
  8%|▊         | 1050/13473 [3:11:14<6:38:33,  1.92s/it]

{'eval_loss': 0.720122218132019, 'eval_runtime': 91.4219, 'eval_samples_per_second': 10.884, 'eval_steps_per_second': 5.447, 'epoch': 0.23}


  8%|▊         | 1060/13473 [3:11:22<6:38:06,  1.92s/it]  

{'loss': 0.6505, 'grad_norm': 1.657626748085022, 'learning_rate': 4.784167116318508e-05, 'epoch': 0.24}


                                                        
  8%|▊         | 1060/13473 [3:12:56<6:38:06,  1.92s/it]

{'eval_loss': 0.7295893430709839, 'eval_runtime': 93.1687, 'eval_samples_per_second': 10.68, 'eval_steps_per_second': 5.345, 'epoch': 0.24}


  8%|▊         | 1070/13473 [3:13:05<6:52:18,  1.99s/it]  

{'loss': 1.0507, 'grad_norm': 4.237522125244141, 'learning_rate': 4.780312957681338e-05, 'epoch': 0.24}


                                                        
  8%|▊         | 1070/13473 [3:14:37<6:52:18,  1.99s/it]

{'eval_loss': 0.7144214510917664, 'eval_runtime': 91.8846, 'eval_samples_per_second': 10.829, 'eval_steps_per_second': 5.42, 'epoch': 0.24}


  8%|▊         | 1080/13473 [3:14:45<6:42:47,  1.95s/it]  

{'loss': 0.6948, 'grad_norm': 4.188687324523926, 'learning_rate': 4.776458799044169e-05, 'epoch': 0.24}


                                                        
  8%|▊         | 1080/13473 [3:16:18<6:42:47,  1.95s/it]

{'eval_loss': 0.7413955926895142, 'eval_runtime': 92.5134, 'eval_samples_per_second': 10.755, 'eval_steps_per_second': 5.383, 'epoch': 0.24}


  8%|▊         | 1090/13473 [3:16:27<6:42:35,  1.95s/it]  

{'loss': 0.5714, 'grad_norm': 0.6747583746910095, 'learning_rate': 4.7726046404069996e-05, 'epoch': 0.24}


                                                        
  8%|▊         | 1090/13473 [3:17:58<6:42:35,  1.95s/it]

{'eval_loss': 0.7088626623153687, 'eval_runtime': 91.919, 'eval_samples_per_second': 10.825, 'eval_steps_per_second': 5.418, 'epoch': 0.24}


  8%|▊         | 1100/13473 [3:18:07<6:45:26,  1.97s/it]  

{'loss': 0.5107, 'grad_norm': 1.2320609092712402, 'learning_rate': 4.76875048176983e-05, 'epoch': 0.24}


                                                        
  8%|▊         | 1100/13473 [3:19:39<6:45:26,  1.97s/it]

{'eval_loss': 0.7264747619628906, 'eval_runtime': 91.9507, 'eval_samples_per_second': 10.821, 'eval_steps_per_second': 5.416, 'epoch': 0.24}


  8%|▊         | 1110/13473 [3:19:48<6:41:25,  1.95s/it]  

{'loss': 0.9342, 'grad_norm': 2.1185920238494873, 'learning_rate': 4.7648963231326605e-05, 'epoch': 0.25}


                                                        
  8%|▊         | 1110/13473 [3:21:19<6:41:25,  1.95s/it]

{'eval_loss': 0.7119067311286926, 'eval_runtime': 90.9688, 'eval_samples_per_second': 10.938, 'eval_steps_per_second': 5.474, 'epoch': 0.25}


  8%|▊         | 1120/13473 [3:21:28<6:34:45,  1.92s/it]  

{'loss': 0.6871, 'grad_norm': 2.2307839393615723, 'learning_rate': 4.761042164495491e-05, 'epoch': 0.25}


                                                        
  8%|▊         | 1120/13473 [3:23:16<6:34:45,  1.92s/it]

{'eval_loss': 0.7112356424331665, 'eval_runtime': 108.558, 'eval_samples_per_second': 9.166, 'eval_steps_per_second': 4.587, 'epoch': 0.25}


  8%|▊         | 1130/13473 [3:23:25<7:19:46,  2.14s/it]  

{'loss': 0.7023, 'grad_norm': 0.669330894947052, 'learning_rate': 4.7571880058583214e-05, 'epoch': 0.25}


                                                        
  8%|▊         | 1130/13473 [3:24:46<7:19:46,  2.14s/it]

{'eval_loss': 0.746435821056366, 'eval_runtime': 81.2977, 'eval_samples_per_second': 12.239, 'eval_steps_per_second': 6.126, 'epoch': 0.25}


  8%|▊         | 1140/13473 [3:24:55<6:10:25,  1.80s/it] 

{'loss': 1.0308, 'grad_norm': 3.494601249694824, 'learning_rate': 4.753333847221152e-05, 'epoch': 0.25}


                                                        
  8%|▊         | 1140/13473 [3:26:17<6:10:25,  1.80s/it]

{'eval_loss': 0.7069201469421387, 'eval_runtime': 82.4286, 'eval_samples_per_second': 12.071, 'eval_steps_per_second': 6.042, 'epoch': 0.25}


  9%|▊         | 1150/13473 [3:26:26<6:05:16,  1.78s/it] 

{'loss': 0.5192, 'grad_norm': 0.6412769556045532, 'learning_rate': 4.749479688583982e-05, 'epoch': 0.26}


                                                        
  9%|▊         | 1150/13473 [3:27:48<6:05:16,  1.78s/it]

{'eval_loss': 0.7108064889907837, 'eval_runtime': 81.7906, 'eval_samples_per_second': 12.165, 'eval_steps_per_second': 6.089, 'epoch': 0.26}


  9%|▊         | 1160/13473 [3:27:56<6:12:21,  1.81s/it] 

{'loss': 0.4507, 'grad_norm': 0.4320884644985199, 'learning_rate': 4.7456255299468124e-05, 'epoch': 0.26}


                                                        
  9%|▊         | 1160/13473 [3:29:19<6:12:21,  1.81s/it]

{'eval_loss': 0.756320059299469, 'eval_runtime': 83.0232, 'eval_samples_per_second': 11.985, 'eval_steps_per_second': 5.998, 'epoch': 0.26}


  9%|▊         | 1170/13473 [3:29:28<6:08:02,  1.79s/it] 

{'loss': 0.679, 'grad_norm': 0.5039994716644287, 'learning_rate': 4.741771371309643e-05, 'epoch': 0.26}


                                                        
  9%|▊         | 1170/13473 [3:30:51<6:08:02,  1.79s/it]

{'eval_loss': 0.7348077893257141, 'eval_runtime': 82.7635, 'eval_samples_per_second': 12.022, 'eval_steps_per_second': 6.017, 'epoch': 0.26}


  9%|▉         | 1180/13473 [3:30:59<6:04:02,  1.78s/it] 

{'loss': 0.6526, 'grad_norm': 1.564716100692749, 'learning_rate': 4.7379172126724733e-05, 'epoch': 0.26}


                                                        
  9%|▉         | 1180/13473 [3:32:20<6:04:02,  1.78s/it]

{'eval_loss': 0.725050151348114, 'eval_runtime': 81.2381, 'eval_samples_per_second': 12.248, 'eval_steps_per_second': 6.13, 'epoch': 0.26}


  9%|▉         | 1190/13473 [3:32:29<6:09:26,  1.80s/it] 

{'loss': 0.8764, 'grad_norm': 0.9503040313720703, 'learning_rate': 4.734063054035304e-05, 'epoch': 0.26}


                                                        
  9%|▉         | 1190/13473 [3:33:53<6:09:26,  1.80s/it]

{'eval_loss': 0.7207709550857544, 'eval_runtime': 84.2146, 'eval_samples_per_second': 11.815, 'eval_steps_per_second': 5.913, 'epoch': 0.26}


  9%|▉         | 1200/13473 [3:34:02<6:08:50,  1.80s/it] 

{'loss': 0.4367, 'grad_norm': 0.7702710032463074, 'learning_rate': 4.730208895398135e-05, 'epoch': 0.27}


                                                        
  9%|▉         | 1200/13473 [3:35:24<6:08:50,  1.80s/it]

{'eval_loss': 0.7302140593528748, 'eval_runtime': 81.572, 'eval_samples_per_second': 12.198, 'eval_steps_per_second': 6.105, 'epoch': 0.27}


  9%|▉         | 1210/13473 [3:35:32<5:59:51,  1.76s/it] 

{'loss': 0.5525, 'grad_norm': 0.8278048038482666, 'learning_rate': 4.726354736760965e-05, 'epoch': 0.27}


                                                        
  9%|▉         | 1210/13473 [3:36:53<5:59:51,  1.76s/it]

{'eval_loss': 0.7407797574996948, 'eval_runtime': 80.6214, 'eval_samples_per_second': 12.342, 'eval_steps_per_second': 6.177, 'epoch': 0.27}


  9%|▉         | 1220/13473 [3:37:01<5:57:09,  1.75s/it] 

{'loss': 0.7781, 'grad_norm': 1.236138939857483, 'learning_rate': 4.722500578123796e-05, 'epoch': 0.27}


                                                        
  9%|▉         | 1220/13473 [3:38:20<5:57:09,  1.75s/it]

{'eval_loss': 0.7109213471412659, 'eval_runtime': 78.8426, 'eval_samples_per_second': 12.62, 'eval_steps_per_second': 6.316, 'epoch': 0.27}


  9%|▉         | 1230/13473 [3:38:28<5:54:57,  1.74s/it] 

{'loss': 0.4521, 'grad_norm': 0.8522307872772217, 'learning_rate': 4.7186464194866266e-05, 'epoch': 0.27}


                                                        
  9%|▉         | 1230/13473 [3:39:50<5:54:57,  1.74s/it]

{'eval_loss': 0.7211979031562805, 'eval_runtime': 82.0609, 'eval_samples_per_second': 12.125, 'eval_steps_per_second': 6.069, 'epoch': 0.27}


  9%|▉         | 1240/13473 [3:39:59<6:13:58,  1.83s/it] 

{'loss': 0.621, 'grad_norm': 2.312032699584961, 'learning_rate': 4.714792260849457e-05, 'epoch': 0.28}


                                                        
  9%|▉         | 1240/13473 [3:41:20<6:13:58,  1.83s/it]

{'eval_loss': 0.7400104999542236, 'eval_runtime': 80.6977, 'eval_samples_per_second': 12.33, 'eval_steps_per_second': 6.171, 'epoch': 0.28}


  9%|▉         | 1250/13473 [3:41:28<6:03:27,  1.78s/it] 

{'loss': 0.5381, 'grad_norm': 2.1786952018737793, 'learning_rate': 4.7109381022122876e-05, 'epoch': 0.28}


                                                        
  9%|▉         | 1250/13473 [3:42:49<6:03:27,  1.78s/it]

{'eval_loss': 0.7329460382461548, 'eval_runtime': 80.3642, 'eval_samples_per_second': 12.381, 'eval_steps_per_second': 6.197, 'epoch': 0.28}


  9%|▉         | 1260/13473 [3:42:57<5:56:17,  1.75s/it] 

{'loss': 0.8535, 'grad_norm': 1.7678651809692383, 'learning_rate': 4.707083943575118e-05, 'epoch': 0.28}


                                                        
  9%|▉         | 1260/13473 [3:44:19<5:56:17,  1.75s/it]

{'eval_loss': 0.7214676141738892, 'eval_runtime': 81.9713, 'eval_samples_per_second': 12.138, 'eval_steps_per_second': 6.075, 'epoch': 0.28}


  9%|▉         | 1270/13473 [3:44:28<6:03:49,  1.79s/it] 

{'loss': 0.6672, 'grad_norm': 3.21195125579834, 'learning_rate': 4.703229784937948e-05, 'epoch': 0.28}


                                                        
  9%|▉         | 1270/13473 [3:45:50<6:03:49,  1.79s/it]

{'eval_loss': 0.7373894453048706, 'eval_runtime': 81.546, 'eval_samples_per_second': 12.202, 'eval_steps_per_second': 6.107, 'epoch': 0.28}


 10%|▉         | 1280/13473 [3:45:59<6:06:57,  1.81s/it] 

{'loss': 0.5585, 'grad_norm': 0.8366872072219849, 'learning_rate': 4.6993756263007786e-05, 'epoch': 0.29}


                                                        
 10%|▉         | 1280/13473 [3:47:21<6:06:57,  1.81s/it]

{'eval_loss': 0.7170621156692505, 'eval_runtime': 82.0867, 'eval_samples_per_second': 12.121, 'eval_steps_per_second': 6.067, 'epoch': 0.29}


 10%|▉         | 1290/13473 [3:47:30<6:10:06,  1.82s/it] 

{'loss': 0.4435, 'grad_norm': 0.5393922328948975, 'learning_rate': 4.6955214676636094e-05, 'epoch': 0.29}


                                                        
 10%|▉         | 1290/13473 [3:48:53<6:10:06,  1.82s/it]

{'eval_loss': 0.7715474367141724, 'eval_runtime': 83.0957, 'eval_samples_per_second': 11.974, 'eval_steps_per_second': 5.993, 'epoch': 0.29}


 10%|▉         | 1300/13473 [3:49:01<6:07:41,  1.81s/it] 

{'loss': 0.8779, 'grad_norm': 4.017683506011963, 'learning_rate': 4.6916673090264395e-05, 'epoch': 0.29}


                                                        
 10%|▉         | 1300/13473 [3:50:24<6:07:41,  1.81s/it]

{'eval_loss': 0.7722898125648499, 'eval_runtime': 82.5834, 'eval_samples_per_second': 12.048, 'eval_steps_per_second': 6.03, 'epoch': 0.29}


 10%|▉         | 1310/13473 [3:50:33<6:19:32,  1.87s/it] 

{'loss': 0.5088, 'grad_norm': 0.9395571947097778, 'learning_rate': 4.68781315038927e-05, 'epoch': 0.29}


                                                        
 10%|▉         | 1310/13473 [3:51:56<6:19:32,  1.87s/it]

{'eval_loss': 0.7306893467903137, 'eval_runtime': 82.7167, 'eval_samples_per_second': 12.029, 'eval_steps_per_second': 6.021, 'epoch': 0.29}


 10%|▉         | 1320/13473 [3:52:04<6:01:33,  1.79s/it] 

{'loss': 0.9153, 'grad_norm': 1.65311861038208, 'learning_rate': 4.683958991752101e-05, 'epoch': 0.29}


                                                        
 10%|▉         | 1320/13473 [3:53:28<6:01:33,  1.79s/it]

{'eval_loss': 0.7105292081832886, 'eval_runtime': 84.4662, 'eval_samples_per_second': 11.78, 'eval_steps_per_second': 5.896, 'epoch': 0.29}


 10%|▉         | 1330/13473 [3:53:37<6:13:26,  1.85s/it] 

{'loss': 0.5082, 'grad_norm': 1.3201704025268555, 'learning_rate': 4.680104833114931e-05, 'epoch': 0.3}


                                                        
 10%|▉         | 1330/13473 [3:55:00<6:13:26,  1.85s/it]

{'eval_loss': 0.7184027433395386, 'eval_runtime': 83.0995, 'eval_samples_per_second': 11.974, 'eval_steps_per_second': 5.993, 'epoch': 0.3}


 10%|▉         | 1340/13473 [3:55:09<6:07:44,  1.82s/it] 

{'loss': 0.7118, 'grad_norm': 0.4114711582660675, 'learning_rate': 4.676250674477762e-05, 'epoch': 0.3}


                                                        
 10%|▉         | 1340/13473 [3:56:29<6:07:44,  1.82s/it]

{'eval_loss': 0.7399001717567444, 'eval_runtime': 80.1972, 'eval_samples_per_second': 12.407, 'eval_steps_per_second': 6.21, 'epoch': 0.3}


 10%|█         | 1350/13473 [3:56:38<5:59:42,  1.78s/it] 

{'loss': 1.1318, 'grad_norm': 2.1328115463256836, 'learning_rate': 4.672396515840592e-05, 'epoch': 0.3}


                                                        
 10%|█         | 1350/13473 [3:58:01<5:59:42,  1.78s/it]

{'eval_loss': 0.7183504104614258, 'eval_runtime': 83.2492, 'eval_samples_per_second': 11.952, 'eval_steps_per_second': 5.982, 'epoch': 0.3}


 10%|█         | 1360/13473 [3:58:10<6:09:35,  1.83s/it] 

{'loss': 0.4897, 'grad_norm': 0.3622509241104126, 'learning_rate': 4.668542357203423e-05, 'epoch': 0.3}


                                                        
 10%|█         | 1360/13473 [3:59:35<6:09:35,  1.83s/it]

{'eval_loss': 0.7500921487808228, 'eval_runtime': 84.4374, 'eval_samples_per_second': 11.784, 'eval_steps_per_second': 5.898, 'epoch': 0.3}


 10%|█         | 1370/13473 [3:59:43<6:05:44,  1.81s/it] 

{'loss': 0.4595, 'grad_norm': 0.3629562556743622, 'learning_rate': 4.664688198566253e-05, 'epoch': 0.31}


                                                        
 10%|█         | 1370/13473 [4:01:06<6:05:44,  1.81s/it]

{'eval_loss': 0.7506008148193359, 'eval_runtime': 83.4075, 'eval_samples_per_second': 11.929, 'eval_steps_per_second': 5.971, 'epoch': 0.31}


 10%|█         | 1380/13473 [4:01:15<6:08:32,  1.83s/it] 

{'loss': 0.6358, 'grad_norm': 0.606879711151123, 'learning_rate': 4.660834039929084e-05, 'epoch': 0.31}


                                                        
 10%|█         | 1380/13473 [4:02:45<6:08:32,  1.83s/it]

{'eval_loss': 0.7199923992156982, 'eval_runtime': 89.9126, 'eval_samples_per_second': 11.066, 'eval_steps_per_second': 5.539, 'epoch': 0.31}


 10%|█         | 1390/13473 [4:02:54<6:14:39,  1.86s/it] 

{'loss': 0.9896, 'grad_norm': 1.5137988328933716, 'learning_rate': 4.656979881291914e-05, 'epoch': 0.31}


                                                        
 10%|█         | 1390/13473 [4:04:21<6:14:39,  1.86s/it]

{'eval_loss': 0.7108912467956543, 'eval_runtime': 87.1432, 'eval_samples_per_second': 11.418, 'eval_steps_per_second': 5.715, 'epoch': 0.31}


 10%|█         | 1400/13473 [4:04:30<6:37:08,  1.97s/it] 

{'loss': 0.9675, 'grad_norm': 4.645201683044434, 'learning_rate': 4.653125722654745e-05, 'epoch': 0.31}


                                                        
 10%|█         | 1400/13473 [4:06:05<6:37:08,  1.97s/it]

{'eval_loss': 0.7299368381500244, 'eval_runtime': 94.8045, 'eval_samples_per_second': 10.495, 'eval_steps_per_second': 5.253, 'epoch': 0.31}


 10%|█         | 1410/13473 [4:06:13<6:37:26,  1.98s/it]  

{'loss': 0.6627, 'grad_norm': 0.9072343707084656, 'learning_rate': 4.649271564017575e-05, 'epoch': 0.31}


                                                        
 10%|█         | 1410/13473 [4:08:45<6:37:26,  1.98s/it]

{'eval_loss': 0.7048231959342957, 'eval_runtime': 151.433, 'eval_samples_per_second': 6.571, 'eval_steps_per_second': 3.289, 'epoch': 0.31}


 11%|█         | 1420/13473 [4:09:04<11:14:02,  3.36s/it] 

{'loss': 0.6724, 'grad_norm': 1.2361435890197754, 'learning_rate': 4.645417405380406e-05, 'epoch': 0.32}


                                                         
 11%|█         | 1420/13473 [4:10:46<11:14:02,  3.36s/it]

{'eval_loss': 0.7077056169509888, 'eval_runtime': 101.2813, 'eval_samples_per_second': 9.824, 'eval_steps_per_second': 4.917, 'epoch': 0.32}


 11%|█         | 1430/13473 [4:10:54<6:42:35,  2.01s/it]  

{'loss': 0.3859, 'grad_norm': 0.6448909640312195, 'learning_rate': 4.6415632467432365e-05, 'epoch': 0.32}


                                                        
 11%|█         | 1430/13473 [4:12:19<6:42:35,  2.01s/it]

{'eval_loss': 0.7550995349884033, 'eval_runtime': 84.9505, 'eval_samples_per_second': 11.713, 'eval_steps_per_second': 5.862, 'epoch': 0.32}


 11%|█         | 1440/13473 [4:12:27<6:08:43,  1.84s/it] 

{'loss': 0.7993, 'grad_norm': 0.767679750919342, 'learning_rate': 4.6377090881060666e-05, 'epoch': 0.32}


                                                        
 11%|█         | 1440/13473 [4:14:03<6:08:43,  1.84s/it]

{'eval_loss': 0.7249862551689148, 'eval_runtime': 96.2959, 'eval_samples_per_second': 10.333, 'eval_steps_per_second': 5.172, 'epoch': 0.32}


 11%|█         | 1450/13473 [4:14:12<6:30:38,  1.95s/it]  

{'loss': 0.509, 'grad_norm': 0.41060855984687805, 'learning_rate': 4.6338549294688974e-05, 'epoch': 0.32}


                                                        
 11%|█         | 1450/13473 [4:15:41<6:30:38,  1.95s/it]

{'eval_loss': 0.7145369052886963, 'eval_runtime': 89.558, 'eval_samples_per_second': 11.11, 'eval_steps_per_second': 5.561, 'epoch': 0.32}


 11%|█         | 1460/13473 [4:15:50<6:22:01,  1.91s/it] 

{'loss': 0.7676, 'grad_norm': 1.006548285484314, 'learning_rate': 4.6300007708317275e-05, 'epoch': 0.33}


                                                        
 11%|█         | 1460/13473 [4:17:22<6:22:01,  1.91s/it]

{'eval_loss': 0.7089418768882751, 'eval_runtime': 92.4772, 'eval_samples_per_second': 10.759, 'eval_steps_per_second': 5.385, 'epoch': 0.33}


 11%|█         | 1470/13473 [4:17:31<6:29:57,  1.95s/it] 

{'loss': 0.7172, 'grad_norm': 1.3715665340423584, 'learning_rate': 4.6261466121945576e-05, 'epoch': 0.33}


                                                        
 11%|█         | 1470/13473 [4:19:07<6:29:57,  1.95s/it]

{'eval_loss': 0.71302729845047, 'eval_runtime': 95.8815, 'eval_samples_per_second': 10.377, 'eval_steps_per_second': 5.194, 'epoch': 0.33}


 11%|█         | 1480/13473 [4:19:16<6:53:06,  2.07s/it]  

{'loss': 0.462, 'grad_norm': 0.41759929060935974, 'learning_rate': 4.6222924535573884e-05, 'epoch': 0.33}


                                                        
 11%|█         | 1480/13473 [4:20:54<6:53:06,  2.07s/it]

{'eval_loss': 0.7832806706428528, 'eval_runtime': 97.5792, 'eval_samples_per_second': 10.197, 'eval_steps_per_second': 5.104, 'epoch': 0.33}


 11%|█         | 1490/13473 [4:21:02<6:36:52,  1.99s/it]  

{'loss': 1.5848, 'grad_norm': 1.7653383016586304, 'learning_rate': 4.618438294920219e-05, 'epoch': 0.33}


                                                        
 11%|█         | 1490/13473 [4:22:33<6:36:52,  1.99s/it]

{'eval_loss': 0.7533369064331055, 'eval_runtime': 90.9066, 'eval_samples_per_second': 10.945, 'eval_steps_per_second': 5.478, 'epoch': 0.33}


 11%|█         | 1500/13473 [4:22:42<6:23:14,  1.92s/it] 

{'loss': 0.6194, 'grad_norm': 0.9157181978225708, 'learning_rate': 4.614584136283049e-05, 'epoch': 0.33}


                                                        
 11%|█         | 1500/13473 [4:24:17<6:23:14,  1.92s/it]

{'eval_loss': 0.7068504095077515, 'eval_runtime': 95.0619, 'eval_samples_per_second': 10.467, 'eval_steps_per_second': 5.239, 'epoch': 0.33}


 11%|█         | 1510/13473 [4:24:27<6:41:44,  2.01s/it]  

{'loss': 0.55, 'grad_norm': 2.123558759689331, 'learning_rate': 4.61072997764588e-05, 'epoch': 0.34}


                                                        
 11%|█         | 1510/13473 [4:25:57<6:41:44,  2.01s/it]

{'eval_loss': 0.723517119884491, 'eval_runtime': 90.417, 'eval_samples_per_second': 11.005, 'eval_steps_per_second': 5.508, 'epoch': 0.34}


 11%|█▏        | 1520/13473 [4:26:06<6:26:24,  1.94s/it] 

{'loss': 1.1164, 'grad_norm': 4.1391754150390625, 'learning_rate': 4.606875819008711e-05, 'epoch': 0.34}


                                                        
 11%|█▏        | 1520/13473 [4:27:31<6:26:24,  1.94s/it]

{'eval_loss': 0.7070767879486084, 'eval_runtime': 85.5945, 'eval_samples_per_second': 11.625, 'eval_steps_per_second': 5.818, 'epoch': 0.34}


 11%|█▏        | 1530/13473 [4:27:40<6:06:16,  1.84s/it] 

{'loss': 0.7301, 'grad_norm': 1.5580425262451172, 'learning_rate': 4.603021660371541e-05, 'epoch': 0.34}


                                                        
 11%|█▏        | 1530/13473 [4:29:09<6:06:16,  1.84s/it]

{'eval_loss': 0.7037699222564697, 'eval_runtime': 88.9814, 'eval_samples_per_second': 11.182, 'eval_steps_per_second': 5.597, 'epoch': 0.34}


 11%|█▏        | 1540/13473 [4:29:17<6:16:03,  1.89s/it] 

{'loss': 0.4072, 'grad_norm': 0.7893063426017761, 'learning_rate': 4.599167501734372e-05, 'epoch': 0.34}


                                                        
 11%|█▏        | 1540/13473 [4:30:51<6:16:03,  1.89s/it]

{'eval_loss': 0.7305781245231628, 'eval_runtime': 93.7893, 'eval_samples_per_second': 10.609, 'eval_steps_per_second': 5.31, 'epoch': 0.34}


 12%|█▏        | 1550/13473 [4:31:00<6:26:17,  1.94s/it] 

{'loss': 1.0707, 'grad_norm': 1.0980217456817627, 'learning_rate': 4.5953133430972026e-05, 'epoch': 0.35}


                                                        
 12%|█▏        | 1550/13473 [4:32:30<6:26:17,  1.94s/it]

{'eval_loss': 0.7244377136230469, 'eval_runtime': 89.8316, 'eval_samples_per_second': 11.076, 'eval_steps_per_second': 5.544, 'epoch': 0.35}


 12%|█▏        | 1560/13473 [4:32:38<6:27:54,  1.95s/it] 

{'loss': 0.5881, 'grad_norm': 1.5263744592666626, 'learning_rate': 4.591459184460033e-05, 'epoch': 0.35}


                                                        
 12%|█▏        | 1560/13473 [4:34:00<6:27:54,  1.95s/it]

{'eval_loss': 0.7053008675575256, 'eval_runtime': 82.09, 'eval_samples_per_second': 12.121, 'eval_steps_per_second': 6.067, 'epoch': 0.35}


 12%|█▏        | 1570/13473 [4:34:09<5:57:36,  1.80s/it] 

{'loss': 0.8427, 'grad_norm': 3.6361703872680664, 'learning_rate': 4.587605025822863e-05, 'epoch': 0.35}


                                                        
 12%|█▏        | 1570/13473 [4:35:38<5:57:36,  1.80s/it]

{'eval_loss': 0.7211704850196838, 'eval_runtime': 89.5285, 'eval_samples_per_second': 11.114, 'eval_steps_per_second': 5.562, 'epoch': 0.35}


 12%|█▏        | 1580/13473 [4:35:47<6:09:27,  1.86s/it] 

{'loss': 0.3806, 'grad_norm': 0.6039679050445557, 'learning_rate': 4.5837508671856937e-05, 'epoch': 0.35}


                                                        
 12%|█▏        | 1580/13473 [4:37:18<6:09:27,  1.86s/it]

{'eval_loss': 0.7436638474464417, 'eval_runtime': 90.6979, 'eval_samples_per_second': 10.97, 'eval_steps_per_second': 5.491, 'epoch': 0.35}


 12%|█▏        | 1590/13473 [4:37:26<6:12:50,  1.88s/it] 

{'loss': 0.8103, 'grad_norm': 4.055576324462891, 'learning_rate': 4.579896708548524e-05, 'epoch': 0.35}


                                                        
 12%|█▏        | 1590/13473 [4:39:01<6:12:50,  1.88s/it]

{'eval_loss': 0.7353806495666504, 'eval_runtime': 94.6701, 'eval_samples_per_second': 10.51, 'eval_steps_per_second': 5.26, 'epoch': 0.35}


 12%|█▏        | 1600/13473 [4:39:09<6:39:30,  2.02s/it] 

{'loss': 0.8981, 'grad_norm': 1.2090704441070557, 'learning_rate': 4.5760425499113546e-05, 'epoch': 0.36}


                                                        
 12%|█▏        | 1600/13473 [4:40:31<6:39:30,  2.02s/it]

{'eval_loss': 0.7173966765403748, 'eval_runtime': 81.2175, 'eval_samples_per_second': 12.251, 'eval_steps_per_second': 6.132, 'epoch': 0.36}


 12%|█▏        | 1610/13473 [4:40:39<5:51:34,  1.78s/it] 

{'loss': 0.4579, 'grad_norm': 0.7050515413284302, 'learning_rate': 4.5721883912741854e-05, 'epoch': 0.36}


                                                        
 12%|█▏        | 1610/13473 [4:42:00<5:51:34,  1.78s/it]

{'eval_loss': 0.7213606834411621, 'eval_runtime': 81.4101, 'eval_samples_per_second': 12.222, 'eval_steps_per_second': 6.117, 'epoch': 0.36}


 12%|█▏        | 1620/13473 [4:42:09<5:53:14,  1.79s/it] 

{'loss': 0.7639, 'grad_norm': 0.6528768539428711, 'learning_rate': 4.5683342326370155e-05, 'epoch': 0.36}


                                                        
 12%|█▏        | 1620/13473 [4:43:39<5:53:14,  1.79s/it]

{'eval_loss': 0.7480626106262207, 'eval_runtime': 89.4939, 'eval_samples_per_second': 11.118, 'eval_steps_per_second': 5.565, 'epoch': 0.36}


 12%|█▏        | 1630/13473 [4:43:47<6:12:59,  1.89s/it] 

{'loss': 0.8453, 'grad_norm': 1.174096703529358, 'learning_rate': 4.564480073999846e-05, 'epoch': 0.36}


                                                        
 12%|█▏        | 1630/13473 [4:45:15<6:12:59,  1.89s/it]

{'eval_loss': 0.7145264744758606, 'eval_runtime': 88.0188, 'eval_samples_per_second': 11.304, 'eval_steps_per_second': 5.658, 'epoch': 0.36}


 12%|█▏        | 1640/13473 [4:45:23<6:06:45,  1.86s/it] 

{'loss': 1.0806, 'grad_norm': 1.2050257921218872, 'learning_rate': 4.560625915362677e-05, 'epoch': 0.37}


                                                        
 12%|█▏        | 1640/13473 [4:46:48<6:06:45,  1.86s/it]

{'eval_loss': 0.708932101726532, 'eval_runtime': 84.6036, 'eval_samples_per_second': 11.761, 'eval_steps_per_second': 5.886, 'epoch': 0.37}


 12%|█▏        | 1650/13473 [4:46:56<5:54:13,  1.80s/it] 

{'loss': 0.4104, 'grad_norm': 0.47131457924842834, 'learning_rate': 4.556771756725507e-05, 'epoch': 0.37}


                                                        
 12%|█▏        | 1650/13473 [4:48:26<5:54:13,  1.80s/it]

{'eval_loss': 0.7420781850814819, 'eval_runtime': 89.962, 'eval_samples_per_second': 11.06, 'eval_steps_per_second': 5.536, 'epoch': 0.37}


 12%|█▏        | 1660/13473 [4:48:34<6:08:14,  1.87s/it] 

{'loss': 0.8342, 'grad_norm': 4.270543575286865, 'learning_rate': 4.552917598088337e-05, 'epoch': 0.37}


                                                        
 12%|█▏        | 1660/13473 [4:50:40<6:08:14,  1.87s/it]

{'eval_loss': 0.7474173307418823, 'eval_runtime': 125.8536, 'eval_samples_per_second': 7.906, 'eval_steps_per_second': 3.957, 'epoch': 0.37}


 12%|█▏        | 1670/13473 [4:50:54<9:12:32,  2.81s/it]  

{'loss': 0.7186, 'grad_norm': 0.8583488464355469, 'learning_rate': 4.5490634394511674e-05, 'epoch': 0.37}


                                                        
 12%|█▏        | 1670/13473 [4:53:04<9:12:32,  2.81s/it]

{'eval_loss': 0.70782071352005, 'eval_runtime': 130.3224, 'eval_samples_per_second': 7.635, 'eval_steps_per_second': 3.821, 'epoch': 0.37}


 12%|█▏        | 1680/13473 [4:53:13<7:54:01,  2.41s/it]  

{'loss': 0.392, 'grad_norm': 0.378540575504303, 'learning_rate': 4.545209280813998e-05, 'epoch': 0.37}


                                                        
 12%|█▏        | 1680/13473 [4:54:41<7:54:01,  2.41s/it]

{'eval_loss': 0.7257293462753296, 'eval_runtime': 88.1988, 'eval_samples_per_second': 11.281, 'eval_steps_per_second': 5.646, 'epoch': 0.37}


 13%|█▎        | 1690/13473 [4:54:50<6:11:08,  1.89s/it] 

{'loss': 0.6203, 'grad_norm': 1.3422743082046509, 'learning_rate': 4.541355122176829e-05, 'epoch': 0.38}


                                                        
 13%|█▎        | 1690/13473 [4:56:22<6:11:08,  1.89s/it]

{'eval_loss': 0.7305331826210022, 'eval_runtime': 92.6622, 'eval_samples_per_second': 10.738, 'eval_steps_per_second': 5.374, 'epoch': 0.38}


 13%|█▎        | 1700/13473 [4:56:30<6:11:30,  1.89s/it] 

{'loss': 0.8939, 'grad_norm': 0.6991531848907471, 'learning_rate': 4.537500963539659e-05, 'epoch': 0.38}


                                                        
 13%|█▎        | 1700/13473 [4:58:01<6:11:30,  1.89s/it]

{'eval_loss': 0.709256649017334, 'eval_runtime': 90.2985, 'eval_samples_per_second': 11.019, 'eval_steps_per_second': 5.515, 'epoch': 0.38}


 13%|█▎        | 1710/13473 [4:58:10<6:30:36,  1.99s/it] 

{'loss': 0.8037, 'grad_norm': 1.0165919065475464, 'learning_rate': 4.53364680490249e-05, 'epoch': 0.38}


                                                        
 13%|█▎        | 1710/13473 [4:59:32<6:30:36,  1.99s/it]

{'eval_loss': 0.7085448503494263, 'eval_runtime': 81.7342, 'eval_samples_per_second': 12.174, 'eval_steps_per_second': 6.093, 'epoch': 0.38}


 13%|█▎        | 1720/13473 [4:59:40<5:58:06,  1.83s/it] 

{'loss': 0.8292, 'grad_norm': 0.3827914595603943, 'learning_rate': 4.529792646265321e-05, 'epoch': 0.38}


                                                        
 13%|█▎        | 1720/13473 [5:01:04<5:58:06,  1.83s/it]

{'eval_loss': 0.7073668837547302, 'eval_runtime': 83.595, 'eval_samples_per_second': 11.903, 'eval_steps_per_second': 5.957, 'epoch': 0.38}


 13%|█▎        | 1730/13473 [5:01:13<6:05:06,  1.87s/it] 

{'loss': 0.8973, 'grad_norm': 0.8006923198699951, 'learning_rate': 4.525938487628151e-05, 'epoch': 0.39}


                                                        
 13%|█▎        | 1730/13473 [5:02:45<6:05:06,  1.87s/it]

{'eval_loss': 0.7055185437202454, 'eval_runtime': 92.6872, 'eval_samples_per_second': 10.735, 'eval_steps_per_second': 5.373, 'epoch': 0.39}


 13%|█▎        | 1740/13473 [5:02:54<6:18:06,  1.93s/it] 

{'loss': 0.6412, 'grad_norm': 4.721027374267578, 'learning_rate': 4.5220843289909816e-05, 'epoch': 0.39}


                                                        
 13%|█▎        | 1740/13473 [5:04:19<6:18:06,  1.93s/it]

{'eval_loss': 0.7631812691688538, 'eval_runtime': 84.577, 'eval_samples_per_second': 11.764, 'eval_steps_per_second': 5.888, 'epoch': 0.39}


 13%|█▎        | 1750/13473 [5:04:27<5:57:57,  1.83s/it] 

{'loss': 0.7318, 'grad_norm': 1.577833652496338, 'learning_rate': 4.5182301703538124e-05, 'epoch': 0.39}


                                                        
 13%|█▎        | 1750/13473 [5:05:58<5:57:57,  1.83s/it]

{'eval_loss': 0.7146906852722168, 'eval_runtime': 90.7564, 'eval_samples_per_second': 10.963, 'eval_steps_per_second': 5.487, 'epoch': 0.39}


 13%|█▎        | 1760/13473 [5:06:07<6:21:11,  1.95s/it] 

{'loss': 0.4673, 'grad_norm': 0.6391088366508484, 'learning_rate': 4.5143760117166426e-05, 'epoch': 0.39}


                                                        
 13%|█▎        | 1760/13473 [5:07:36<6:21:11,  1.95s/it]

{'eval_loss': 0.7198969125747681, 'eval_runtime': 89.2683, 'eval_samples_per_second': 11.146, 'eval_steps_per_second': 5.579, 'epoch': 0.39}


 13%|█▎        | 1770/13473 [5:07:44<5:56:06,  1.83s/it] 

{'loss': 0.7138, 'grad_norm': 0.7605961561203003, 'learning_rate': 4.510521853079473e-05, 'epoch': 0.39}


                                                        
 13%|█▎        | 1770/13473 [5:09:20<5:56:06,  1.83s/it]

{'eval_loss': 0.7237828373908997, 'eval_runtime': 96.0802, 'eval_samples_per_second': 10.356, 'eval_steps_per_second': 5.183, 'epoch': 0.39}


 13%|█▎        | 1780/13473 [5:09:30<6:27:19,  1.99s/it] 

{'loss': 0.8218, 'grad_norm': 0.4036530554294586, 'learning_rate': 4.5066676944423035e-05, 'epoch': 0.4}


                                                        
 13%|█▎        | 1780/13473 [5:11:08<6:27:19,  1.99s/it]

{'eval_loss': 0.711174726486206, 'eval_runtime': 98.1446, 'eval_samples_per_second': 10.138, 'eval_steps_per_second': 5.074, 'epoch': 0.4}


 13%|█▎        | 1790/13473 [5:11:17<6:29:17,  2.00s/it]  

{'loss': 0.8663, 'grad_norm': 0.5986822247505188, 'learning_rate': 4.5028135358051336e-05, 'epoch': 0.4}


                                                        
 13%|█▎        | 1790/13473 [5:12:52<6:29:17,  2.00s/it]

{'eval_loss': 0.712125301361084, 'eval_runtime': 95.6745, 'eval_samples_per_second': 10.4, 'eval_steps_per_second': 5.205, 'epoch': 0.4}


 13%|█▎        | 1800/13473 [5:13:00<6:11:06,  1.91s/it] 

{'loss': 0.8714, 'grad_norm': 7.578426361083984, 'learning_rate': 4.4989593771679644e-05, 'epoch': 0.4}


                                                        
 13%|█▎        | 1800/13473 [5:14:35<6:11:06,  1.91s/it]

{'eval_loss': 0.7323468923568726, 'eval_runtime': 94.9064, 'eval_samples_per_second': 10.484, 'eval_steps_per_second': 5.247, 'epoch': 0.4}


 13%|█▎        | 1810/13473 [5:14:44<6:22:13,  1.97s/it] 

{'loss': 1.2451, 'grad_norm': 0.4631699025630951, 'learning_rate': 4.495105218530795e-05, 'epoch': 0.4}


                                                        
 13%|█▎        | 1810/13473 [5:16:15<6:22:13,  1.97s/it]

{'eval_loss': 0.7073465585708618, 'eval_runtime': 90.9701, 'eval_samples_per_second': 10.938, 'eval_steps_per_second': 5.474, 'epoch': 0.4}


 14%|█▎        | 1820/13473 [5:16:23<6:12:55,  1.92s/it] 

{'loss': 0.6412, 'grad_norm': 0.9126927852630615, 'learning_rate': 4.491251059893625e-05, 'epoch': 0.41}


                                                        
 14%|█▎        | 1820/13473 [5:17:54<6:12:55,  1.92s/it]

{'eval_loss': 0.7081512808799744, 'eval_runtime': 91.2186, 'eval_samples_per_second': 10.908, 'eval_steps_per_second': 5.459, 'epoch': 0.41}


 14%|█▎        | 1830/13473 [5:18:03<6:03:04,  1.87s/it] 

{'loss': 0.3753, 'grad_norm': 0.30387040972709656, 'learning_rate': 4.487396901256456e-05, 'epoch': 0.41}


                                                        
 14%|█▎        | 1830/13473 [5:19:36<6:03:04,  1.87s/it]

{'eval_loss': 0.753166913986206, 'eval_runtime': 93.1418, 'eval_samples_per_second': 10.683, 'eval_steps_per_second': 5.347, 'epoch': 0.41}


 14%|█▎        | 1840/13473 [5:19:45<6:13:10,  1.92s/it] 

{'loss': 1.0714, 'grad_norm': 0.4910085201263428, 'learning_rate': 4.483542742619287e-05, 'epoch': 0.41}


                                                        
 14%|█▎        | 1840/13473 [5:21:21<6:13:10,  1.92s/it]

{'eval_loss': 0.7166451811790466, 'eval_runtime': 96.5897, 'eval_samples_per_second': 10.301, 'eval_steps_per_second': 5.156, 'epoch': 0.41}


 14%|█▎        | 1850/13473 [5:21:30<6:24:03,  1.98s/it] 

{'loss': 0.5453, 'grad_norm': 0.835182249546051, 'learning_rate': 4.479688583982117e-05, 'epoch': 0.41}


                                                        
 14%|█▎        | 1850/13473 [5:23:01<6:24:03,  1.98s/it]

{'eval_loss': 0.7090451717376709, 'eval_runtime': 91.4196, 'eval_samples_per_second': 10.884, 'eval_steps_per_second': 5.447, 'epoch': 0.41}


 14%|█▍        | 1860/13473 [5:23:10<6:13:10,  1.93s/it] 

{'loss': 0.4895, 'grad_norm': 0.3316307067871094, 'learning_rate': 4.475834425344947e-05, 'epoch': 0.41}


                                                        
 14%|█▍        | 1860/13473 [5:24:40<6:13:10,  1.93s/it]

{'eval_loss': 0.7285173535346985, 'eval_runtime': 90.024, 'eval_samples_per_second': 11.053, 'eval_steps_per_second': 5.532, 'epoch': 0.41}


 14%|█▍        | 1870/13473 [5:24:49<6:17:20,  1.95s/it] 

{'loss': 0.7047, 'grad_norm': 1.4791988134384155, 'learning_rate': 4.471980266707778e-05, 'epoch': 0.42}


                                                        
 14%|█▍        | 1870/13473 [5:26:23<6:17:20,  1.95s/it]

{'eval_loss': 0.7151032090187073, 'eval_runtime': 93.361, 'eval_samples_per_second': 10.658, 'eval_steps_per_second': 5.334, 'epoch': 0.42}


 14%|█▍        | 1880/13473 [5:26:31<6:14:02,  1.94s/it] 

{'loss': 0.6071, 'grad_norm': 0.9045484662055969, 'learning_rate': 4.468126108070608e-05, 'epoch': 0.42}


                                                        
 14%|█▍        | 1880/13473 [5:28:06<6:14:02,  1.94s/it]

{'eval_loss': 0.7152920961380005, 'eval_runtime': 95.1099, 'eval_samples_per_second': 10.462, 'eval_steps_per_second': 5.236, 'epoch': 0.42}


 14%|█▍        | 1890/13473 [5:28:15<6:20:38,  1.97s/it] 

{'loss': 1.3635, 'grad_norm': 0.9680332541465759, 'learning_rate': 4.464271949433439e-05, 'epoch': 0.42}


                                                        
 14%|█▍        | 1890/13473 [5:29:50<6:20:38,  1.97s/it]

{'eval_loss': 0.7070485353469849, 'eval_runtime': 95.4682, 'eval_samples_per_second': 10.422, 'eval_steps_per_second': 5.216, 'epoch': 0.42}


 14%|█▍        | 1900/13473 [5:29:59<6:15:37,  1.95s/it] 

{'loss': 0.689, 'grad_norm': 0.5511382818222046, 'learning_rate': 4.4604177907962696e-05, 'epoch': 0.42}


                                                        
 14%|█▍        | 1900/13473 [5:31:32<6:15:37,  1.95s/it]

{'eval_loss': 0.7135331034660339, 'eval_runtime': 93.7825, 'eval_samples_per_second': 10.61, 'eval_steps_per_second': 5.31, 'epoch': 0.42}


 14%|█▍        | 1910/13473 [5:31:41<6:11:08,  1.93s/it] 

{'loss': 0.6801, 'grad_norm': 0.9061484932899475, 'learning_rate': 4.4565636321591e-05, 'epoch': 0.43}


                                                        
 14%|█▍        | 1910/13473 [5:33:15<6:11:08,  1.93s/it]

{'eval_loss': 0.7068142294883728, 'eval_runtime': 93.8931, 'eval_samples_per_second': 10.597, 'eval_steps_per_second': 5.304, 'epoch': 0.43}


 14%|█▍        | 1920/13473 [5:33:24<6:08:02,  1.91s/it] 

{'loss': 0.61, 'grad_norm': 0.8590366840362549, 'learning_rate': 4.4527094735219305e-05, 'epoch': 0.43}


                                                        
 14%|█▍        | 1920/13473 [5:34:54<6:08:02,  1.91s/it]

{'eval_loss': 0.710296094417572, 'eval_runtime': 90.7715, 'eval_samples_per_second': 10.962, 'eval_steps_per_second': 5.486, 'epoch': 0.43}


 14%|█▍        | 1930/13473 [5:35:03<6:06:27,  1.90s/it] 

{'loss': 0.8132, 'grad_norm': 0.502025306224823, 'learning_rate': 4.448855314884761e-05, 'epoch': 0.43}


                                                        
 14%|█▍        | 1930/13473 [5:36:31<6:06:27,  1.90s/it]

{'eval_loss': 0.7122973799705505, 'eval_runtime': 87.5923, 'eval_samples_per_second': 11.359, 'eval_steps_per_second': 5.685, 'epoch': 0.43}


 14%|█▍        | 1940/13473 [5:36:39<5:56:49,  1.86s/it] 

{'loss': 0.5575, 'grad_norm': 0.6873029470443726, 'learning_rate': 4.4450011562475915e-05, 'epoch': 0.43}


                                                        
 14%|█▍        | 1940/13473 [5:38:09<5:56:49,  1.86s/it]

{'eval_loss': 0.7202385067939758, 'eval_runtime': 89.9274, 'eval_samples_per_second': 11.064, 'eval_steps_per_second': 5.538, 'epoch': 0.43}


 14%|█▍        | 1950/13473 [5:38:18<6:04:03,  1.90s/it] 

{'loss': 0.8967, 'grad_norm': 2.005964994430542, 'learning_rate': 4.441146997610422e-05, 'epoch': 0.43}


                                                        
 14%|█▍        | 1950/13473 [5:39:54<6:04:03,  1.90s/it]

{'eval_loss': 0.7201951146125793, 'eval_runtime': 96.2872, 'eval_samples_per_second': 10.334, 'eval_steps_per_second': 5.172, 'epoch': 0.43}


 15%|█▍        | 1960/13473 [5:40:03<6:15:38,  1.96s/it] 

{'loss': 0.8191, 'grad_norm': 1.1009063720703125, 'learning_rate': 4.4372928389732524e-05, 'epoch': 0.44}


                                                        
 15%|█▍        | 1960/13473 [5:41:35<6:15:38,  1.96s/it]

{'eval_loss': 0.7128644585609436, 'eval_runtime': 92.7072, 'eval_samples_per_second': 10.733, 'eval_steps_per_second': 5.372, 'epoch': 0.44}


 15%|█▍        | 1970/13473 [5:41:44<6:12:45,  1.94s/it] 

{'loss': 0.5397, 'grad_norm': 0.7201723456382751, 'learning_rate': 4.4334386803360825e-05, 'epoch': 0.44}


                                                        
 15%|█▍        | 1970/13473 [5:43:19<6:12:45,  1.94s/it]

{'eval_loss': 0.720757782459259, 'eval_runtime': 94.5999, 'eval_samples_per_second': 10.518, 'eval_steps_per_second': 5.264, 'epoch': 0.44}


 15%|█▍        | 1980/13473 [5:43:27<6:19:34,  1.98s/it] 

{'loss': 0.7673, 'grad_norm': 0.9559247493743896, 'learning_rate': 4.429584521698913e-05, 'epoch': 0.44}


                                                        
 15%|█▍        | 1980/13473 [5:45:07<6:19:34,  1.98s/it]

{'eval_loss': 0.7134225368499756, 'eval_runtime': 99.2741, 'eval_samples_per_second': 10.023, 'eval_steps_per_second': 5.016, 'epoch': 0.44}


 15%|█▍        | 1990/13473 [5:45:17<6:36:42,  2.07s/it]  

{'loss': 1.0593, 'grad_norm': 1.2111252546310425, 'learning_rate': 4.4257303630617434e-05, 'epoch': 0.44}


                                                        
 15%|█▍        | 1990/13473 [5:46:43<6:36:42,  2.07s/it]

{'eval_loss': 0.7526330947875977, 'eval_runtime': 86.3756, 'eval_samples_per_second': 11.519, 'eval_steps_per_second': 5.766, 'epoch': 0.44}


 15%|█▍        | 2000/13473 [5:46:51<5:49:05,  1.83s/it] 

{'loss': 0.6858, 'grad_norm': 0.4536432921886444, 'learning_rate': 4.421876204424574e-05, 'epoch': 0.45}


                                                        
 15%|█▍        | 2000/13473 [5:48:22<5:49:05,  1.83s/it]

{'eval_loss': 0.7095974683761597, 'eval_runtime': 90.861, 'eval_samples_per_second': 10.951, 'eval_steps_per_second': 5.481, 'epoch': 0.45}


 15%|█▍        | 2010/13473 [5:48:33<6:30:45,  2.05s/it] 

{'loss': 0.8302, 'grad_norm': 0.525465726852417, 'learning_rate': 4.418022045787405e-05, 'epoch': 0.45}


                                                        
 15%|█▍        | 2010/13473 [5:50:07<6:30:45,  2.05s/it]

{'eval_loss': 0.7318348288536072, 'eval_runtime': 93.7014, 'eval_samples_per_second': 10.619, 'eval_steps_per_second': 5.315, 'epoch': 0.45}


 15%|█▍        | 2020/13473 [5:50:15<6:12:53,  1.95s/it] 

{'loss': 0.7448, 'grad_norm': 0.3993614614009857, 'learning_rate': 4.414167887150235e-05, 'epoch': 0.45}


                                                        
 15%|█▍        | 2020/13473 [5:51:40<6:12:53,  1.95s/it]

{'eval_loss': 0.7100774645805359, 'eval_runtime': 84.7423, 'eval_samples_per_second': 11.741, 'eval_steps_per_second': 5.877, 'epoch': 0.45}


 15%|█▌        | 2030/13473 [5:51:49<5:48:54,  1.83s/it] 

{'loss': 1.3788, 'grad_norm': 1.3902130126953125, 'learning_rate': 4.410313728513066e-05, 'epoch': 0.45}


                                                        
 15%|█▌        | 2030/13473 [5:53:18<5:48:54,  1.83s/it]

{'eval_loss': 0.7085111737251282, 'eval_runtime': 89.0787, 'eval_samples_per_second': 11.17, 'eval_steps_per_second': 5.591, 'epoch': 0.45}


 15%|█▌        | 2040/13473 [5:53:26<6:01:58,  1.90s/it] 

{'loss': 0.6324, 'grad_norm': 1.3850826025009155, 'learning_rate': 4.406459569875897e-05, 'epoch': 0.45}


                                                        
 15%|█▌        | 2040/13473 [5:54:53<6:01:58,  1.90s/it]

{'eval_loss': 0.7115429639816284, 'eval_runtime': 87.2501, 'eval_samples_per_second': 11.404, 'eval_steps_per_second': 5.708, 'epoch': 0.45}


 15%|█▌        | 2050/13473 [5:55:02<5:55:03,  1.86s/it] 

{'loss': 0.7761, 'grad_norm': 2.119194984436035, 'learning_rate': 4.402605411238727e-05, 'epoch': 0.46}


                                                        
 15%|█▌        | 2050/13473 [5:56:29<5:55:03,  1.86s/it]

{'eval_loss': 0.7130801677703857, 'eval_runtime': 87.3428, 'eval_samples_per_second': 11.392, 'eval_steps_per_second': 5.702, 'epoch': 0.46}


 15%|█▌        | 2060/13473 [5:56:38<5:56:19,  1.87s/it] 

{'loss': 0.7348, 'grad_norm': 2.0510318279266357, 'learning_rate': 4.3987512526015576e-05, 'epoch': 0.46}


                                                        
 15%|█▌        | 2060/13473 [5:58:06<5:56:19,  1.87s/it]

{'eval_loss': 0.7089831233024597, 'eval_runtime': 88.2106, 'eval_samples_per_second': 11.28, 'eval_steps_per_second': 5.646, 'epoch': 0.46}


 15%|█▌        | 2070/13473 [5:58:14<5:51:17,  1.85s/it] 

{'loss': 0.8072, 'grad_norm': 1.0312271118164062, 'learning_rate': 4.394897093964388e-05, 'epoch': 0.46}


                                                        
 15%|█▌        | 2070/13473 [5:59:42<5:51:17,  1.85s/it]

{'eval_loss': 0.7256609201431274, 'eval_runtime': 88.0852, 'eval_samples_per_second': 11.296, 'eval_steps_per_second': 5.654, 'epoch': 0.46}


 15%|█▌        | 2080/13473 [5:59:51<5:51:34,  1.85s/it] 

{'loss': 0.901, 'grad_norm': 5.194822311401367, 'learning_rate': 4.391042935327218e-05, 'epoch': 0.46}


                                                        
 15%|█▌        | 2080/13473 [6:01:17<5:51:34,  1.85s/it]

{'eval_loss': 0.7163109183311462, 'eval_runtime': 86.7267, 'eval_samples_per_second': 11.473, 'eval_steps_per_second': 5.742, 'epoch': 0.46}


 16%|█▌        | 2090/13473 [6:01:26<5:53:14,  1.86s/it] 

{'loss': 1.6945, 'grad_norm': 3.5952460765838623, 'learning_rate': 4.3871887766900487e-05, 'epoch': 0.47}


                                                        
 16%|█▌        | 2090/13473 [6:02:52<5:53:14,  1.86s/it]

{'eval_loss': 0.706464946269989, 'eval_runtime': 85.6456, 'eval_samples_per_second': 11.618, 'eval_steps_per_second': 5.815, 'epoch': 0.47}


 16%|█▌        | 2100/13473 [6:03:00<5:47:35,  1.83s/it] 

{'loss': 0.3928, 'grad_norm': 0.8949377536773682, 'learning_rate': 4.3833346180528795e-05, 'epoch': 0.47}


                                                        
 16%|█▌        | 2100/13473 [6:04:21<5:47:35,  1.83s/it]

{'eval_loss': 0.707107424736023, 'eval_runtime': 80.8922, 'eval_samples_per_second': 12.3, 'eval_steps_per_second': 6.156, 'epoch': 0.47}


 16%|█▌        | 2110/13473 [6:04:29<5:44:45,  1.82s/it] 

{'loss': 0.749, 'grad_norm': 0.7975779175758362, 'learning_rate': 4.3794804594157096e-05, 'epoch': 0.47}


                                                        
 16%|█▌        | 2110/13473 [6:05:57<5:44:45,  1.82s/it]

{'eval_loss': 0.7209849953651428, 'eval_runtime': 87.7765, 'eval_samples_per_second': 11.336, 'eval_steps_per_second': 5.673, 'epoch': 0.47}


 16%|█▌        | 2120/13473 [6:06:06<5:59:04,  1.90s/it] 

{'loss': 1.0541, 'grad_norm': 0.7388021945953369, 'learning_rate': 4.3756263007785404e-05, 'epoch': 0.47}


                                                        
 16%|█▌        | 2120/13473 [6:07:32<5:59:04,  1.90s/it]

{'eval_loss': 0.7050874829292297, 'eval_runtime': 85.877, 'eval_samples_per_second': 11.586, 'eval_steps_per_second': 5.799, 'epoch': 0.47}


 16%|█▌        | 2130/13473 [6:07:40<5:47:54,  1.84s/it] 

{'loss': 0.7722, 'grad_norm': 0.47283366322517395, 'learning_rate': 4.371772142141371e-05, 'epoch': 0.47}


                                                        
 16%|█▌        | 2130/13473 [6:09:05<5:47:54,  1.84s/it]

{'eval_loss': 0.7094653844833374, 'eval_runtime': 84.3334, 'eval_samples_per_second': 11.798, 'eval_steps_per_second': 5.905, 'epoch': 0.47}


 16%|█▌        | 2140/13473 [6:09:13<5:43:04,  1.82s/it] 

{'loss': 0.5262, 'grad_norm': 0.4131946265697479, 'learning_rate': 4.367917983504201e-05, 'epoch': 0.48}


                                                        
 16%|█▌        | 2140/13473 [6:10:40<5:43:04,  1.82s/it]

{'eval_loss': 0.7152077555656433, 'eval_runtime': 86.6064, 'eval_samples_per_second': 11.489, 'eval_steps_per_second': 5.75, 'epoch': 0.48}


 16%|█▌        | 2150/13473 [6:10:48<5:50:25,  1.86s/it] 

{'loss': 0.8029, 'grad_norm': 0.9747267961502075, 'learning_rate': 4.364063824867032e-05, 'epoch': 0.48}


                                                        
 16%|█▌        | 2150/13473 [6:12:29<5:50:25,  1.86s/it]

{'eval_loss': 0.706454336643219, 'eval_runtime': 100.3312, 'eval_samples_per_second': 9.917, 'eval_steps_per_second': 4.964, 'epoch': 0.48}


 16%|█▌        | 2160/13473 [6:12:40<6:41:49,  2.13s/it]  

{'loss': 0.5355, 'grad_norm': 0.9251266121864319, 'learning_rate': 4.360209666229862e-05, 'epoch': 0.48}


                                                        
 16%|█▌        | 2160/13473 [6:14:03<6:41:49,  2.13s/it]

{'eval_loss': 0.7122470140457153, 'eval_runtime': 83.1477, 'eval_samples_per_second': 11.967, 'eval_steps_per_second': 5.989, 'epoch': 0.48}


 16%|█▌        | 2170/13473 [6:14:13<6:00:13,  1.91s/it] 

{'loss': 0.7427, 'grad_norm': 0.32756486535072327, 'learning_rate': 4.356355507592692e-05, 'epoch': 0.48}


                                                        
 16%|█▌        | 2170/13473 [6:15:36<6:00:13,  1.91s/it]

{'eval_loss': 0.7245118618011475, 'eval_runtime': 83.0271, 'eval_samples_per_second': 11.984, 'eval_steps_per_second': 5.998, 'epoch': 0.48}


 16%|█▌        | 2180/13473 [6:15:44<5:35:58,  1.79s/it] 

{'loss': 0.4328, 'grad_norm': 0.29465657472610474, 'learning_rate': 4.352501348955523e-05, 'epoch': 0.49}


                                                        
 16%|█▌        | 2180/13473 [6:17:10<5:35:58,  1.79s/it]

{'eval_loss': 0.7371435165405273, 'eval_runtime': 86.3387, 'eval_samples_per_second': 11.524, 'eval_steps_per_second': 5.768, 'epoch': 0.49}


 16%|█▋        | 2190/13473 [6:17:19<5:43:48,  1.83s/it] 

{'loss': 0.7478, 'grad_norm': 4.329399108886719, 'learning_rate': 4.348647190318353e-05, 'epoch': 0.49}


                                                        
 16%|█▋        | 2190/13473 [6:18:46<5:43:48,  1.83s/it]

{'eval_loss': 0.7369129657745361, 'eval_runtime': 87.4566, 'eval_samples_per_second': 11.377, 'eval_steps_per_second': 5.694, 'epoch': 0.49}


 16%|█▋        | 2200/13473 [6:18:54<5:47:20,  1.85s/it] 

{'loss': 0.5485, 'grad_norm': 0.3527403175830841, 'learning_rate': 4.344793031681184e-05, 'epoch': 0.49}


                                                        
 16%|█▋        | 2200/13473 [6:20:21<5:47:20,  1.85s/it]

{'eval_loss': 0.7088443040847778, 'eval_runtime': 86.7604, 'eval_samples_per_second': 11.468, 'eval_steps_per_second': 5.74, 'epoch': 0.49}


 16%|█▋        | 2210/13473 [6:20:30<6:02:45,  1.93s/it] 

{'loss': 0.7687, 'grad_norm': 2.7012174129486084, 'learning_rate': 4.340938873044015e-05, 'epoch': 0.49}


                                                        
 16%|█▋        | 2210/13473 [6:22:01<6:02:45,  1.93s/it]

{'eval_loss': 0.7051684260368347, 'eval_runtime': 91.0115, 'eval_samples_per_second': 10.933, 'eval_steps_per_second': 5.472, 'epoch': 0.49}


 16%|█▋        | 2220/13473 [6:22:09<5:52:02,  1.88s/it] 

{'loss': 0.8002, 'grad_norm': 0.88569176197052, 'learning_rate': 4.337084714406845e-05, 'epoch': 0.49}


                                                        
 16%|█▋        | 2220/13473 [6:23:38<5:52:02,  1.88s/it]

{'eval_loss': 0.7028812766075134, 'eval_runtime': 88.4037, 'eval_samples_per_second': 11.255, 'eval_steps_per_second': 5.633, 'epoch': 0.49}


 17%|█▋        | 2230/13473 [6:23:46<5:48:30,  1.86s/it] 

{'loss': 0.5887, 'grad_norm': 0.6947250962257385, 'learning_rate': 4.333230555769676e-05, 'epoch': 0.5}


                                                        
 17%|█▋        | 2230/13473 [6:25:15<5:48:30,  1.86s/it]

{'eval_loss': 0.710533618927002, 'eval_runtime': 89.1465, 'eval_samples_per_second': 11.161, 'eval_steps_per_second': 5.586, 'epoch': 0.5}


 17%|█▋        | 2240/13473 [6:25:24<5:52:28,  1.88s/it] 

{'loss': 0.7827, 'grad_norm': 1.7201768159866333, 'learning_rate': 4.3293763971325065e-05, 'epoch': 0.5}


                                                        
 17%|█▋        | 2240/13473 [6:26:50<5:52:28,  1.88s/it]

{'eval_loss': 0.7056146264076233, 'eval_runtime': 86.7649, 'eval_samples_per_second': 11.468, 'eval_steps_per_second': 5.74, 'epoch': 0.5}


 17%|█▋        | 2250/13473 [6:26:59<5:45:29,  1.85s/it] 

{'loss': 0.5435, 'grad_norm': 0.43626004457473755, 'learning_rate': 4.3255222384953366e-05, 'epoch': 0.5}


                                                        
 17%|█▋        | 2250/13473 [6:28:27<5:45:29,  1.85s/it]

{'eval_loss': 0.7082577347755432, 'eval_runtime': 87.7046, 'eval_samples_per_second': 11.345, 'eval_steps_per_second': 5.678, 'epoch': 0.5}


 17%|█▋        | 2260/13473 [6:28:37<5:55:00,  1.90s/it] 

{'loss': 0.4991, 'grad_norm': 0.44012683629989624, 'learning_rate': 4.3216680798581674e-05, 'epoch': 0.5}


                                                        
 17%|█▋        | 2260/13473 [6:30:03<5:55:00,  1.90s/it]

{'eval_loss': 0.7278092503547668, 'eval_runtime': 86.6149, 'eval_samples_per_second': 11.488, 'eval_steps_per_second': 5.75, 'epoch': 0.5}


 17%|█▋        | 2270/13473 [6:30:12<5:48:11,  1.86s/it] 

{'loss': 0.7664, 'grad_norm': 0.37995803356170654, 'learning_rate': 4.3178139212209976e-05, 'epoch': 0.51}


                                                        
 17%|█▋        | 2270/13473 [6:31:38<5:48:11,  1.86s/it]

{'eval_loss': 0.7178211808204651, 'eval_runtime': 85.158, 'eval_samples_per_second': 11.684, 'eval_steps_per_second': 5.848, 'epoch': 0.51}


 17%|█▋        | 2280/13473 [6:31:46<5:40:52,  1.83s/it] 

{'loss': 0.9078, 'grad_norm': 1.2985093593597412, 'learning_rate': 4.313959762583828e-05, 'epoch': 0.51}


                                                        
 17%|█▋        | 2280/13473 [6:33:11<5:40:52,  1.83s/it]

{'eval_loss': 0.7119063138961792, 'eval_runtime': 84.9594, 'eval_samples_per_second': 11.711, 'eval_steps_per_second': 5.862, 'epoch': 0.51}


 17%|█▋        | 2290/13473 [6:33:19<5:40:30,  1.83s/it] 

{'loss': 0.5846, 'grad_norm': 0.513528048992157, 'learning_rate': 4.3101056039466585e-05, 'epoch': 0.51}


                                                        
 17%|█▋        | 2290/13473 [6:34:46<5:40:30,  1.83s/it]

{'eval_loss': 0.7095739841461182, 'eval_runtime': 87.4647, 'eval_samples_per_second': 11.376, 'eval_steps_per_second': 5.694, 'epoch': 0.51}


 17%|█▋        | 2300/13473 [6:34:55<5:48:15,  1.87s/it] 

{'loss': 0.5577, 'grad_norm': 0.8349189162254333, 'learning_rate': 4.306251445309489e-05, 'epoch': 0.51}


                                                        
 17%|█▋        | 2300/13473 [6:36:22<5:48:15,  1.87s/it]

{'eval_loss': 0.7230244278907776, 'eval_runtime': 86.5663, 'eval_samples_per_second': 11.494, 'eval_steps_per_second': 5.753, 'epoch': 0.51}


 17%|█▋        | 2310/13473 [6:36:32<5:59:44,  1.93s/it] 

{'loss': 0.5943, 'grad_norm': 0.41708171367645264, 'learning_rate': 4.3023972866723194e-05, 'epoch': 0.51}


                                                        
 17%|█▋        | 2310/13473 [6:37:58<5:59:44,  1.93s/it]

{'eval_loss': 0.7248602509498596, 'eval_runtime': 85.9538, 'eval_samples_per_second': 11.576, 'eval_steps_per_second': 5.794, 'epoch': 0.51}


 17%|█▋        | 2320/13473 [6:38:06<5:45:43,  1.86s/it] 

{'loss': 0.7947, 'grad_norm': 1.2040188312530518, 'learning_rate': 4.29854312803515e-05, 'epoch': 0.52}


                                                        
 17%|█▋        | 2320/13473 [6:39:35<5:45:43,  1.86s/it]

{'eval_loss': 0.7108703851699829, 'eval_runtime': 88.4914, 'eval_samples_per_second': 11.244, 'eval_steps_per_second': 5.628, 'epoch': 0.52}


 17%|█▋        | 2330/13473 [6:39:43<5:41:52,  1.84s/it] 

{'loss': 0.8752, 'grad_norm': 0.379834920167923, 'learning_rate': 4.294688969397981e-05, 'epoch': 0.52}


                                                        
 17%|█▋        | 2330/13473 [6:41:07<5:41:52,  1.84s/it]

{'eval_loss': 0.7096251249313354, 'eval_runtime': 84.8202, 'eval_samples_per_second': 11.731, 'eval_steps_per_second': 5.871, 'epoch': 0.52}


 17%|█▋        | 2340/13473 [6:41:16<5:33:45,  1.80s/it] 

{'loss': 0.6209, 'grad_norm': 0.32229772210121155, 'learning_rate': 4.290834810760811e-05, 'epoch': 0.52}


                                                        
 17%|█▋        | 2340/13473 [6:42:45<5:33:45,  1.80s/it]

{'eval_loss': 0.7129856944084167, 'eval_runtime': 88.8341, 'eval_samples_per_second': 11.201, 'eval_steps_per_second': 5.606, 'epoch': 0.52}


 17%|█▋        | 2350/13473 [6:42:53<5:46:15,  1.87s/it] 

{'loss': 1.481, 'grad_norm': 1.3330609798431396, 'learning_rate': 4.286980652123642e-05, 'epoch': 0.52}


                                                        
 17%|█▋        | 2350/13473 [6:44:24<5:46:15,  1.87s/it]

{'eval_loss': 0.7013341188430786, 'eval_runtime': 91.6634, 'eval_samples_per_second': 10.855, 'eval_steps_per_second': 5.433, 'epoch': 0.52}


 18%|█▊        | 2360/13473 [6:44:32<5:53:19,  1.91s/it] 

{'loss': 0.8944, 'grad_norm': 1.4124701023101807, 'learning_rate': 4.283126493486472e-05, 'epoch': 0.53}


                                                        
 18%|█▊        | 2360/13473 [6:46:08<5:53:19,  1.91s/it]

{'eval_loss': 0.7053952813148499, 'eval_runtime': 95.4485, 'eval_samples_per_second': 10.424, 'eval_steps_per_second': 5.217, 'epoch': 0.53}


 18%|█▊        | 2370/13473 [6:46:16<6:05:06,  1.97s/it] 

{'loss': 0.9256, 'grad_norm': 0.7657518982887268, 'learning_rate': 4.279272334849302e-05, 'epoch': 0.53}


                                                        
 18%|█▊        | 2370/13473 [6:47:44<6:05:06,  1.97s/it]

{'eval_loss': 0.7044410109519958, 'eval_runtime': 87.455, 'eval_samples_per_second': 11.377, 'eval_steps_per_second': 5.694, 'epoch': 0.53}


 18%|█▊        | 2380/13473 [6:47:52<5:43:37,  1.86s/it] 

{'loss': 0.7646, 'grad_norm': 3.390348434448242, 'learning_rate': 4.275418176212133e-05, 'epoch': 0.53}


                                                        
 18%|█▊        | 2380/13473 [6:49:27<5:43:37,  1.86s/it]

{'eval_loss': 0.7065070271492004, 'eval_runtime': 94.6089, 'eval_samples_per_second': 10.517, 'eval_steps_per_second': 5.264, 'epoch': 0.53}


 18%|█▊        | 2390/13473 [6:49:35<6:02:05,  1.96s/it] 

{'loss': 1.116, 'grad_norm': 0.46728843450546265, 'learning_rate': 4.271564017574964e-05, 'epoch': 0.53}


                                                        
 18%|█▊        | 2390/13473 [6:51:13<6:02:05,  1.96s/it]

{'eval_loss': 0.7022847533226013, 'eval_runtime': 97.3931, 'eval_samples_per_second': 10.216, 'eval_steps_per_second': 5.113, 'epoch': 0.53}


 18%|█▊        | 2400/13473 [6:51:21<6:03:11,  1.97s/it] 

{'loss': 1.0189, 'grad_norm': 0.45763328671455383, 'learning_rate': 4.267709858937794e-05, 'epoch': 0.53}


                                                        
 18%|█▊        | 2400/13473 [6:53:00<6:03:11,  1.97s/it]

{'eval_loss': 0.7025490999221802, 'eval_runtime': 98.9344, 'eval_samples_per_second': 10.057, 'eval_steps_per_second': 5.034, 'epoch': 0.53}


 18%|█▊        | 2410/13473 [6:53:09<6:17:20,  2.05s/it] 

{'loss': 0.4715, 'grad_norm': 0.5978042483329773, 'learning_rate': 4.2638557003006246e-05, 'epoch': 0.54}


                                                        
 18%|█▊        | 2410/13473 [6:54:42<6:17:20,  2.05s/it]

{'eval_loss': 0.7114347219467163, 'eval_runtime': 93.2488, 'eval_samples_per_second': 10.67, 'eval_steps_per_second': 5.341, 'epoch': 0.54}


 18%|█▊        | 2420/13473 [6:54:50<5:56:19,  1.93s/it] 

{'loss': 0.7438, 'grad_norm': 0.5149557590484619, 'learning_rate': 4.2600015416634554e-05, 'epoch': 0.54}


                                                        
 18%|█▊        | 2420/13473 [6:56:20<5:56:19,  1.93s/it]

{'eval_loss': 0.7123897075653076, 'eval_runtime': 89.7191, 'eval_samples_per_second': 11.09, 'eval_steps_per_second': 5.551, 'epoch': 0.54}


 18%|█▊        | 2430/13473 [6:56:28<5:44:14,  1.87s/it] 

{'loss': 0.9386, 'grad_norm': 0.7856292128562927, 'learning_rate': 4.2561473830262855e-05, 'epoch': 0.54}


                                                        
 18%|█▊        | 2430/13473 [6:57:55<5:44:14,  1.87s/it]

{'eval_loss': 0.7264742851257324, 'eval_runtime': 87.3509, 'eval_samples_per_second': 11.391, 'eval_steps_per_second': 5.701, 'epoch': 0.54}


 18%|█▊        | 2440/13473 [6:58:04<5:52:46,  1.92s/it] 

{'loss': 0.64, 'grad_norm': 1.257993459701538, 'learning_rate': 4.2522932243891163e-05, 'epoch': 0.54}


                                                        
 18%|█▊        | 2440/13473 [6:59:32<5:52:46,  1.92s/it]

{'eval_loss': 0.7136597037315369, 'eval_runtime': 88.1423, 'eval_samples_per_second': 11.289, 'eval_steps_per_second': 5.65, 'epoch': 0.54}


 18%|█▊        | 2450/13473 [6:59:41<5:42:53,  1.87s/it] 

{'loss': 0.7783, 'grad_norm': 2.263071298599243, 'learning_rate': 4.2484390657519465e-05, 'epoch': 0.55}


                                                        
 18%|█▊        | 2450/13473 [7:01:13<5:42:53,  1.87s/it]

{'eval_loss': 0.7238047122955322, 'eval_runtime': 92.6009, 'eval_samples_per_second': 10.745, 'eval_steps_per_second': 5.378, 'epoch': 0.55}


 18%|█▊        | 2460/13473 [7:01:22<5:48:05,  1.90s/it] 

{'loss': 0.5698, 'grad_norm': 0.8883643746376038, 'learning_rate': 4.244584907114777e-05, 'epoch': 0.55}


                                                        
 18%|█▊        | 2460/13473 [7:02:52<5:48:05,  1.90s/it]

{'eval_loss': 0.7253932952880859, 'eval_runtime': 90.0107, 'eval_samples_per_second': 11.054, 'eval_steps_per_second': 5.533, 'epoch': 0.55}


 18%|█▊        | 2470/13473 [7:03:00<5:48:34,  1.90s/it] 

{'loss': 0.7319, 'grad_norm': 0.5488268136978149, 'learning_rate': 4.2407307484776074e-05, 'epoch': 0.55}


                                                        
 18%|█▊        | 2470/13473 [7:04:36<5:48:34,  1.90s/it]

{'eval_loss': 0.7169007658958435, 'eval_runtime': 95.5043, 'eval_samples_per_second': 10.418, 'eval_steps_per_second': 5.214, 'epoch': 0.55}


 18%|█▊        | 2480/13473 [7:04:45<6:02:00,  1.98s/it] 

{'loss': 0.6643, 'grad_norm': 0.5260761976242065, 'learning_rate': 4.2368765898404375e-05, 'epoch': 0.55}


                                                        
 18%|█▊        | 2480/13473 [7:06:18<6:02:00,  1.98s/it]

{'eval_loss': 0.716801643371582, 'eval_runtime': 93.1476, 'eval_samples_per_second': 10.682, 'eval_steps_per_second': 5.346, 'epoch': 0.55}


 18%|█▊        | 2490/13473 [7:06:26<5:58:48,  1.96s/it] 

{'loss': 0.731, 'grad_norm': 0.36738353967666626, 'learning_rate': 4.233022431203268e-05, 'epoch': 0.55}


                                                        
 18%|█▊        | 2490/13473 [7:08:00<5:58:48,  1.96s/it]

{'eval_loss': 0.7172192931175232, 'eval_runtime': 93.1667, 'eval_samples_per_second': 10.68, 'eval_steps_per_second': 5.345, 'epoch': 0.55}


 19%|█▊        | 2500/13473 [7:08:08<6:04:44,  1.99s/it] 

{'loss': 0.9385, 'grad_norm': 2.278660297393799, 'learning_rate': 4.229168272566099e-05, 'epoch': 0.56}


                                                        
 19%|█▊        | 2500/13473 [7:09:40<6:04:44,  1.99s/it]

{'eval_loss': 0.7143375277519226, 'eval_runtime': 91.1341, 'eval_samples_per_second': 10.918, 'eval_steps_per_second': 5.464, 'epoch': 0.56}


 19%|█▊        | 2510/13473 [7:09:50<5:53:15,  1.93s/it] 

{'loss': 0.9322, 'grad_norm': 2.9008090496063232, 'learning_rate': 4.225314113928929e-05, 'epoch': 0.56}


                                                        
 19%|█▊        | 2510/13473 [7:11:17<5:53:15,  1.93s/it]

{'eval_loss': 0.7025614976882935, 'eval_runtime': 87.7215, 'eval_samples_per_second': 11.343, 'eval_steps_per_second': 5.677, 'epoch': 0.56}


 19%|█▊        | 2520/13473 [7:11:26<5:52:17,  1.93s/it] 

{'loss': 0.5627, 'grad_norm': 1.0668087005615234, 'learning_rate': 4.22145995529176e-05, 'epoch': 0.56}


                                                        
 19%|█▊        | 2520/13473 [7:12:58<5:52:17,  1.93s/it]

{'eval_loss': 0.703767716884613, 'eval_runtime': 91.8117, 'eval_samples_per_second': 10.837, 'eval_steps_per_second': 5.424, 'epoch': 0.56}


 19%|█▉        | 2530/13473 [7:13:07<5:56:56,  1.96s/it] 

{'loss': 0.9373, 'grad_norm': 1.7086724042892456, 'learning_rate': 4.217605796654591e-05, 'epoch': 0.56}


                                                        
 19%|█▉        | 2530/13473 [7:14:23<5:56:56,  1.96s/it]

{'eval_loss': 0.7136369347572327, 'eval_runtime': 76.549, 'eval_samples_per_second': 12.998, 'eval_steps_per_second': 6.506, 'epoch': 0.56}


 19%|█▉        | 2540/13473 [7:14:32<5:12:39,  1.72s/it] 

{'loss': 0.6579, 'grad_norm': 1.1123682260513306, 'learning_rate': 4.213751638017421e-05, 'epoch': 0.57}


                                                        
 19%|█▉        | 2540/13473 [7:15:54<5:12:39,  1.72s/it]

{'eval_loss': 0.7109982371330261, 'eval_runtime': 81.9161, 'eval_samples_per_second': 12.147, 'eval_steps_per_second': 6.079, 'epoch': 0.57}


 19%|█▉        | 2550/13473 [7:16:03<5:25:13,  1.79s/it] 

{'loss': 0.3317, 'grad_norm': 0.5575303435325623, 'learning_rate': 4.209897479380252e-05, 'epoch': 0.57}


                                                        
 19%|█▉        | 2550/13473 [7:17:26<5:25:13,  1.79s/it]

{'eval_loss': 0.7272456288337708, 'eval_runtime': 83.4016, 'eval_samples_per_second': 11.93, 'eval_steps_per_second': 5.971, 'epoch': 0.57}


 19%|█▉        | 2560/13473 [7:17:35<5:33:06,  1.83s/it] 

{'loss': 0.7664, 'grad_norm': 2.4379775524139404, 'learning_rate': 4.206043320743082e-05, 'epoch': 0.57}


                                                        
 19%|█▉        | 2560/13473 [7:18:59<5:33:06,  1.83s/it]

{'eval_loss': 0.723773181438446, 'eval_runtime': 84.2372, 'eval_samples_per_second': 11.812, 'eval_steps_per_second': 5.912, 'epoch': 0.57}


 19%|█▉        | 2570/13473 [7:19:08<5:35:00,  1.84s/it] 

{'loss': 0.8374, 'grad_norm': 1.1198885440826416, 'learning_rate': 4.2021891621059126e-05, 'epoch': 0.57}


                                                        
 19%|█▉        | 2570/13473 [7:20:31<5:35:00,  1.84s/it]

{'eval_loss': 0.7091813683509827, 'eval_runtime': 83.7303, 'eval_samples_per_second': 11.883, 'eval_steps_per_second': 5.948, 'epoch': 0.57}


 19%|█▉        | 2580/13473 [7:20:40<5:32:18,  1.83s/it] 

{'loss': 0.9872, 'grad_norm': 0.49013394117355347, 'learning_rate': 4.198335003468743e-05, 'epoch': 0.57}


                                                        
 19%|█▉        | 2580/13473 [7:22:11<5:32:18,  1.83s/it]

{'eval_loss': 0.7030739784240723, 'eval_runtime': 91.399, 'eval_samples_per_second': 10.886, 'eval_steps_per_second': 5.449, 'epoch': 0.57}


 19%|█▉        | 2590/13473 [7:22:22<6:17:46,  2.08s/it] 

{'loss': 0.7596, 'grad_norm': 0.8372183442115784, 'learning_rate': 4.1944808448315735e-05, 'epoch': 0.58}


                                                        
 19%|█▉        | 2590/13473 [7:24:06<6:17:46,  2.08s/it]

{'eval_loss': 0.707950234413147, 'eval_runtime': 104.1556, 'eval_samples_per_second': 9.553, 'eval_steps_per_second': 4.781, 'epoch': 0.58}


 19%|█▉        | 2600/13473 [7:24:17<6:54:49,  2.29s/it]  

{'loss': 0.8133, 'grad_norm': 0.42152804136276245, 'learning_rate': 4.1906266861944037e-05, 'epoch': 0.58}


                                                        
 19%|█▉        | 2600/13473 [7:26:04<6:54:49,  2.29s/it]

{'eval_loss': 0.7049151062965393, 'eval_runtime': 106.8976, 'eval_samples_per_second': 9.308, 'eval_steps_per_second': 4.659, 'epoch': 0.58}


 19%|█▉        | 2610/13473 [7:26:15<7:04:32,  2.34s/it]  

{'loss': 0.6538, 'grad_norm': 0.9669948220252991, 'learning_rate': 4.1867725275572345e-05, 'epoch': 0.58}


                                                        
 19%|█▉        | 2610/13473 [7:27:38<7:04:32,  2.34s/it]

{'eval_loss': 0.7049977779388428, 'eval_runtime': 82.7191, 'eval_samples_per_second': 12.029, 'eval_steps_per_second': 6.02, 'epoch': 0.58}


 19%|█▉        | 2620/13473 [7:27:46<5:22:55,  1.79s/it] 

{'loss': 0.584, 'grad_norm': 0.8359925150871277, 'learning_rate': 4.182918368920065e-05, 'epoch': 0.58}


                                                        
 19%|█▉        | 2620/13473 [7:29:06<5:22:55,  1.79s/it]

{'eval_loss': 0.7071061134338379, 'eval_runtime': 80.1964, 'eval_samples_per_second': 12.407, 'eval_steps_per_second': 6.21, 'epoch': 0.58}


 20%|█▉        | 2630/13473 [7:29:15<5:19:23,  1.77s/it] 

{'loss': 0.6529, 'grad_norm': 0.530182957649231, 'learning_rate': 4.1790642102828954e-05, 'epoch': 0.59}


                                                        
 20%|█▉        | 2630/13473 [7:30:42<5:19:23,  1.77s/it]

{'eval_loss': 0.7116830348968506, 'eval_runtime': 87.0915, 'eval_samples_per_second': 11.425, 'eval_steps_per_second': 5.718, 'epoch': 0.59}


 20%|█▉        | 2640/13473 [7:30:51<5:34:57,  1.86s/it] 

{'loss': 0.6459, 'grad_norm': 0.6329637169837952, 'learning_rate': 4.175210051645726e-05, 'epoch': 0.59}


                                                        
 20%|█▉        | 2640/13473 [7:32:14<5:34:57,  1.86s/it]

{'eval_loss': 0.7133790254592896, 'eval_runtime': 82.9571, 'eval_samples_per_second': 11.994, 'eval_steps_per_second': 6.003, 'epoch': 0.59}


 20%|█▉        | 2650/13473 [7:32:23<5:27:42,  1.82s/it] 

{'loss': 0.5438, 'grad_norm': 0.41712960600852966, 'learning_rate': 4.171355893008557e-05, 'epoch': 0.59}


                                                        
 20%|█▉        | 2650/13473 [7:33:44<5:27:42,  1.82s/it]

{'eval_loss': 0.7249210476875305, 'eval_runtime': 81.2113, 'eval_samples_per_second': 12.252, 'eval_steps_per_second': 6.132, 'epoch': 0.59}


 20%|█▉        | 2660/13473 [7:33:53<5:22:37,  1.79s/it] 

{'loss': 0.6377, 'grad_norm': 0.4302455186843872, 'learning_rate': 4.167501734371387e-05, 'epoch': 0.59}


                                                        
 20%|█▉        | 2660/13473 [7:35:12<5:22:37,  1.79s/it]

{'eval_loss': 0.7179995179176331, 'eval_runtime': 79.7416, 'eval_samples_per_second': 12.478, 'eval_steps_per_second': 6.245, 'epoch': 0.59}


 20%|█▉        | 2670/13473 [7:35:21<5:16:49,  1.76s/it] 

{'loss': 0.8395, 'grad_norm': 1.28777015209198, 'learning_rate': 4.163647575734217e-05, 'epoch': 0.59}


                                                        
 20%|█▉        | 2670/13473 [7:36:41<5:16:49,  1.76s/it]

{'eval_loss': 0.7103638052940369, 'eval_runtime': 79.5997, 'eval_samples_per_second': 12.5, 'eval_steps_per_second': 6.256, 'epoch': 0.59}


 20%|█▉        | 2680/13473 [7:36:49<5:16:41,  1.76s/it] 

{'loss': 1.3472, 'grad_norm': 11.244604110717773, 'learning_rate': 4.159793417097048e-05, 'epoch': 0.6}


                                                        
 20%|█▉        | 2680/13473 [7:38:12<5:16:41,  1.76s/it]

{'eval_loss': 0.7102010250091553, 'eval_runtime': 82.8963, 'eval_samples_per_second': 12.003, 'eval_steps_per_second': 6.008, 'epoch': 0.6}


 20%|█▉        | 2690/13473 [7:38:21<5:21:08,  1.79s/it] 

{'loss': 1.0954, 'grad_norm': 0.8396473526954651, 'learning_rate': 4.155939258459878e-05, 'epoch': 0.6}


                                                        
 20%|█▉        | 2690/13473 [7:39:44<5:21:08,  1.79s/it]

{'eval_loss': 0.7071065306663513, 'eval_runtime': 83.8435, 'eval_samples_per_second': 11.867, 'eval_steps_per_second': 5.94, 'epoch': 0.6}


 20%|██        | 2700/13473 [7:39:53<5:17:59,  1.77s/it] 

{'loss': 0.7322, 'grad_norm': 4.626010417938232, 'learning_rate': 4.152085099822709e-05, 'epoch': 0.6}


                                                        
 20%|██        | 2700/13473 [7:41:16<5:17:59,  1.77s/it]

{'eval_loss': 0.7313265800476074, 'eval_runtime': 83.6158, 'eval_samples_per_second': 11.9, 'eval_steps_per_second': 5.956, 'epoch': 0.6}


 20%|██        | 2710/13473 [7:41:25<5:29:19,  1.84s/it] 

{'loss': 0.9501, 'grad_norm': 0.37793460488319397, 'learning_rate': 4.148230941185539e-05, 'epoch': 0.6}


                                                        
 20%|██        | 2710/13473 [7:42:52<5:29:19,  1.84s/it]

{'eval_loss': 0.7068334221839905, 'eval_runtime': 87.4615, 'eval_samples_per_second': 11.376, 'eval_steps_per_second': 5.694, 'epoch': 0.6}


 20%|██        | 2720/13473 [7:43:01<5:34:41,  1.87s/it] 

{'loss': 0.3605, 'grad_norm': 0.38565337657928467, 'learning_rate': 4.14437678254837e-05, 'epoch': 0.61}


                                                        
 20%|██        | 2720/13473 [7:44:26<5:34:41,  1.87s/it]

{'eval_loss': 0.7182583212852478, 'eval_runtime': 85.348, 'eval_samples_per_second': 11.658, 'eval_steps_per_second': 5.835, 'epoch': 0.61}


 20%|██        | 2730/13473 [7:44:35<5:23:30,  1.81s/it] 

{'loss': 0.4946, 'grad_norm': 0.3958157002925873, 'learning_rate': 4.1405226239112006e-05, 'epoch': 0.61}


                                                        
 20%|██        | 2730/13473 [7:46:01<5:23:30,  1.81s/it]

{'eval_loss': 0.7230778932571411, 'eval_runtime': 86.0721, 'eval_samples_per_second': 11.56, 'eval_steps_per_second': 5.786, 'epoch': 0.61}


 20%|██        | 2740/13473 [7:46:09<5:30:34,  1.85s/it] 

{'loss': 0.7439, 'grad_norm': 0.4530039131641388, 'learning_rate': 4.136668465274031e-05, 'epoch': 0.61}


                                                        
 20%|██        | 2740/13473 [7:47:35<5:30:34,  1.85s/it]

{'eval_loss': 0.7122494578361511, 'eval_runtime': 85.7772, 'eval_samples_per_second': 11.6, 'eval_steps_per_second': 5.806, 'epoch': 0.61}


 20%|██        | 2750/13473 [7:47:43<5:25:15,  1.82s/it] 

{'loss': 0.9765, 'grad_norm': 1.4748388528823853, 'learning_rate': 4.1328143066368615e-05, 'epoch': 0.61}


                                                        
 20%|██        | 2750/13473 [7:49:05<5:25:15,  1.82s/it]

{'eval_loss': 0.7129647135734558, 'eval_runtime': 81.5589, 'eval_samples_per_second': 12.2, 'eval_steps_per_second': 6.106, 'epoch': 0.61}


 20%|██        | 2760/13473 [7:49:13<5:15:54,  1.77s/it] 

{'loss': 0.8231, 'grad_norm': 1.1170254945755005, 'learning_rate': 4.128960147999692e-05, 'epoch': 0.61}


                                                        
 20%|██        | 2760/13473 [7:50:48<5:15:54,  1.77s/it]

{'eval_loss': 0.7237991690635681, 'eval_runtime': 95.2299, 'eval_samples_per_second': 10.448, 'eval_steps_per_second': 5.229, 'epoch': 0.61}


 21%|██        | 2770/13473 [7:50:57<5:44:37,  1.93s/it] 

{'loss': 0.7487, 'grad_norm': 0.9050852060317993, 'learning_rate': 4.1251059893625224e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2770/13473 [7:52:30<5:44:37,  1.93s/it]

{'eval_loss': 0.7130987048149109, 'eval_runtime': 93.5406, 'eval_samples_per_second': 10.637, 'eval_steps_per_second': 5.324, 'epoch': 0.62}


 21%|██        | 2780/13473 [7:52:39<5:45:18,  1.94s/it] 

{'loss': 0.8444, 'grad_norm': 3.4775331020355225, 'learning_rate': 4.1212518307253526e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2780/13473 [7:54:15<5:45:18,  1.94s/it]

{'eval_loss': 0.7111606597900391, 'eval_runtime': 96.1877, 'eval_samples_per_second': 10.344, 'eval_steps_per_second': 5.177, 'epoch': 0.62}


 21%|██        | 2790/13473 [7:54:23<5:52:59,  1.98s/it] 

{'loss': 1.0178, 'grad_norm': 0.5547999143600464, 'learning_rate': 4.1173976720881834e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2790/13473 [7:55:50<5:52:59,  1.98s/it]

{'eval_loss': 0.7078545689582825, 'eval_runtime': 87.2516, 'eval_samples_per_second': 11.404, 'eval_steps_per_second': 5.708, 'epoch': 0.62}


 21%|██        | 2800/13473 [7:55:59<5:34:45,  1.88s/it] 

{'loss': 1.0723, 'grad_norm': 0.5110867023468018, 'learning_rate': 4.1135435134510135e-05, 'epoch': 0.62}


                                                        
 21%|██        | 2800/13473 [7:57:31<5:34:45,  1.88s/it]

{'eval_loss': 0.7039186954498291, 'eval_runtime': 91.6279, 'eval_samples_per_second': 10.859, 'eval_steps_per_second': 5.435, 'epoch': 0.62}


 21%|██        | 2810/13473 [7:57:39<5:40:07,  1.91s/it] 

{'loss': 1.1761, 'grad_norm': 0.426803320646286, 'learning_rate': 4.109689354813844e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2810/13473 [7:59:16<5:40:07,  1.91s/it]

{'eval_loss': 0.7198567390441895, 'eval_runtime': 96.8446, 'eval_samples_per_second': 10.274, 'eval_steps_per_second': 5.142, 'epoch': 0.63}


 21%|██        | 2820/13473 [7:59:24<5:44:32,  1.94s/it] 

{'loss': 0.8533, 'grad_norm': 1.3366928100585938, 'learning_rate': 4.105835196176675e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2820/13473 [8:01:00<5:44:32,  1.94s/it]

{'eval_loss': 0.7157492637634277, 'eval_runtime': 96.1159, 'eval_samples_per_second': 10.352, 'eval_steps_per_second': 5.181, 'epoch': 0.63}


 21%|██        | 2830/13473 [8:01:10<6:06:03,  2.06s/it] 

{'loss': 0.59, 'grad_norm': 0.8526480793952942, 'learning_rate': 4.101981037539505e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2830/13473 [8:02:45<6:06:03,  2.06s/it]

{'eval_loss': 0.7171306014060974, 'eval_runtime': 94.7557, 'eval_samples_per_second': 10.501, 'eval_steps_per_second': 5.256, 'epoch': 0.63}


 21%|██        | 2840/13473 [8:02:54<5:45:35,  1.95s/it] 

{'loss': 0.69, 'grad_norm': 0.48200973868370056, 'learning_rate': 4.098126878902336e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2840/13473 [8:04:31<5:45:35,  1.95s/it]

{'eval_loss': 0.7182706594467163, 'eval_runtime': 96.9795, 'eval_samples_per_second': 10.26, 'eval_steps_per_second': 5.135, 'epoch': 0.63}


 21%|██        | 2850/13473 [8:04:39<5:45:51,  1.95s/it] 

{'loss': 1.4004, 'grad_norm': 6.013093948364258, 'learning_rate': 4.094272720265167e-05, 'epoch': 0.63}


                                                        
 21%|██        | 2850/13473 [8:06:11<5:45:51,  1.95s/it]

{'eval_loss': 0.7078231573104858, 'eval_runtime': 92.4096, 'eval_samples_per_second': 10.767, 'eval_steps_per_second': 5.389, 'epoch': 0.63}


 21%|██        | 2860/13473 [8:06:20<5:48:22,  1.97s/it] 

{'loss': 0.735, 'grad_norm': 0.5813180804252625, 'learning_rate': 4.090418561627997e-05, 'epoch': 0.64}


                                                        
 21%|██        | 2860/13473 [8:07:47<5:48:22,  1.97s/it]

{'eval_loss': 0.7056270241737366, 'eval_runtime': 87.115, 'eval_samples_per_second': 11.422, 'eval_steps_per_second': 5.717, 'epoch': 0.64}


 21%|██▏       | 2870/13473 [8:07:56<5:30:46,  1.87s/it] 

{'loss': 0.9437, 'grad_norm': 0.6334320902824402, 'learning_rate': 4.086564402990827e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2870/13473 [8:09:29<5:30:46,  1.87s/it]

{'eval_loss': 0.7025587558746338, 'eval_runtime': 93.497, 'eval_samples_per_second': 10.642, 'eval_steps_per_second': 5.326, 'epoch': 0.64}


 21%|██▏       | 2880/13473 [8:09:38<5:41:36,  1.93s/it] 

{'loss': 1.212, 'grad_norm': 0.5532926321029663, 'learning_rate': 4.082710244353658e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2880/13473 [8:11:10<5:41:36,  1.93s/it]

{'eval_loss': 0.7017965316772461, 'eval_runtime': 91.8806, 'eval_samples_per_second': 10.829, 'eval_steps_per_second': 5.42, 'epoch': 0.64}


 21%|██▏       | 2890/13473 [8:11:18<5:35:14,  1.90s/it] 

{'loss': 0.3955, 'grad_norm': 0.49779343605041504, 'learning_rate': 4.078856085716488e-05, 'epoch': 0.64}


                                                        
 21%|██▏       | 2890/13473 [8:12:48<5:35:14,  1.90s/it]

{'eval_loss': 0.7118120193481445, 'eval_runtime': 89.3819, 'eval_samples_per_second': 11.132, 'eval_steps_per_second': 5.572, 'epoch': 0.64}


 22%|██▏       | 2900/13473 [8:12:56<5:27:16,  1.86s/it] 

{'loss': 0.4836, 'grad_norm': 0.29426395893096924, 'learning_rate': 4.075001927079319e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2900/13473 [8:14:13<5:27:16,  1.86s/it]

{'eval_loss': 0.7240875959396362, 'eval_runtime': 76.6241, 'eval_samples_per_second': 12.985, 'eval_steps_per_second': 6.499, 'epoch': 0.65}


 22%|██▏       | 2910/13473 [8:14:21<4:57:58,  1.69s/it] 

{'loss': 0.4235, 'grad_norm': 0.3697238564491272, 'learning_rate': 4.0711477684421495e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2910/13473 [8:15:41<4:57:58,  1.69s/it]

{'eval_loss': 0.7280880808830261, 'eval_runtime': 80.2419, 'eval_samples_per_second': 12.4, 'eval_steps_per_second': 6.206, 'epoch': 0.65}


 22%|██▏       | 2920/13473 [8:15:49<5:13:08,  1.78s/it] 

{'loss': 0.5288, 'grad_norm': 0.989150881767273, 'learning_rate': 4.0672936098049796e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2920/13473 [8:17:10<5:13:08,  1.78s/it]

{'eval_loss': 0.7223702669143677, 'eval_runtime': 80.4823, 'eval_samples_per_second': 12.363, 'eval_steps_per_second': 6.188, 'epoch': 0.65}


 22%|██▏       | 2930/13473 [8:17:18<5:08:46,  1.76s/it] 

{'loss': 0.5462, 'grad_norm': 0.5804762244224548, 'learning_rate': 4.0634394511678104e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2930/13473 [8:18:55<5:08:46,  1.76s/it]

{'eval_loss': 0.7147807478904724, 'eval_runtime': 96.5345, 'eval_samples_per_second': 10.307, 'eval_steps_per_second': 5.159, 'epoch': 0.65}


 22%|██▏       | 2940/13473 [8:19:08<6:58:56,  2.39s/it] 

{'loss': 1.2605, 'grad_norm': 0.3583478331565857, 'learning_rate': 4.059585292530641e-05, 'epoch': 0.65}


                                                        
 22%|██▏       | 2940/13473 [8:21:15<6:58:56,  2.39s/it]

{'eval_loss': 0.7049083113670349, 'eval_runtime': 126.6423, 'eval_samples_per_second': 7.857, 'eval_steps_per_second': 3.932, 'epoch': 0.65}


 22%|██▏       | 2950/13473 [8:21:29<8:40:01,  2.97s/it]  

{'loss': 0.641, 'grad_norm': 1.6882407665252686, 'learning_rate': 4.0557311338934713e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2950/13473 [8:23:02<8:40:01,  2.97s/it]

{'eval_loss': 0.7065035700798035, 'eval_runtime': 93.0156, 'eval_samples_per_second': 10.697, 'eval_steps_per_second': 5.354, 'epoch': 0.66}


 22%|██▏       | 2960/13473 [8:23:13<6:11:38,  2.12s/it] 

{'loss': 1.2774, 'grad_norm': 0.366055965423584, 'learning_rate': 4.051876975256302e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2960/13473 [8:24:32<6:11:38,  2.12s/it]

{'eval_loss': 0.7012293934822083, 'eval_runtime': 79.2066, 'eval_samples_per_second': 12.562, 'eval_steps_per_second': 6.287, 'epoch': 0.66}


 22%|██▏       | 2970/13473 [8:24:40<5:09:51,  1.77s/it] 

{'loss': 0.7788, 'grad_norm': 0.4928872883319855, 'learning_rate': 4.048022816619132e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2970/13473 [8:26:03<5:09:51,  1.77s/it]

{'eval_loss': 0.7048684358596802, 'eval_runtime': 82.9771, 'eval_samples_per_second': 11.991, 'eval_steps_per_second': 6.002, 'epoch': 0.66}


 22%|██▏       | 2980/13473 [8:26:12<5:18:14,  1.82s/it] 

{'loss': 0.6445, 'grad_norm': 1.8266513347625732, 'learning_rate': 4.0441686579819624e-05, 'epoch': 0.66}


                                                        
 22%|██▏       | 2980/13473 [8:27:33<5:18:14,  1.82s/it]

{'eval_loss': 0.7075966000556946, 'eval_runtime': 80.7873, 'eval_samples_per_second': 12.316, 'eval_steps_per_second': 6.164, 'epoch': 0.66}


 22%|██▏       | 2990/13473 [8:27:42<5:40:10,  1.95s/it] 

{'loss': 0.5742, 'grad_norm': 0.47445160150527954, 'learning_rate': 4.040314499344793e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 2990/13473 [8:29:03<5:40:10,  1.95s/it]

{'eval_loss': 0.7112367749214172, 'eval_runtime': 81.1572, 'eval_samples_per_second': 12.26, 'eval_steps_per_second': 6.136, 'epoch': 0.67}


 22%|██▏       | 3000/13473 [8:29:11<5:07:28,  1.76s/it] 

{'loss': 1.1377, 'grad_norm': 7.739402770996094, 'learning_rate': 4.036460340707623e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 3000/13473 [8:30:32<5:07:28,  1.76s/it]

{'eval_loss': 0.7111204266548157, 'eval_runtime': 80.9486, 'eval_samples_per_second': 12.292, 'eval_steps_per_second': 6.152, 'epoch': 0.67}


 22%|██▏       | 3010/13473 [8:30:42<5:09:44,  1.78s/it] 

{'loss': 1.0117, 'grad_norm': 0.3951021730899811, 'learning_rate': 4.032606182070454e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 3010/13473 [8:32:06<5:09:44,  1.78s/it]

{'eval_loss': 0.7028985619544983, 'eval_runtime': 83.9064, 'eval_samples_per_second': 11.858, 'eval_steps_per_second': 5.935, 'epoch': 0.67}


 22%|██▏       | 3020/13473 [8:32:14<5:11:19,  1.79s/it] 

{'loss': 0.673, 'grad_norm': 0.5428889393806458, 'learning_rate': 4.028752023433285e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 3020/13473 [8:33:42<5:11:19,  1.79s/it]

{'eval_loss': 0.7058576345443726, 'eval_runtime': 87.9643, 'eval_samples_per_second': 11.311, 'eval_steps_per_second': 5.661, 'epoch': 0.67}


 22%|██▏       | 3030/13473 [8:33:52<5:29:47,  1.89s/it] 

{'loss': 0.7698, 'grad_norm': 0.5332357883453369, 'learning_rate': 4.024897864796115e-05, 'epoch': 0.67}


                                                        
 22%|██▏       | 3030/13473 [8:35:16<5:29:47,  1.89s/it]

{'eval_loss': 0.7040418386459351, 'eval_runtime': 84.5101, 'eval_samples_per_second': 11.774, 'eval_steps_per_second': 5.893, 'epoch': 0.67}


 23%|██▎       | 3040/13473 [8:35:25<5:15:10,  1.81s/it] 

{'loss': 0.8596, 'grad_norm': 3.813739061355591, 'learning_rate': 4.021043706158946e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 3040/13473 [8:37:02<5:15:10,  1.81s/it]

{'eval_loss': 0.7046043276786804, 'eval_runtime': 96.9777, 'eval_samples_per_second': 10.26, 'eval_steps_per_second': 5.135, 'epoch': 0.68}


 23%|██▎       | 3050/13473 [8:37:13<6:34:11,  2.27s/it] 

{'loss': 0.8069, 'grad_norm': 0.49261024594306946, 'learning_rate': 4.0171895475217766e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 3050/13473 [8:39:02<6:34:11,  2.27s/it]

{'eval_loss': 0.7026959657669067, 'eval_runtime': 108.7536, 'eval_samples_per_second': 9.149, 'eval_steps_per_second': 4.579, 'epoch': 0.68}


 23%|██▎       | 3060/13473 [8:39:13<6:53:38,  2.38s/it]  

{'loss': 1.3552, 'grad_norm': 0.5431330800056458, 'learning_rate': 4.013335388884607e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 3060/13473 [8:40:43<6:53:38,  2.38s/it]

{'eval_loss': 0.7055066823959351, 'eval_runtime': 89.8216, 'eval_samples_per_second': 11.078, 'eval_steps_per_second': 5.544, 'epoch': 0.68}


 23%|██▎       | 3070/13473 [8:40:51<5:29:36,  1.90s/it] 

{'loss': 0.3863, 'grad_norm': 0.42497795820236206, 'learning_rate': 4.009481230247437e-05, 'epoch': 0.68}


                                                        
 23%|██▎       | 3070/13473 [8:42:14<5:29:36,  1.90s/it]

{'eval_loss': 0.7154579162597656, 'eval_runtime': 82.6237, 'eval_samples_per_second': 12.043, 'eval_steps_per_second': 6.027, 'epoch': 0.68}


 23%|██▎       | 3080/13473 [8:42:22<5:14:18,  1.81s/it] 

{'loss': 0.7399, 'grad_norm': 1.9040336608886719, 'learning_rate': 4.0056270716102676e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 3080/13473 [8:43:42<5:14:18,  1.81s/it]

{'eval_loss': 0.736663281917572, 'eval_runtime': 79.6107, 'eval_samples_per_second': 12.498, 'eval_steps_per_second': 6.255, 'epoch': 0.69}


 23%|██▎       | 3090/13473 [8:43:50<5:00:27,  1.74s/it] 

{'loss': 0.6294, 'grad_norm': 1.8581323623657227, 'learning_rate': 4.001772912973098e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 3090/13473 [8:45:21<5:00:27,  1.74s/it]

{'eval_loss': 0.7137787342071533, 'eval_runtime': 90.8533, 'eval_samples_per_second': 10.952, 'eval_steps_per_second': 5.481, 'epoch': 0.69}


 23%|██▎       | 3100/13473 [8:45:29<5:25:33,  1.88s/it] 

{'loss': 0.7949, 'grad_norm': 1.1099662780761719, 'learning_rate': 3.9979187543359285e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 3100/13473 [8:46:59<5:25:33,  1.88s/it]

{'eval_loss': 0.7050744295120239, 'eval_runtime': 90.0699, 'eval_samples_per_second': 11.047, 'eval_steps_per_second': 5.529, 'epoch': 0.69}


 23%|██▎       | 3110/13473 [8:47:08<5:23:19,  1.87s/it] 

{'loss': 0.7586, 'grad_norm': 0.4220365881919861, 'learning_rate': 3.994064595698759e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 3110/13473 [8:48:39<5:23:19,  1.87s/it]

{'eval_loss': 0.716423749923706, 'eval_runtime': 91.8326, 'eval_samples_per_second': 10.835, 'eval_steps_per_second': 5.423, 'epoch': 0.69}


 23%|██▎       | 3120/13473 [8:48:48<5:32:14,  1.93s/it] 

{'loss': 0.8058, 'grad_norm': 0.4619005620479584, 'learning_rate': 3.9902104370615895e-05, 'epoch': 0.69}


                                                        
 23%|██▎       | 3120/13473 [8:50:17<5:32:14,  1.93s/it]

{'eval_loss': 0.7071946263313293, 'eval_runtime': 88.5138, 'eval_samples_per_second': 11.241, 'eval_steps_per_second': 5.626, 'epoch': 0.69}


 23%|██▎       | 3130/13473 [8:50:25<5:19:15,  1.85s/it] 

{'loss': 0.7321, 'grad_norm': 0.3469894826412201, 'learning_rate': 3.98635627842442e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 3130/13473 [8:51:53<5:19:15,  1.85s/it]

{'eval_loss': 0.7127995491027832, 'eval_runtime': 88.0817, 'eval_samples_per_second': 11.296, 'eval_steps_per_second': 5.654, 'epoch': 0.7}


 23%|██▎       | 3140/13473 [8:52:02<5:19:46,  1.86s/it] 

{'loss': 0.5805, 'grad_norm': 0.6825297474861145, 'learning_rate': 3.982502119787251e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 3140/13473 [8:53:25<5:19:46,  1.86s/it]

{'eval_loss': 0.7146289348602295, 'eval_runtime': 83.2639, 'eval_samples_per_second': 11.95, 'eval_steps_per_second': 5.981, 'epoch': 0.7}


 23%|██▎       | 3150/13473 [8:53:33<5:07:34,  1.79s/it] 

{'loss': 0.688, 'grad_norm': 0.37746769189834595, 'learning_rate': 3.978647961150081e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 3150/13473 [8:55:02<5:07:34,  1.79s/it]

{'eval_loss': 0.7073022127151489, 'eval_runtime': 88.7211, 'eval_samples_per_second': 11.215, 'eval_steps_per_second': 5.613, 'epoch': 0.7}


 23%|██▎       | 3160/13473 [8:55:11<5:25:46,  1.90s/it] 

{'loss': 0.6078, 'grad_norm': 0.6343910694122314, 'learning_rate': 3.974793802512912e-05, 'epoch': 0.7}


                                                        
 23%|██▎       | 3160/13473 [8:56:41<5:25:46,  1.90s/it]

{'eval_loss': 0.711322546005249, 'eval_runtime': 90.1214, 'eval_samples_per_second': 11.041, 'eval_steps_per_second': 5.526, 'epoch': 0.7}


 24%|██▎       | 3170/13473 [8:56:50<5:30:34,  1.93s/it] 

{'loss': 0.5036, 'grad_norm': 0.33948951959609985, 'learning_rate': 3.970939643875742e-05, 'epoch': 0.71}


                                                        
 24%|██▎       | 3170/13473 [8:58:17<5:30:34,  1.93s/it]

{'eval_loss': 0.7219131588935852, 'eval_runtime': 87.2052, 'eval_samples_per_second': 11.41, 'eval_steps_per_second': 5.711, 'epoch': 0.71}


 24%|██▎       | 3180/13473 [8:58:25<5:20:13,  1.87s/it] 

{'loss': 0.5902, 'grad_norm': 0.2998974025249481, 'learning_rate': 3.967085485238572e-05, 'epoch': 0.71}


                                                        
 24%|██▎       | 3180/13473 [8:59:55<5:20:13,  1.87s/it]

{'eval_loss': 0.7391805052757263, 'eval_runtime': 89.4415, 'eval_samples_per_second': 11.125, 'eval_steps_per_second': 5.568, 'epoch': 0.71}


 24%|██▎       | 3190/13473 [9:00:04<5:22:43,  1.88s/it] 

{'loss': 0.8026, 'grad_norm': 2.0898916721343994, 'learning_rate': 3.963231326601403e-05, 'epoch': 0.71}


                                                        
 24%|██▎       | 3190/13473 [9:01:32<5:22:43,  1.88s/it]

{'eval_loss': 0.7132241725921631, 'eval_runtime': 88.1539, 'eval_samples_per_second': 11.287, 'eval_steps_per_second': 5.649, 'epoch': 0.71}


 24%|██▍       | 3200/13473 [9:01:40<5:13:13,  1.83s/it] 

{'loss': 0.57, 'grad_norm': 0.7613605260848999, 'learning_rate': 3.959377167964234e-05, 'epoch': 0.71}


                                                        
 24%|██▍       | 3200/13473 [9:03:10<5:13:13,  1.83s/it]

{'eval_loss': 0.7128012180328369, 'eval_runtime': 89.5956, 'eval_samples_per_second': 11.105, 'eval_steps_per_second': 5.558, 'epoch': 0.71}


 24%|██▍       | 3210/13473 [9:03:19<5:41:13,  1.99s/it] 

{'loss': 0.8434, 'grad_norm': 0.38303494453430176, 'learning_rate': 3.955523009327064e-05, 'epoch': 0.71}


                                                        
 24%|██▍       | 3210/13473 [9:04:51<5:41:13,  1.99s/it]

{'eval_loss': 0.7044512033462524, 'eval_runtime': 92.4494, 'eval_samples_per_second': 10.763, 'eval_steps_per_second': 5.387, 'epoch': 0.71}


 24%|██▍       | 3220/13473 [9:05:00<5:29:13,  1.93s/it] 

{'loss': 0.6767, 'grad_norm': 0.4678718149662018, 'learning_rate': 3.951668850689895e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 3220/13473 [9:06:29<5:29:13,  1.93s/it]

{'eval_loss': 0.7034208178520203, 'eval_runtime': 89.0751, 'eval_samples_per_second': 11.17, 'eval_steps_per_second': 5.591, 'epoch': 0.72}


 24%|██▍       | 3230/13473 [9:06:37<5:16:20,  1.85s/it] 

{'loss': 0.5933, 'grad_norm': 0.4600580930709839, 'learning_rate': 3.947814692052725e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 3230/13473 [9:08:11<5:16:20,  1.85s/it]

{'eval_loss': 0.7063741683959961, 'eval_runtime': 93.6938, 'eval_samples_per_second': 10.62, 'eval_steps_per_second': 5.315, 'epoch': 0.72}


 24%|██▍       | 3240/13473 [9:08:20<5:34:39,  1.96s/it] 

{'loss': 0.5121, 'grad_norm': 0.7455105185508728, 'learning_rate': 3.9439605334155556e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 3240/13473 [9:09:54<5:34:39,  1.96s/it]

{'eval_loss': 0.7176392078399658, 'eval_runtime': 94.2537, 'eval_samples_per_second': 10.557, 'eval_steps_per_second': 5.284, 'epoch': 0.72}


 24%|██▍       | 3250/13473 [9:10:02<5:27:30,  1.92s/it] 

{'loss': 0.6524, 'grad_norm': 0.5217342972755432, 'learning_rate': 3.9401063747783864e-05, 'epoch': 0.72}


                                                        
 24%|██▍       | 3250/13473 [9:11:29<5:27:30,  1.92s/it]

{'eval_loss': 0.7265104055404663, 'eval_runtime': 87.201, 'eval_samples_per_second': 11.41, 'eval_steps_per_second': 5.711, 'epoch': 0.72}


 24%|██▍       | 3260/13473 [9:11:38<5:17:54,  1.87s/it] 

{'loss': 1.0507, 'grad_norm': 0.502039909362793, 'learning_rate': 3.9362522161412165e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3260/13473 [9:13:08<5:17:54,  1.87s/it]

{'eval_loss': 0.7127982378005981, 'eval_runtime': 90.2993, 'eval_samples_per_second': 11.019, 'eval_steps_per_second': 5.515, 'epoch': 0.73}


 24%|██▍       | 3270/13473 [9:13:19<5:33:44,  1.96s/it] 

{'loss': 0.634, 'grad_norm': 0.7866672873497009, 'learning_rate': 3.932398057504047e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3270/13473 [9:14:50<5:33:44,  1.96s/it]

{'eval_loss': 0.7030744552612305, 'eval_runtime': 91.1108, 'eval_samples_per_second': 10.921, 'eval_steps_per_second': 5.466, 'epoch': 0.73}


 24%|██▍       | 3280/13473 [9:14:59<5:25:28,  1.92s/it] 

{'loss': 0.5566, 'grad_norm': 0.4141971170902252, 'learning_rate': 3.9285438988668774e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3280/13473 [9:16:25<5:25:28,  1.92s/it]

{'eval_loss': 0.7113415598869324, 'eval_runtime': 86.4414, 'eval_samples_per_second': 11.511, 'eval_steps_per_second': 5.761, 'epoch': 0.73}


 24%|██▍       | 3290/13473 [9:16:34<5:17:08,  1.87s/it] 

{'loss': 0.8179, 'grad_norm': 0.4095412492752075, 'learning_rate': 3.9246897402297076e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3290/13473 [9:18:04<5:17:08,  1.87s/it]

{'eval_loss': 0.7090572118759155, 'eval_runtime': 89.5285, 'eval_samples_per_second': 11.114, 'eval_steps_per_second': 5.562, 'epoch': 0.73}


 24%|██▍       | 3300/13473 [9:18:12<5:17:45,  1.87s/it] 

{'loss': 0.645, 'grad_norm': 0.913985550403595, 'learning_rate': 3.9208355815925384e-05, 'epoch': 0.73}


                                                        
 24%|██▍       | 3300/13473 [9:19:42<5:17:45,  1.87s/it]

{'eval_loss': 0.7050828337669373, 'eval_runtime': 90.4503, 'eval_samples_per_second': 11.001, 'eval_steps_per_second': 5.506, 'epoch': 0.73}


 25%|██▍       | 3310/13473 [9:19:51<5:25:10,  1.92s/it] 

{'loss': 0.5444, 'grad_norm': 0.3796693682670593, 'learning_rate': 3.916981422955369e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3310/13473 [9:21:18<5:25:10,  1.92s/it]

{'eval_loss': 0.7140191793441772, 'eval_runtime': 87.3305, 'eval_samples_per_second': 11.393, 'eval_steps_per_second': 5.702, 'epoch': 0.74}


 25%|██▍       | 3320/13473 [9:21:27<5:16:02,  1.87s/it] 

{'loss': 0.6141, 'grad_norm': 0.3153989911079407, 'learning_rate': 3.913127264318199e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3320/13473 [9:23:01<5:16:02,  1.87s/it]

{'eval_loss': 0.7145726084709167, 'eval_runtime': 93.6063, 'eval_samples_per_second': 10.63, 'eval_steps_per_second': 5.32, 'epoch': 0.74}


 25%|██▍       | 3330/13473 [9:23:10<5:38:30,  2.00s/it] 

{'loss': 1.028, 'grad_norm': 0.38167157769203186, 'learning_rate': 3.90927310568103e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3330/13473 [9:24:34<5:38:30,  2.00s/it]

{'eval_loss': 0.7081769108772278, 'eval_runtime': 84.2605, 'eval_samples_per_second': 11.809, 'eval_steps_per_second': 5.91, 'epoch': 0.74}


 25%|██▍       | 3340/13473 [9:24:42<5:10:49,  1.84s/it] 

{'loss': 0.649, 'grad_norm': 0.4399332106113434, 'learning_rate': 3.905418947043861e-05, 'epoch': 0.74}


                                                        
 25%|██▍       | 3340/13473 [9:26:06<5:10:49,  1.84s/it]

{'eval_loss': 0.702200710773468, 'eval_runtime': 83.9315, 'eval_samples_per_second': 11.855, 'eval_steps_per_second': 5.933, 'epoch': 0.74}


 25%|██▍       | 3350/13473 [9:26:14<5:05:41,  1.81s/it] 

{'loss': 0.8747, 'grad_norm': 4.243327617645264, 'learning_rate': 3.901564788406691e-05, 'epoch': 0.75}


                                                        
 25%|██▍       | 3350/13473 [9:27:38<5:05:41,  1.81s/it]

{'eval_loss': 0.7060593366622925, 'eval_runtime': 83.719, 'eval_samples_per_second': 11.885, 'eval_steps_per_second': 5.948, 'epoch': 0.75}


 25%|██▍       | 3360/13473 [9:27:47<5:10:57,  1.84s/it] 

{'loss': 0.6429, 'grad_norm': 1.379043459892273, 'learning_rate': 3.897710629769522e-05, 'epoch': 0.75}


                                                        
 25%|██▍       | 3360/13473 [9:29:10<5:10:57,  1.84s/it]

{'eval_loss': 0.7051603198051453, 'eval_runtime': 82.6951, 'eval_samples_per_second': 12.032, 'eval_steps_per_second': 6.022, 'epoch': 0.75}


 25%|██▌       | 3370/13473 [9:29:18<5:08:33,  1.83s/it] 

{'loss': 0.8027, 'grad_norm': 0.3994729518890381, 'learning_rate': 3.893856471132352e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3370/13473 [9:30:43<5:08:33,  1.83s/it]

{'eval_loss': 0.7042989134788513, 'eval_runtime': 84.8101, 'eval_samples_per_second': 11.732, 'eval_steps_per_second': 5.872, 'epoch': 0.75}


 25%|██▌       | 3380/13473 [9:30:51<5:02:02,  1.80s/it] 

{'loss': 0.8543, 'grad_norm': 1.663842797279358, 'learning_rate': 3.890002312495182e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3380/13473 [9:32:13<5:02:02,  1.80s/it]

{'eval_loss': 0.7049670219421387, 'eval_runtime': 81.3664, 'eval_samples_per_second': 12.229, 'eval_steps_per_second': 6.12, 'epoch': 0.75}


 25%|██▌       | 3390/13473 [9:32:21<4:57:46,  1.77s/it] 

{'loss': 0.5771, 'grad_norm': 0.5559636354446411, 'learning_rate': 3.886148153858013e-05, 'epoch': 0.75}


                                                        
 25%|██▌       | 3390/13473 [9:33:46<4:57:46,  1.77s/it]

{'eval_loss': 0.7061144709587097, 'eval_runtime': 84.915, 'eval_samples_per_second': 11.718, 'eval_steps_per_second': 5.865, 'epoch': 0.75}


 25%|██▌       | 3400/13473 [9:33:55<5:02:45,  1.80s/it] 

{'loss': 0.7196, 'grad_norm': 0.3319026231765747, 'learning_rate': 3.8822939952208436e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3400/13473 [9:35:19<5:02:45,  1.80s/it]

{'eval_loss': 0.7085961103439331, 'eval_runtime': 84.2927, 'eval_samples_per_second': 11.804, 'eval_steps_per_second': 5.908, 'epoch': 0.76}


 25%|██▌       | 3410/13473 [9:35:27<5:01:16,  1.80s/it] 

{'loss': 0.6769, 'grad_norm': 0.8461439609527588, 'learning_rate': 3.878439836583674e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3410/13473 [9:36:52<5:01:16,  1.80s/it]

{'eval_loss': 0.7116938233375549, 'eval_runtime': 84.7966, 'eval_samples_per_second': 11.734, 'eval_steps_per_second': 5.873, 'epoch': 0.76}


 25%|██▌       | 3420/13473 [9:37:01<5:10:03,  1.85s/it] 

{'loss': 0.5426, 'grad_norm': 0.27959883213043213, 'learning_rate': 3.8745856779465045e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3420/13473 [9:38:23<5:10:03,  1.85s/it]

{'eval_loss': 0.7187650799751282, 'eval_runtime': 82.107, 'eval_samples_per_second': 12.118, 'eval_steps_per_second': 6.065, 'epoch': 0.76}


 25%|██▌       | 3430/13473 [9:38:31<4:57:20,  1.78s/it] 

{'loss': 0.3521, 'grad_norm': 0.43428468704223633, 'learning_rate': 3.870731519309335e-05, 'epoch': 0.76}


                                                        
 25%|██▌       | 3430/13473 [9:39:54<4:57:20,  1.78s/it]

{'eval_loss': 0.7547866106033325, 'eval_runtime': 82.9388, 'eval_samples_per_second': 11.997, 'eval_steps_per_second': 6.004, 'epoch': 0.76}


 26%|██▌       | 3440/13473 [9:40:03<4:58:49,  1.79s/it] 

{'loss': 1.7108, 'grad_norm': 0.3402349650859833, 'learning_rate': 3.8668773606721654e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3440/13473 [9:41:24<4:58:49,  1.79s/it]

{'eval_loss': 0.7123502492904663, 'eval_runtime': 81.5223, 'eval_samples_per_second': 12.205, 'eval_steps_per_second': 6.109, 'epoch': 0.77}


 26%|██▌       | 3450/13473 [9:41:33<5:13:03,  1.87s/it] 

{'loss': 0.7049, 'grad_norm': 0.3502977192401886, 'learning_rate': 3.863023202034996e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3450/13473 [9:42:57<5:13:03,  1.87s/it]

{'eval_loss': 0.7070100903511047, 'eval_runtime': 83.7037, 'eval_samples_per_second': 11.887, 'eval_steps_per_second': 5.95, 'epoch': 0.77}


 26%|██▌       | 3460/13473 [9:43:05<5:06:46,  1.84s/it] 

{'loss': 0.5559, 'grad_norm': 0.6129095554351807, 'learning_rate': 3.859169043397827e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3460/13473 [9:44:27<5:06:46,  1.84s/it]

{'eval_loss': 0.7061564922332764, 'eval_runtime': 81.809, 'eval_samples_per_second': 12.162, 'eval_steps_per_second': 6.087, 'epoch': 0.77}


 26%|██▌       | 3470/13473 [9:44:36<5:01:14,  1.81s/it] 

{'loss': 0.8424, 'grad_norm': 0.5535697340965271, 'learning_rate': 3.855314884760657e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3470/13473 [9:46:00<5:01:14,  1.81s/it]

{'eval_loss': 0.7080591917037964, 'eval_runtime': 83.9, 'eval_samples_per_second': 11.859, 'eval_steps_per_second': 5.936, 'epoch': 0.77}


 26%|██▌       | 3480/13473 [9:46:09<5:18:04,  1.91s/it] 

{'loss': 0.9727, 'grad_norm': 1.2516449689865112, 'learning_rate': 3.851460726123487e-05, 'epoch': 0.77}


                                                        
 26%|██▌       | 3480/13473 [9:47:30<5:18:04,  1.91s/it]

{'eval_loss': 0.7118191123008728, 'eval_runtime': 80.8589, 'eval_samples_per_second': 12.305, 'eval_steps_per_second': 6.159, 'epoch': 0.77}


 26%|██▌       | 3490/13473 [9:47:38<4:54:26,  1.77s/it] 

{'loss': 0.7176, 'grad_norm': 2.569148302078247, 'learning_rate': 3.8476065674863174e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3490/13473 [9:49:02<4:54:26,  1.77s/it]

{'eval_loss': 0.7051393389701843, 'eval_runtime': 83.7691, 'eval_samples_per_second': 11.878, 'eval_steps_per_second': 5.945, 'epoch': 0.78}


 26%|██▌       | 3500/13473 [9:49:11<5:05:05,  1.84s/it] 

{'loss': 0.6513, 'grad_norm': 1.865470290184021, 'learning_rate': 3.843752408849148e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3500/13473 [9:50:34<5:05:05,  1.84s/it]

{'eval_loss': 0.7060407996177673, 'eval_runtime': 82.4109, 'eval_samples_per_second': 12.074, 'eval_steps_per_second': 6.043, 'epoch': 0.78}


 26%|██▌       | 3510/13473 [9:50:45<5:17:55,  1.91s/it] 

{'loss': 1.2612, 'grad_norm': 0.8635521531105042, 'learning_rate': 3.839898250211979e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3510/13473 [9:52:10<5:17:55,  1.91s/it]

{'eval_loss': 0.7043155431747437, 'eval_runtime': 85.4638, 'eval_samples_per_second': 11.642, 'eval_steps_per_second': 5.827, 'epoch': 0.78}


 26%|██▌       | 3520/13473 [9:52:19<5:03:10,  1.83s/it] 

{'loss': 0.6433, 'grad_norm': 0.6883304119110107, 'learning_rate': 3.836044091574809e-05, 'epoch': 0.78}


                                                        
 26%|██▌       | 3520/13473 [9:53:44<5:03:10,  1.83s/it]

{'eval_loss': 0.7039920091629028, 'eval_runtime': 85.3234, 'eval_samples_per_second': 11.662, 'eval_steps_per_second': 5.837, 'epoch': 0.78}


 26%|██▌       | 3530/13473 [9:53:53<5:06:28,  1.85s/it] 

{'loss': 0.4432, 'grad_norm': 0.6002709865570068, 'learning_rate': 3.83218993293764e-05, 'epoch': 0.79}


                                                        
 26%|██▌       | 3530/13473 [9:55:25<5:06:28,  1.85s/it]

{'eval_loss': 0.7211416959762573, 'eval_runtime': 92.1958, 'eval_samples_per_second': 10.792, 'eval_steps_per_second': 5.402, 'epoch': 0.79}


 26%|██▋       | 3540/13473 [9:55:33<5:13:54,  1.90s/it] 

{'loss': 0.8561, 'grad_norm': 0.29821252822875977, 'learning_rate': 3.828335774300471e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3540/13473 [9:57:02<5:13:54,  1.90s/it]

{'eval_loss': 0.7252213358879089, 'eval_runtime': 88.176, 'eval_samples_per_second': 11.284, 'eval_steps_per_second': 5.648, 'epoch': 0.79}


 26%|██▋       | 3550/13473 [9:57:10<5:16:54,  1.92s/it] 

{'loss': 0.5207, 'grad_norm': 0.3620319962501526, 'learning_rate': 3.824481615663301e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3550/13473 [9:58:39<5:16:54,  1.92s/it]

{'eval_loss': 0.7130140662193298, 'eval_runtime': 88.2988, 'eval_samples_per_second': 11.269, 'eval_steps_per_second': 5.64, 'epoch': 0.79}


 26%|██▋       | 3560/13473 [9:58:47<5:13:57,  1.90s/it] 

{'loss': 0.6453, 'grad_norm': 1.3426628112792969, 'learning_rate': 3.8206274570261316e-05, 'epoch': 0.79}


                                                        
 26%|██▋       | 3560/13473 [10:00:11<5:13:57,  1.90s/it]

{'eval_loss': 0.7075694799423218, 'eval_runtime': 83.9178, 'eval_samples_per_second': 11.857, 'eval_steps_per_second': 5.934, 'epoch': 0.79}


 26%|██▋       | 3570/13473 [10:00:20<4:58:19,  1.81s/it] 

{'loss': 0.6696, 'grad_norm': 0.5248305797576904, 'learning_rate': 3.816773298388962e-05, 'epoch': 0.79}


                                                         
 26%|██▋       | 3570/13473 [10:01:49<4:58:19,  1.81s/it]

{'eval_loss': 0.7037575840950012, 'eval_runtime': 88.994, 'eval_samples_per_second': 11.181, 'eval_steps_per_second': 5.596, 'epoch': 0.79}


 27%|██▋       | 3580/13473 [10:01:57<5:09:34,  1.88s/it] 

{'loss': 0.9045, 'grad_norm': 0.6410086154937744, 'learning_rate': 3.812919139751792e-05, 'epoch': 0.8}


                                                         
 27%|██▋       | 3580/13473 [10:03:27<5:09:34,  1.88s/it]

{'eval_loss': 0.7067689299583435, 'eval_runtime': 90.0329, 'eval_samples_per_second': 11.052, 'eval_steps_per_second': 5.531, 'epoch': 0.8}


 27%|██▋       | 3590/13473 [10:03:36<5:29:48,  2.00s/it] 

{'loss': 0.5207, 'grad_norm': 0.460069864988327, 'learning_rate': 3.8090649811146226e-05, 'epoch': 0.8}


                                                         
 27%|██▋       | 3590/13473 [10:05:11<5:29:48,  2.00s/it]

{'eval_loss': 0.709856390953064, 'eval_runtime': 94.4752, 'eval_samples_per_second': 10.532, 'eval_steps_per_second': 5.271, 'epoch': 0.8}


 27%|██▋       | 3600/13473 [10:05:19<5:20:40,  1.95s/it] 

{'loss': 0.6842, 'grad_norm': 2.3087122440338135, 'learning_rate': 3.8052108224774534e-05, 'epoch': 0.8}


                                                         
 27%|██▋       | 3600/13473 [10:06:44<5:20:40,  1.95s/it]

{'eval_loss': 0.7199411392211914, 'eval_runtime': 85.2682, 'eval_samples_per_second': 11.669, 'eval_steps_per_second': 5.84, 'epoch': 0.8}


 27%|██▋       | 3610/13473 [10:06:53<5:03:40,  1.85s/it] 

{'loss': 0.4722, 'grad_norm': 1.7108948230743408, 'learning_rate': 3.8013566638402835e-05, 'epoch': 0.8}


                                                         
 27%|██▋       | 3610/13473 [10:08:26<5:03:40,  1.85s/it]

{'eval_loss': 0.7246010303497314, 'eval_runtime': 92.9472, 'eval_samples_per_second': 10.705, 'eval_steps_per_second': 5.358, 'epoch': 0.8}


 27%|██▋       | 3620/13473 [10:08:34<5:15:37,  1.92s/it] 

{'loss': 0.8509, 'grad_norm': 0.684867799282074, 'learning_rate': 3.797502505203114e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 3620/13473 [10:10:08<5:15:37,  1.92s/it]

{'eval_loss': 0.7080938220024109, 'eval_runtime': 94.1058, 'eval_samples_per_second': 10.573, 'eval_steps_per_second': 5.292, 'epoch': 0.81}


 27%|██▋       | 3630/13473 [10:10:17<5:19:21,  1.95s/it] 

{'loss': 0.5076, 'grad_norm': 0.8472188115119934, 'learning_rate': 3.793648346565945e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 3630/13473 [10:11:47<5:19:21,  1.95s/it]

{'eval_loss': 0.7045611143112183, 'eval_runtime': 89.6267, 'eval_samples_per_second': 11.102, 'eval_steps_per_second': 5.556, 'epoch': 0.81}


 27%|██▋       | 3640/13473 [10:11:55<5:07:56,  1.88s/it] 

{'loss': 0.772, 'grad_norm': 1.0587196350097656, 'learning_rate': 3.789794187928775e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 3640/13473 [10:13:28<5:07:56,  1.88s/it]

{'eval_loss': 0.7078485488891602, 'eval_runtime': 92.9553, 'eval_samples_per_second': 10.704, 'eval_steps_per_second': 5.357, 'epoch': 0.81}


 27%|██▋       | 3650/13473 [10:13:37<5:25:51,  1.99s/it] 

{'loss': 0.5172, 'grad_norm': 0.31593164801597595, 'learning_rate': 3.785940029291606e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 3650/13473 [10:15:09<5:25:51,  1.99s/it]

{'eval_loss': 0.7076382040977478, 'eval_runtime': 91.9058, 'eval_samples_per_second': 10.826, 'eval_steps_per_second': 5.419, 'epoch': 0.81}


 27%|██▋       | 3660/13473 [10:15:18<5:14:33,  1.92s/it] 

{'loss': 0.8597, 'grad_norm': 0.7336549162864685, 'learning_rate': 3.782085870654437e-05, 'epoch': 0.81}


                                                         
 27%|██▋       | 3660/13473 [10:16:48<5:14:33,  1.92s/it]

{'eval_loss': 0.7088034152984619, 'eval_runtime': 90.1476, 'eval_samples_per_second': 11.037, 'eval_steps_per_second': 5.524, 'epoch': 0.81}


 27%|██▋       | 3670/13473 [10:16:57<5:13:25,  1.92s/it] 

{'loss': 0.5184, 'grad_norm': 0.6726885437965393, 'learning_rate': 3.778231712017267e-05, 'epoch': 0.82}


                                                         
 27%|██▋       | 3670/13473 [10:18:25<5:13:25,  1.92s/it]

{'eval_loss': 0.7118514776229858, 'eval_runtime': 88.9189, 'eval_samples_per_second': 11.19, 'eval_steps_per_second': 5.601, 'epoch': 0.82}


 27%|██▋       | 3680/13473 [10:18:34<5:05:41,  1.87s/it] 

{'loss': 0.9137, 'grad_norm': 1.2855627536773682, 'learning_rate': 3.774377553380097e-05, 'epoch': 0.82}


                                                         
 27%|██▋       | 3680/13473 [10:20:09<5:05:41,  1.87s/it]

{'eval_loss': 0.7233105301856995, 'eval_runtime': 95.1428, 'eval_samples_per_second': 10.458, 'eval_steps_per_second': 5.234, 'epoch': 0.82}


 27%|██▋       | 3690/13473 [10:20:18<5:16:26,  1.94s/it] 

{'loss': 1.013, 'grad_norm': 0.4636841118335724, 'learning_rate': 3.770523394742928e-05, 'epoch': 0.82}


                                                         
 27%|██▋       | 3690/13473 [10:21:51<5:16:26,  1.94s/it]

{'eval_loss': 0.7020940780639648, 'eval_runtime': 92.8899, 'eval_samples_per_second': 10.712, 'eval_steps_per_second': 5.361, 'epoch': 0.82}


 27%|██▋       | 3700/13473 [10:21:59<5:16:54,  1.95s/it] 

{'loss': 0.8419, 'grad_norm': 0.8823599815368652, 'learning_rate': 3.766669236105758e-05, 'epoch': 0.82}


                                                         
 27%|██▋       | 3700/13473 [10:23:28<5:16:54,  1.95s/it]

{'eval_loss': 0.7066993117332458, 'eval_runtime': 89.1554, 'eval_samples_per_second': 11.16, 'eval_steps_per_second': 5.586, 'epoch': 0.82}


 28%|██▊       | 3710/13473 [10:23:37<5:15:36,  1.94s/it] 

{'loss': 0.5599, 'grad_norm': 0.45078471302986145, 'learning_rate': 3.762815077468589e-05, 'epoch': 0.83}


                                                         
 28%|██▊       | 3710/13473 [10:25:11<5:15:36,  1.94s/it]

{'eval_loss': 0.7057747840881348, 'eval_runtime': 93.9068, 'eval_samples_per_second': 10.596, 'eval_steps_per_second': 5.303, 'epoch': 0.83}


 28%|██▊       | 3720/13473 [10:25:22<5:27:46,  2.02s/it] 

{'loss': 0.7506, 'grad_norm': 1.6656017303466797, 'learning_rate': 3.7589609188314196e-05, 'epoch': 0.83}


                                                         
 28%|██▊       | 3720/13473 [10:26:49<5:27:46,  2.02s/it]

{'eval_loss': 0.7212421894073486, 'eval_runtime': 87.5514, 'eval_samples_per_second': 11.365, 'eval_steps_per_second': 5.688, 'epoch': 0.83}


 28%|██▊       | 3730/13473 [10:26:58<5:01:46,  1.86s/it] 

{'loss': 0.5848, 'grad_norm': 0.35851362347602844, 'learning_rate': 3.75510676019425e-05, 'epoch': 0.83}


                                                         
 28%|██▊       | 3730/13473 [10:28:30<5:01:46,  1.86s/it]

{'eval_loss': 0.7116641998291016, 'eval_runtime': 91.8068, 'eval_samples_per_second': 10.838, 'eval_steps_per_second': 5.424, 'epoch': 0.83}


 28%|██▊       | 3740/13473 [10:28:38<5:14:26,  1.94s/it] 

{'loss': 0.6288, 'grad_norm': 0.8502307534217834, 'learning_rate': 3.7512526015570805e-05, 'epoch': 0.83}


                                                         
 28%|██▊       | 3740/13473 [10:30:11<5:14:26,  1.94s/it]

{'eval_loss': 0.7095984220504761, 'eval_runtime': 92.2338, 'eval_samples_per_second': 10.788, 'eval_steps_per_second': 5.399, 'epoch': 0.83}


 28%|██▊       | 3750/13473 [10:30:19<5:13:10,  1.93s/it] 

{'loss': 0.5773, 'grad_norm': 0.6624454259872437, 'learning_rate': 3.7473984429199106e-05, 'epoch': 0.84}


                                                         
 28%|██▊       | 3750/13473 [10:31:57<5:13:10,  1.93s/it]

{'eval_loss': 0.7148882746696472, 'eval_runtime': 97.982, 'eval_samples_per_second': 10.155, 'eval_steps_per_second': 5.083, 'epoch': 0.84}


 28%|██▊       | 3760/13473 [10:32:06<5:42:22,  2.11s/it] 

{'loss': 0.5642, 'grad_norm': 0.545126736164093, 'learning_rate': 3.7435442842827414e-05, 'epoch': 0.84}


                                                         
 28%|██▊       | 3760/13473 [10:33:39<5:42:22,  2.11s/it]

{'eval_loss': 0.720025897026062, 'eval_runtime': 92.3866, 'eval_samples_per_second': 10.77, 'eval_steps_per_second': 5.39, 'epoch': 0.84}


 28%|██▊       | 3770/13473 [10:33:47<5:09:39,  1.91s/it] 

{'loss': 0.6237, 'grad_norm': 0.30296221375465393, 'learning_rate': 3.7396901256455715e-05, 'epoch': 0.84}


                                                         
 28%|██▊       | 3770/13473 [10:35:16<5:09:39,  1.91s/it]

{'eval_loss': 0.7216932773590088, 'eval_runtime': 89.6064, 'eval_samples_per_second': 11.104, 'eval_steps_per_second': 5.558, 'epoch': 0.84}


 28%|██▊       | 3780/13473 [10:35:25<5:04:18,  1.88s/it] 

{'loss': 0.539, 'grad_norm': 0.3864552676677704, 'learning_rate': 3.735835967008402e-05, 'epoch': 0.84}


                                                         
 28%|██▊       | 3780/13473 [10:36:56<5:04:18,  1.88s/it]

{'eval_loss': 0.7254020571708679, 'eval_runtime': 91.2526, 'eval_samples_per_second': 10.904, 'eval_steps_per_second': 5.457, 'epoch': 0.84}


 28%|██▊       | 3790/13473 [10:37:05<5:07:44,  1.91s/it] 

{'loss': 0.6554, 'grad_norm': 2.3283770084381104, 'learning_rate': 3.7319818083712324e-05, 'epoch': 0.84}


                                                         
 28%|██▊       | 3790/13473 [10:38:38<5:07:44,  1.91s/it]

{'eval_loss': 0.7241366505622864, 'eval_runtime': 93.2469, 'eval_samples_per_second': 10.671, 'eval_steps_per_second': 5.341, 'epoch': 0.84}


 28%|██▊       | 3800/13473 [10:38:46<5:08:20,  1.91s/it] 

{'loss': 0.4691, 'grad_norm': 0.2824951112270355, 'learning_rate': 3.728127649734063e-05, 'epoch': 0.85}


                                                         
 28%|██▊       | 3800/13473 [10:40:10<5:08:20,  1.91s/it]

{'eval_loss': 0.7252172827720642, 'eval_runtime': 84.0687, 'eval_samples_per_second': 11.836, 'eval_steps_per_second': 5.924, 'epoch': 0.85}


 28%|██▊       | 3810/13473 [10:40:19<4:49:08,  1.80s/it] 

{'loss': 0.6352, 'grad_norm': 0.330170601606369, 'learning_rate': 3.7242734910968934e-05, 'epoch': 0.85}


                                                         
 28%|██▊       | 3810/13473 [10:41:47<4:49:08,  1.80s/it]

{'eval_loss': 0.7178831100463867, 'eval_runtime': 88.7878, 'eval_samples_per_second': 11.206, 'eval_steps_per_second': 5.609, 'epoch': 0.85}


 28%|██▊       | 3820/13473 [10:41:56<5:10:41,  1.93s/it] 

{'loss': 0.3963, 'grad_norm': 0.3769034147262573, 'learning_rate': 3.720419332459724e-05, 'epoch': 0.85}


                                                         
 28%|██▊       | 3820/13473 [10:43:25<5:10:41,  1.93s/it]

{'eval_loss': 0.7289868593215942, 'eval_runtime': 88.2257, 'eval_samples_per_second': 11.278, 'eval_steps_per_second': 5.645, 'epoch': 0.85}


 28%|██▊       | 3830/13473 [10:43:35<5:47:10,  2.16s/it] 

{'loss': 0.7487, 'grad_norm': 0.3278251588344574, 'learning_rate': 3.716565173822555e-05, 'epoch': 0.85}


                                                         
 28%|██▊       | 3830/13473 [10:45:07<5:47:10,  2.16s/it]

{'eval_loss': 0.7245768904685974, 'eval_runtime': 92.8996, 'eval_samples_per_second': 10.71, 'eval_steps_per_second': 5.361, 'epoch': 0.85}


 29%|██▊       | 3840/13473 [10:45:16<5:10:42,  1.94s/it] 

{'loss': 0.7322, 'grad_norm': 0.30479979515075684, 'learning_rate': 3.712711015185385e-05, 'epoch': 0.86}


                                                         
 29%|██▊       | 3840/13473 [10:46:51<5:10:42,  1.94s/it]

{'eval_loss': 0.7156035304069519, 'eval_runtime': 95.1587, 'eval_samples_per_second': 10.456, 'eval_steps_per_second': 5.233, 'epoch': 0.86}


 29%|██▊       | 3850/13473 [10:47:00<5:16:32,  1.97s/it] 

{'loss': 1.0656, 'grad_norm': 0.5826221704483032, 'learning_rate': 3.708856856548216e-05, 'epoch': 0.86}


                                                         
 29%|██▊       | 3850/13473 [10:48:37<5:16:32,  1.97s/it]

{'eval_loss': 0.7051308155059814, 'eval_runtime': 96.564, 'eval_samples_per_second': 10.304, 'eval_steps_per_second': 5.157, 'epoch': 0.86}


 29%|██▊       | 3860/13473 [10:48:45<5:20:56,  2.00s/it] 

{'loss': 0.9162, 'grad_norm': 0.41966888308525085, 'learning_rate': 3.7050026979110467e-05, 'epoch': 0.86}


                                                         
 29%|██▊       | 3860/13473 [10:50:22<5:20:56,  2.00s/it]

{'eval_loss': 0.7022625207901001, 'eval_runtime': 96.2936, 'eval_samples_per_second': 10.333, 'eval_steps_per_second': 5.172, 'epoch': 0.86}


 29%|██▊       | 3870/13473 [10:50:31<5:45:56,  2.16s/it] 

{'loss': 0.5125, 'grad_norm': 0.39486247301101685, 'learning_rate': 3.701148539273877e-05, 'epoch': 0.86}


                                                         
 29%|██▊       | 3870/13473 [10:51:59<5:45:56,  2.16s/it]

{'eval_loss': 0.7056066393852234, 'eval_runtime': 87.3687, 'eval_samples_per_second': 11.389, 'eval_steps_per_second': 5.7, 'epoch': 0.86}


 29%|██▉       | 3880/13473 [10:52:07<4:58:37,  1.87s/it] 

{'loss': 0.5604, 'grad_norm': 1.4686979055404663, 'learning_rate': 3.697294380636707e-05, 'epoch': 0.86}


                                                         
 29%|██▉       | 3880/13473 [10:53:36<4:58:37,  1.87s/it]

{'eval_loss': 0.7274513244628906, 'eval_runtime': 89.1455, 'eval_samples_per_second': 11.162, 'eval_steps_per_second': 5.586, 'epoch': 0.86}


 29%|██▉       | 3890/13473 [10:53:45<5:05:30,  1.91s/it] 

{'loss': 0.8002, 'grad_norm': 1.2021805047988892, 'learning_rate': 3.693440221999538e-05, 'epoch': 0.87}


                                                         
 29%|██▉       | 3890/13473 [10:55:16<5:05:30,  1.91s/it]

{'eval_loss': 0.7183765769004822, 'eval_runtime': 90.9711, 'eval_samples_per_second': 10.938, 'eval_steps_per_second': 5.474, 'epoch': 0.87}


 29%|██▉       | 3900/13473 [10:55:24<5:03:25,  1.90s/it] 

{'loss': 0.6328, 'grad_norm': 0.610571563243866, 'learning_rate': 3.689586063362368e-05, 'epoch': 0.87}


                                                         
 29%|██▉       | 3900/13473 [10:56:59<5:03:25,  1.90s/it]

{'eval_loss': 0.7076836824417114, 'eval_runtime': 94.9728, 'eval_samples_per_second': 10.477, 'eval_steps_per_second': 5.244, 'epoch': 0.87}


 29%|██▉       | 3910/13473 [10:57:08<5:10:24,  1.95s/it] 

{'loss': 0.5887, 'grad_norm': 0.5190867185592651, 'learning_rate': 3.6857319047251986e-05, 'epoch': 0.87}


                                                         
 29%|██▉       | 3910/13473 [10:58:45<5:10:24,  1.95s/it]

{'eval_loss': 0.709484338760376, 'eval_runtime': 96.8305, 'eval_samples_per_second': 10.276, 'eval_steps_per_second': 5.143, 'epoch': 0.87}


 29%|██▉       | 3920/13473 [10:58:53<5:13:08,  1.97s/it] 

{'loss': 1.5458, 'grad_norm': 0.9111895561218262, 'learning_rate': 3.6818777460880294e-05, 'epoch': 0.87}


                                                         
 29%|██▉       | 3920/13473 [11:00:19<5:13:08,  1.97s/it]

{'eval_loss': 0.7050325870513916, 'eval_runtime': 85.9492, 'eval_samples_per_second': 11.577, 'eval_steps_per_second': 5.794, 'epoch': 0.87}


 29%|██▉       | 3930/13473 [11:00:28<5:10:48,  1.95s/it] 

{'loss': 0.729, 'grad_norm': 0.4127981960773468, 'learning_rate': 3.6780235874508595e-05, 'epoch': 0.88}


                                                         
 29%|██▉       | 3930/13473 [11:01:59<5:10:48,  1.95s/it]

{'eval_loss': 0.7012269496917725, 'eval_runtime': 90.5035, 'eval_samples_per_second': 10.994, 'eval_steps_per_second': 5.503, 'epoch': 0.88}


 29%|██▉       | 3940/13473 [11:02:07<5:09:06,  1.95s/it] 

{'loss': 0.4866, 'grad_norm': 0.4284135699272156, 'learning_rate': 3.67416942881369e-05, 'epoch': 0.88}


                                                         
 29%|██▉       | 3940/13473 [11:03:38<5:09:06,  1.95s/it]

{'eval_loss': 0.7160513401031494, 'eval_runtime': 90.5081, 'eval_samples_per_second': 10.993, 'eval_steps_per_second': 5.502, 'epoch': 0.88}


 29%|██▉       | 3950/13473 [11:03:46<5:01:06,  1.90s/it] 

{'loss': 0.66, 'grad_norm': 0.28240966796875, 'learning_rate': 3.670315270176521e-05, 'epoch': 0.88}


                                                         
 29%|██▉       | 3950/13473 [11:05:23<5:01:06,  1.90s/it]

{'eval_loss': 0.7310287952423096, 'eval_runtime': 96.1875, 'eval_samples_per_second': 10.344, 'eval_steps_per_second': 5.177, 'epoch': 0.88}


 29%|██▉       | 3960/13473 [11:05:31<5:16:53,  2.00s/it] 

{'loss': 0.6441, 'grad_norm': 0.8899909257888794, 'learning_rate': 3.666461111539351e-05, 'epoch': 0.88}


                                                         
 29%|██▉       | 3960/13473 [11:07:00<5:16:53,  2.00s/it]

{'eval_loss': 0.7146106362342834, 'eval_runtime': 88.9716, 'eval_samples_per_second': 11.183, 'eval_steps_per_second': 5.597, 'epoch': 0.88}


 29%|██▉       | 3970/13473 [11:07:11<5:11:28,  1.97s/it] 

{'loss': 0.3484, 'grad_norm': 0.629335343837738, 'learning_rate': 3.662606952902182e-05, 'epoch': 0.88}


                                                         
 29%|██▉       | 3970/13473 [11:08:29<5:11:28,  1.97s/it]

{'eval_loss': 0.7155555486679077, 'eval_runtime': 78.3686, 'eval_samples_per_second': 12.696, 'eval_steps_per_second': 6.355, 'epoch': 0.88}


 30%|██▉       | 3980/13473 [11:08:37<4:36:13,  1.75s/it] 

{'loss': 0.7519, 'grad_norm': 0.2992776334285736, 'learning_rate': 3.658752794265012e-05, 'epoch': 0.89}


                                                         
 30%|██▉       | 3980/13473 [11:10:07<4:36:13,  1.75s/it]

{'eval_loss': 0.7128351926803589, 'eval_runtime': 89.9674, 'eval_samples_per_second': 11.06, 'eval_steps_per_second': 5.535, 'epoch': 0.89}


 30%|██▉       | 3990/13473 [11:10:16<4:58:41,  1.89s/it] 

{'loss': 1.1955, 'grad_norm': 0.7546107172966003, 'learning_rate': 3.654898635627842e-05, 'epoch': 0.89}


                                                         
 30%|██▉       | 3990/13473 [11:11:50<4:58:41,  1.89s/it]

{'eval_loss': 0.7018095850944519, 'eval_runtime': 94.3318, 'eval_samples_per_second': 10.548, 'eval_steps_per_second': 5.279, 'epoch': 0.89}


 30%|██▉       | 4000/13473 [11:11:59<5:05:20,  1.93s/it] 

{'loss': 0.4942, 'grad_norm': 0.5391786098480225, 'learning_rate': 3.651044476990673e-05, 'epoch': 0.89}


                                                         
 30%|██▉       | 4000/13473 [11:13:33<5:05:20,  1.93s/it]

{'eval_loss': 0.7032909989356995, 'eval_runtime': 94.4266, 'eval_samples_per_second': 10.537, 'eval_steps_per_second': 5.274, 'epoch': 0.89}


 30%|██▉       | 4010/13473 [11:13:43<5:26:50,  2.07s/it] 

{'loss': 0.4657, 'grad_norm': 0.347538560628891, 'learning_rate': 3.647190318353503e-05, 'epoch': 0.89}


                                                         
 30%|██▉       | 4010/13473 [11:15:18<5:26:50,  2.07s/it]

{'eval_loss': 0.7174208164215088, 'eval_runtime': 94.6709, 'eval_samples_per_second': 10.51, 'eval_steps_per_second': 5.26, 'epoch': 0.89}


 30%|██▉       | 4020/13473 [11:15:26<5:09:15,  1.96s/it] 

{'loss': 0.5837, 'grad_norm': 0.3861829936504364, 'learning_rate': 3.643336159716334e-05, 'epoch': 0.9}


                                                         
 30%|██▉       | 4020/13473 [11:16:58<5:09:15,  1.96s/it]

{'eval_loss': 0.7175451517105103, 'eval_runtime': 91.6626, 'eval_samples_per_second': 10.855, 'eval_steps_per_second': 5.433, 'epoch': 0.9}


 30%|██▉       | 4030/13473 [11:17:07<5:11:21,  1.98s/it] 

{'loss': 0.6967, 'grad_norm': 0.34337660670280457, 'learning_rate': 3.639482001079165e-05, 'epoch': 0.9}


                                                         
 30%|██▉       | 4030/13473 [11:18:32<5:11:21,  1.98s/it]

{'eval_loss': 0.7046527862548828, 'eval_runtime': 85.2613, 'eval_samples_per_second': 11.67, 'eval_steps_per_second': 5.841, 'epoch': 0.9}


 30%|██▉       | 4040/13473 [11:18:41<4:49:57,  1.84s/it] 

{'loss': 0.8977, 'grad_norm': 0.9171013832092285, 'learning_rate': 3.635627842441995e-05, 'epoch': 0.9}


                                                         
 30%|██▉       | 4040/13473 [11:20:08<4:49:57,  1.84s/it]

{'eval_loss': 0.7035996317863464, 'eval_runtime': 87.2581, 'eval_samples_per_second': 11.403, 'eval_steps_per_second': 5.707, 'epoch': 0.9}


 30%|███       | 4050/13473 [11:20:17<4:55:20,  1.88s/it] 

{'loss': 0.7657, 'grad_norm': 0.8125676512718201, 'learning_rate': 3.631773683804826e-05, 'epoch': 0.9}


                                                         
 30%|███       | 4050/13473 [11:21:43<4:55:20,  1.88s/it]

{'eval_loss': 0.706007182598114, 'eval_runtime': 86.7126, 'eval_samples_per_second': 11.475, 'eval_steps_per_second': 5.743, 'epoch': 0.9}


 30%|███       | 4060/13473 [11:21:52<4:44:22,  1.81s/it] 

{'loss': 0.5516, 'grad_norm': 0.8091434240341187, 'learning_rate': 3.6279195251676565e-05, 'epoch': 0.9}


                                                         
 30%|███       | 4060/13473 [11:23:36<4:44:22,  1.81s/it]

{'eval_loss': 0.7099102139472961, 'eval_runtime': 103.7337, 'eval_samples_per_second': 9.592, 'eval_steps_per_second': 4.801, 'epoch': 0.9}


 30%|███       | 4070/13473 [11:23:44<5:23:07,  2.06s/it] 

{'loss': 0.6847, 'grad_norm': 0.44895434379577637, 'learning_rate': 3.6240653665304866e-05, 'epoch': 0.91}


                                                         
 30%|███       | 4070/13473 [11:25:14<5:23:07,  2.06s/it]

{'eval_loss': 0.7119683027267456, 'eval_runtime': 89.6417, 'eval_samples_per_second': 11.1, 'eval_steps_per_second': 5.555, 'epoch': 0.91}


 30%|███       | 4080/13473 [11:25:22<4:54:09,  1.88s/it] 

{'loss': 0.5812, 'grad_norm': 0.33570200204849243, 'learning_rate': 3.620211207893317e-05, 'epoch': 0.91}


                                                         
 30%|███       | 4080/13473 [11:26:57<4:54:09,  1.88s/it]

{'eval_loss': 0.714508593082428, 'eval_runtime': 95.3133, 'eval_samples_per_second': 10.439, 'eval_steps_per_second': 5.225, 'epoch': 0.91}


 30%|███       | 4090/13473 [11:27:06<5:15:10,  2.02s/it] 

{'loss': 0.5592, 'grad_norm': 0.3840620219707489, 'learning_rate': 3.6163570492561475e-05, 'epoch': 0.91}


                                                         
 30%|███       | 4090/13473 [11:28:44<5:15:10,  2.02s/it]

{'eval_loss': 0.7136523723602295, 'eval_runtime': 97.91, 'eval_samples_per_second': 10.162, 'eval_steps_per_second': 5.086, 'epoch': 0.91}


 30%|███       | 4100/13473 [11:28:53<5:09:17,  1.98s/it] 

{'loss': 0.671, 'grad_norm': 0.9406524300575256, 'learning_rate': 3.6125028906189776e-05, 'epoch': 0.91}


                                                         
 30%|███       | 4100/13473 [11:30:28<5:09:17,  1.98s/it]

{'eval_loss': 0.7151442170143127, 'eval_runtime': 95.6736, 'eval_samples_per_second': 10.4, 'eval_steps_per_second': 5.205, 'epoch': 0.91}


 31%|███       | 4110/13473 [11:30:37<5:09:10,  1.98s/it] 

{'loss': 0.8599, 'grad_norm': 2.4282541275024414, 'learning_rate': 3.6086487319818084e-05, 'epoch': 0.92}


                                                         
 31%|███       | 4110/13473 [11:32:08<5:09:10,  1.98s/it]

{'eval_loss': 0.7065102458000183, 'eval_runtime': 91.1683, 'eval_samples_per_second': 10.914, 'eval_steps_per_second': 5.462, 'epoch': 0.92}


 31%|███       | 4120/13473 [11:32:17<4:53:31,  1.88s/it] 

{'loss': 0.626, 'grad_norm': 0.6035947203636169, 'learning_rate': 3.604794573344639e-05, 'epoch': 0.92}


                                                         
 31%|███       | 4120/13473 [11:33:49<4:53:31,  1.88s/it]

{'eval_loss': 0.7064369916915894, 'eval_runtime': 92.1424, 'eval_samples_per_second': 10.799, 'eval_steps_per_second': 5.405, 'epoch': 0.92}


 31%|███       | 4130/13473 [11:33:57<5:05:55,  1.96s/it] 

{'loss': 0.844, 'grad_norm': 1.0536302328109741, 'learning_rate': 3.600940414707469e-05, 'epoch': 0.92}


                                                         
 31%|███       | 4130/13473 [11:35:34<5:05:55,  1.96s/it]

{'eval_loss': 0.7050636410713196, 'eval_runtime': 96.4448, 'eval_samples_per_second': 10.317, 'eval_steps_per_second': 5.164, 'epoch': 0.92}


 31%|███       | 4140/13473 [11:35:42<5:00:33,  1.93s/it] 

{'loss': 0.7159, 'grad_norm': 0.41902682185173035, 'learning_rate': 3.5970862560703e-05, 'epoch': 0.92}


                                                         
 31%|███       | 4140/13473 [11:37:11<5:00:33,  1.93s/it]

{'eval_loss': 0.7063677906990051, 'eval_runtime': 88.4313, 'eval_samples_per_second': 11.252, 'eval_steps_per_second': 5.631, 'epoch': 0.92}


 31%|███       | 4150/13473 [11:37:20<5:00:06,  1.93s/it] 

{'loss': 0.6459, 'grad_norm': 0.31070083379745483, 'learning_rate': 3.593232097433131e-05, 'epoch': 0.92}


                                                         
 31%|███       | 4150/13473 [11:38:46<5:00:06,  1.93s/it]

{'eval_loss': 0.7113804817199707, 'eval_runtime': 86.0477, 'eval_samples_per_second': 11.563, 'eval_steps_per_second': 5.787, 'epoch': 0.92}


 31%|███       | 4160/13473 [11:38:56<4:47:15,  1.85s/it] 

{'loss': 1.086, 'grad_norm': 5.778802871704102, 'learning_rate': 3.589377938795961e-05, 'epoch': 0.93}


                                                         
 31%|███       | 4160/13473 [11:40:15<4:47:15,  1.85s/it]

{'eval_loss': 0.711131751537323, 'eval_runtime': 78.9509, 'eval_samples_per_second': 12.603, 'eval_steps_per_second': 6.308, 'epoch': 0.93}


 31%|███       | 4170/13473 [11:40:23<4:29:10,  1.74s/it] 

{'loss': 0.6436, 'grad_norm': 0.3127228617668152, 'learning_rate': 3.585523780158792e-05, 'epoch': 0.93}


                                                         
 31%|███       | 4170/13473 [11:41:43<4:29:10,  1.74s/it]

{'eval_loss': 0.7051762938499451, 'eval_runtime': 79.6977, 'eval_samples_per_second': 12.485, 'eval_steps_per_second': 6.249, 'epoch': 0.93}


 31%|███       | 4180/13473 [11:41:51<4:24:45,  1.71s/it] 

{'loss': 0.627, 'grad_norm': 0.6781559586524963, 'learning_rate': 3.581669621521622e-05, 'epoch': 0.93}


                                                         
 31%|███       | 4180/13473 [11:43:11<4:24:45,  1.71s/it]

{'eval_loss': 0.7042727470397949, 'eval_runtime': 79.6321, 'eval_samples_per_second': 12.495, 'eval_steps_per_second': 6.254, 'epoch': 0.93}


 31%|███       | 4190/13473 [11:43:19<4:32:28,  1.76s/it] 

{'loss': 0.6698, 'grad_norm': 1.3430604934692383, 'learning_rate': 3.577815462884452e-05, 'epoch': 0.93}


                                                         
 31%|███       | 4190/13473 [11:44:39<4:32:28,  1.76s/it]

{'eval_loss': 0.7193883657455444, 'eval_runtime': 79.7537, 'eval_samples_per_second': 12.476, 'eval_steps_per_second': 6.244, 'epoch': 0.93}


 31%|███       | 4200/13473 [11:44:48<4:31:45,  1.76s/it] 

{'loss': 0.9665, 'grad_norm': 0.33248281478881836, 'learning_rate': 3.573961304247283e-05, 'epoch': 0.94}


                                                         
 31%|███       | 4200/13473 [11:46:08<4:31:45,  1.76s/it]

{'eval_loss': 0.7039628624916077, 'eval_runtime': 80.3707, 'eval_samples_per_second': 12.38, 'eval_steps_per_second': 6.196, 'epoch': 0.94}


 31%|███       | 4210/13473 [11:46:16<4:30:30,  1.75s/it] 

{'loss': 0.6997, 'grad_norm': 0.9780458211898804, 'learning_rate': 3.570107145610114e-05, 'epoch': 0.94}


                                                         
 31%|███       | 4210/13473 [11:47:35<4:30:30,  1.75s/it]

{'eval_loss': 0.7030884623527527, 'eval_runtime': 78.8307, 'eval_samples_per_second': 12.622, 'eval_steps_per_second': 6.317, 'epoch': 0.94}


 31%|███▏      | 4220/13473 [11:47:46<4:37:54,  1.80s/it] 

{'loss': 0.7689, 'grad_norm': 0.2991669476032257, 'learning_rate': 3.566252986972944e-05, 'epoch': 0.94}


                                                         
 31%|███▏      | 4220/13473 [11:49:06<4:37:54,  1.80s/it]

{'eval_loss': 0.703546941280365, 'eval_runtime': 79.7143, 'eval_samples_per_second': 12.482, 'eval_steps_per_second': 6.247, 'epoch': 0.94}


 31%|███▏      | 4230/13473 [11:49:14<4:30:41,  1.76s/it] 

{'loss': 0.7028, 'grad_norm': 1.2554113864898682, 'learning_rate': 3.5623988283357746e-05, 'epoch': 0.94}


                                                         
 31%|███▏      | 4230/13473 [11:50:36<4:30:41,  1.76s/it]

{'eval_loss': 0.7077230215072632, 'eval_runtime': 82.0744, 'eval_samples_per_second': 12.123, 'eval_steps_per_second': 6.068, 'epoch': 0.94}


 31%|███▏      | 4240/13473 [11:50:45<4:34:54,  1.79s/it] 

{'loss': 0.8998, 'grad_norm': 1.028975009918213, 'learning_rate': 3.5585446696986054e-05, 'epoch': 0.94}


                                                         
 31%|███▏      | 4240/13473 [11:52:07<4:34:54,  1.79s/it]

{'eval_loss': 0.7034347057342529, 'eval_runtime': 82.3627, 'eval_samples_per_second': 12.081, 'eval_steps_per_second': 6.046, 'epoch': 0.94}


 32%|███▏      | 4250/13473 [11:52:15<4:36:17,  1.80s/it] 

{'loss': 0.8809, 'grad_norm': 0.6104938387870789, 'learning_rate': 3.5546905110614355e-05, 'epoch': 0.95}


                                                         
 32%|███▏      | 4250/13473 [11:53:36<4:36:17,  1.80s/it]

{'eval_loss': 0.7170084714889526, 'eval_runtime': 80.9689, 'eval_samples_per_second': 12.289, 'eval_steps_per_second': 6.151, 'epoch': 0.95}


 32%|███▏      | 4260/13473 [11:53:45<4:32:18,  1.77s/it] 

{'loss': 0.6272, 'grad_norm': 0.7036762237548828, 'learning_rate': 3.550836352424266e-05, 'epoch': 0.95}


                                                         
 32%|███▏      | 4260/13473 [11:55:08<4:32:18,  1.77s/it]

{'eval_loss': 0.7068650722503662, 'eval_runtime': 82.8117, 'eval_samples_per_second': 12.015, 'eval_steps_per_second': 6.014, 'epoch': 0.95}


 32%|███▏      | 4270/13473 [11:55:16<4:33:18,  1.78s/it] 

{'loss': 1.0089, 'grad_norm': 1.2208038568496704, 'learning_rate': 3.5469821937870964e-05, 'epoch': 0.95}


                                                         
 32%|███▏      | 4270/13473 [11:56:38<4:33:18,  1.78s/it]

{'eval_loss': 0.7054459452629089, 'eval_runtime': 82.0456, 'eval_samples_per_second': 12.127, 'eval_steps_per_second': 6.07, 'epoch': 0.95}


 32%|███▏      | 4280/13473 [11:56:46<4:27:32,  1.75s/it] 

{'loss': 0.8111, 'grad_norm': 0.456283837556839, 'learning_rate': 3.5431280351499265e-05, 'epoch': 0.95}


                                                         
 32%|███▏      | 4280/13473 [11:58:07<4:27:32,  1.75s/it]

{'eval_loss': 0.7011573910713196, 'eval_runtime': 81.5486, 'eval_samples_per_second': 12.201, 'eval_steps_per_second': 6.107, 'epoch': 0.95}


 32%|███▏      | 4290/13473 [11:58:16<4:37:59,  1.82s/it] 

{'loss': 0.9637, 'grad_norm': 1.1414874792099, 'learning_rate': 3.539273876512757e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 4290/13473 [11:59:40<4:37:59,  1.82s/it]

{'eval_loss': 0.7044798135757446, 'eval_runtime': 83.247, 'eval_samples_per_second': 11.952, 'eval_steps_per_second': 5.982, 'epoch': 0.96}


 32%|███▏      | 4300/13473 [11:59:52<5:25:55,  2.13s/it] 

{'loss': 0.5527, 'grad_norm': 0.41062963008880615, 'learning_rate': 3.5354197178755874e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 4300/13473 [12:01:12<5:25:55,  2.13s/it]

{'eval_loss': 0.7069121599197388, 'eval_runtime': 80.5406, 'eval_samples_per_second': 12.354, 'eval_steps_per_second': 6.183, 'epoch': 0.96}


 32%|███▏      | 4310/13473 [12:01:20<4:29:37,  1.77s/it] 

{'loss': 1.1355, 'grad_norm': 0.5427427291870117, 'learning_rate': 3.531565559238418e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 4310/13473 [12:02:41<4:29:37,  1.77s/it]

{'eval_loss': 0.7097851634025574, 'eval_runtime': 80.5442, 'eval_samples_per_second': 12.353, 'eval_steps_per_second': 6.183, 'epoch': 0.96}


 32%|███▏      | 4320/13473 [12:02:50<4:32:32,  1.79s/it] 

{'loss': 0.9452, 'grad_norm': 2.5013248920440674, 'learning_rate': 3.527711400601249e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 4320/13473 [12:04:12<4:32:32,  1.79s/it]

{'eval_loss': 0.703131914138794, 'eval_runtime': 82.8137, 'eval_samples_per_second': 12.015, 'eval_steps_per_second': 6.013, 'epoch': 0.96}


 32%|███▏      | 4330/13473 [12:04:21<4:34:57,  1.80s/it] 

{'loss': 0.9232, 'grad_norm': 0.43255406618118286, 'learning_rate': 3.523857241964079e-05, 'epoch': 0.96}


                                                         
 32%|███▏      | 4330/13473 [12:05:43<4:34:57,  1.80s/it]

{'eval_loss': 0.7005603909492493, 'eval_runtime': 81.9656, 'eval_samples_per_second': 12.139, 'eval_steps_per_second': 6.076, 'epoch': 0.96}


 32%|███▏      | 4340/13473 [12:05:52<4:32:30,  1.79s/it] 

{'loss': 0.8371, 'grad_norm': 0.36205610632896423, 'learning_rate': 3.52000308332691e-05, 'epoch': 0.97}


                                                         
 32%|███▏      | 4340/13473 [12:07:18<4:32:30,  1.79s/it]

{'eval_loss': 0.7008020877838135, 'eval_runtime': 85.842, 'eval_samples_per_second': 11.591, 'eval_steps_per_second': 5.801, 'epoch': 0.97}


 32%|███▏      | 4350/13473 [12:07:27<4:50:52,  1.91s/it] 

{'loss': 0.445, 'grad_norm': 1.6283248662948608, 'learning_rate': 3.516148924689741e-05, 'epoch': 0.97}


                                                         
 32%|███▏      | 4350/13473 [12:08:55<4:50:52,  1.91s/it]

{'eval_loss': 0.7111572623252869, 'eval_runtime': 88.2186, 'eval_samples_per_second': 11.279, 'eval_steps_per_second': 5.645, 'epoch': 0.97}


 32%|███▏      | 4360/13473 [12:09:03<4:43:06,  1.86s/it] 

{'loss': 0.697, 'grad_norm': 0.408590704202652, 'learning_rate': 3.512294766052571e-05, 'epoch': 0.97}


                                                         
 32%|███▏      | 4360/13473 [12:10:24<4:43:06,  1.86s/it]

{'eval_loss': 0.7098608016967773, 'eval_runtime': 80.627, 'eval_samples_per_second': 12.341, 'eval_steps_per_second': 6.177, 'epoch': 0.97}


 32%|███▏      | 4370/13473 [12:10:33<4:39:15,  1.84s/it] 

{'loss': 0.7479, 'grad_norm': 0.4137978255748749, 'learning_rate': 3.5084406074154017e-05, 'epoch': 0.97}


                                                         
 32%|███▏      | 4370/13473 [12:11:55<4:39:15,  1.84s/it]

{'eval_loss': 0.7075831294059753, 'eval_runtime': 81.0999, 'eval_samples_per_second': 12.269, 'eval_steps_per_second': 6.141, 'epoch': 0.97}


 33%|███▎      | 4380/13473 [12:12:03<4:28:19,  1.77s/it] 

{'loss': 0.7609, 'grad_norm': 2.7156267166137695, 'learning_rate': 3.504586448778232e-05, 'epoch': 0.98}


                                                         
 33%|███▎      | 4380/13473 [12:13:28<4:28:19,  1.77s/it]

{'eval_loss': 0.7106891870498657, 'eval_runtime': 84.8462, 'eval_samples_per_second': 11.727, 'eval_steps_per_second': 5.869, 'epoch': 0.98}


 33%|███▎      | 4390/13473 [12:13:45<8:46:10,  3.48s/it] 

{'loss': 0.4281, 'grad_norm': 0.6350797414779663, 'learning_rate': 3.500732290141062e-05, 'epoch': 0.98}


                                                         
 33%|███▎      | 4390/13473 [12:15:39<8:46:10,  3.48s/it]

{'eval_loss': 0.7116278409957886, 'eval_runtime': 113.9823, 'eval_samples_per_second': 8.729, 'eval_steps_per_second': 4.369, 'epoch': 0.98}


 33%|███▎      | 4400/13473 [12:15:47<5:36:57,  2.23s/it] 

{'loss': 0.9422, 'grad_norm': 0.3089836835861206, 'learning_rate': 3.496878131503893e-05, 'epoch': 0.98}




KeyboardInterrupt: 

In [13]:
# Evaluating the model
if trainer is not None:  # Check if trainer is defined
    eval_results = trainer.evaluate()

    # Print metrics in a structured format
    print("Evaluation Metrics:")
    print("=====================")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")
else:
    print("Trainer is not defined.")

100%|██████████| 498/498 [01:34<00:00,  5.25it/s]

Evaluation Metrics:
eval_loss: 0.7101
eval_runtime: 95.4551
eval_samples_per_second: 10.4240
eval_steps_per_second: 5.2170
epoch: 3.0000





In [20]:
# Load the model and tokenizer from the saved path
model_path = '/Users/halladaykinsey/Desktop/Conversational_Chatbot/trained_conversational_model'
model = GPT2LMHeadModel.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

def generate_response(prompt):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = (input_ids != tokenizer.pad_token_id).long()  # Adjust this line to create attention mask

    with torch.no_grad():
        output = model.generate(input_ids, 
                                attention_mask=attention_mask, 
                                max_length=150, 
                                num_return_sequences=1,
                                temperature=0.9,   # Adjust temperature
                                top_k=50,          # Adjust top-k
                                top_p=0.95)        # Adjust top-p

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

conversation_history = ""

def chat():
    global conversation_history
    print("Welcome to the chatbot! Type 'exit' to end the conversation.")
    
    while True:
        user_input = input("You: ")
        if user_input.lower() == 'exit':
            break
        
        # Append user input to the conversation history
        conversation_history += f"You: {user_input}\n"
        
        # Generate response based on the updated conversation history
        response = generate_response(conversation_history)
        
        print(f"Bot: {response}")
        
        # Append the bot's response to the conversation history
        conversation_history += f"Bot: {response}\n"

In [22]:
# Start chatting
chat()

Welcome to the chatbot! Type 'exit' to end the conversation.


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Bot: You: Hello
Bot: You: Hello

You: How are you?
Bot: You: Hello
Bot: You: Hello

You: How are you?

You: If I lived in Los Angeles, what movie would you recommend watching on a cold and stormy Saturday night?



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


ValueError: Input length of input_ids is 157, but `max_length` is set to 150. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.