In [1]:
import pandas as pd
import numpy as np
import torch
import csv
from sentence_transformers import SentenceTransformer, util
import torch.nn as nn
import torch.nn.functional as func
from torch.nn import Linear as lin
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from torch.distributions.multinomial import Multinomial
from transformers import BertForTokenClassification
from transformers import PreTrainedTokenizer
from transformers import AutoTokenizer
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForMaskedLM
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_test = pd.read_csv("Listing_Titles.tsv.gz", sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)

In [3]:
def get_text(df):
    text = []
    for idx, row in df.iterrows():
        text.append(row["Title"].lower().split(" "))
        if idx%100000 == 0:
            print(idx)
    return text

In [4]:
train, test = train_test_split(get_text(df_test[:500000]), train_size=0.995)

0
100000
200000
300000
400000


In [5]:
for i in range(len(train)):
    for j in range(len(train[i])):
        r = random.random()
        if r <= 0.15:
            train[i][j] = "[MASK]"

for i in range(len(test)):
    for j in range(len(test[i])):
        r = random.random()
        if r <= 0.15:
            test[i][j] = "[MASK]"


In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(test, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [7]:
def encode_tags(labels, encodings):
    
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = np.array(doc_labels)[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)]


        encoded_labels.append(doc_enc_labels.tolist())
    

    return encoded_labels

t = encode_tags(train_encodings["input_ids"], train_encodings)
v = encode_tags(val_encodings["input_ids"], val_encodings)

In [8]:
train_tags = torch.where(torch.tensor(train_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(t), -100)
val_tags = torch.where(torch.tensor(val_encodings.input_ids) == tokenizer.mask_token_id, torch.tensor(v), -100)
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")

[[(0, 0),
  (0, 6),
  (0, 5),
  (0, 8),
  (0, 5),
  (0, 6),
  (0, 5),
  (0, 7),
  (0, 2),
  (2, 6),
  (0, 6),
  (0, 4),
  (4, 7),
  (0, 4),
  (4, 7),
  (0, 4),
  (0, 3),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0)],
 [(0, 0),
  (0, 7),
  (0, 5),
  (0, 3),
  (3, 6),
  (0, 5),
  (0, 8),
  (0, 3),
  (0, 1),
  (0, 1),
  (0, 6),
  (0, 5),
  (0, 5),
  (0, 1),
  (0, 5),
  (0, 1),
  (0, 4),
  (4, 5),
  (5, 9),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0)],
 [(0, 0),
  (0, 7),
  (0, 6),
  (0, 6),
  (0, 5),
  (0, 6),
  (0, 5),
  (0, 4),
  (4, 7),
  (0, 1),
  (0, 5),
  (0, 6),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0)],
 [(0, 0),
  (0, 4),
  (0, 2),
  (2, 5

In [9]:
class Data(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        for encoding in self.encodings.keys():
            self.encodings[encoding] = torch.tensor(self.encodings[encoding]).to("cuda")
        self.labels = torch.tensor(labels).to("cuda")
        

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [10]:
train_dataset = Data(train_encodings, train_tags)
val_dataset = Data(val_encodings, val_tags)

  self.labels = torch.tensor(labels).to("cuda")


In [11]:
torch.cuda.set_per_process_memory_fraction(1.0, device=None)

In [15]:
torch.save(model.state_dict(), "/home/henry/work/projects/bert_1_epoch.obj")

In [13]:
#m = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
#m.load_state_dict(torch.load("/home/henry/work/projects/bert_1_epoch.obj"))

In [14]:
from transformers import DistilBertForMaskedLM, Trainer, TrainingArguments

training_args = TrainingArguments(output_dir="./results", learning_rate=0.0001, num_train_epochs=1,per_device_train_batch_size=32,per_device_eval_batch_size=64,evaluation_strategy="steps", do_eval=True, eval_steps=250, warmup_steps=500,weight_decay=0.01,logging_dir='./logs',logging_steps=10,dataloader_pin_memory=False)

model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased").to("cuda")

trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

trainer.train()

***** Running training *****
  Num examples = 497500
  Num Epochs = 1
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 15547
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhwarzecha[0m ([33muchcia[0m). Use [1m`wandb login --relogin`[0m to force relogin


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  0%|          | 11/15547 [00:02<52:27,  4.94it/s] 

{'loss': 14.0513, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 21/15547 [00:04<51:35,  5.02it/s]

{'loss': 13.0515, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 31/15547 [00:06<51:34,  5.01it/s]

{'loss': 11.2415, 'learning_rate': 6e-06, 'epoch': 0.0}


  0%|          | 41/15547 [00:08<51:31,  5.02it/s]

{'loss': 8.9215, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}


  0%|          | 51/15547 [00:10<51:21,  5.03it/s]

{'loss': 5.6778, 'learning_rate': 1e-05, 'epoch': 0.0}


  0%|          | 61/15547 [00:12<51:43,  4.99it/s]

{'loss': 1.8133, 'learning_rate': 1.2e-05, 'epoch': 0.0}


  0%|          | 71/15547 [00:14<51:59,  4.96it/s]

{'loss': 0.2259, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.0}


  1%|          | 81/15547 [00:16<51:35,  5.00it/s]

{'loss': 0.0217, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.01}


  1%|          | 91/15547 [00:18<51:31,  5.00it/s]

{'loss': 0.005, 'learning_rate': 1.8e-05, 'epoch': 0.01}


  1%|          | 101/15547 [00:20<55:14,  4.66it/s]

{'loss': 0.0022, 'learning_rate': 2e-05, 'epoch': 0.01}


  1%|          | 111/15547 [00:22<51:42,  4.98it/s]

{'loss': 0.0015, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.01}


  1%|          | 121/15547 [00:24<51:33,  4.99it/s]

{'loss': 0.0011, 'learning_rate': 2.4e-05, 'epoch': 0.01}


  1%|          | 131/15547 [00:26<51:15,  5.01it/s]

{'loss': 0.0009, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.01}


  1%|          | 141/15547 [00:28<52:00,  4.94it/s]

{'loss': 0.0008, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.01}


  1%|          | 151/15547 [00:30<51:08,  5.02it/s]

{'loss': 0.0007, 'learning_rate': 3e-05, 'epoch': 0.01}


  1%|          | 161/15547 [00:32<51:30,  4.98it/s]

{'loss': 0.0006, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.01}


  1%|          | 171/15547 [00:34<51:09,  5.01it/s]

{'loss': 0.0005, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.01}


  1%|          | 181/15547 [00:36<51:07,  5.01it/s]

{'loss': 0.0005, 'learning_rate': 3.6e-05, 'epoch': 0.01}


  1%|          | 191/15547 [00:38<51:12,  5.00it/s]

{'loss': 0.0004, 'learning_rate': 3.8e-05, 'epoch': 0.01}


  1%|▏         | 201/15547 [00:40<55:46,  4.59it/s]

{'loss': 0.0004, 'learning_rate': 4e-05, 'epoch': 0.01}


  1%|▏         | 211/15547 [00:42<51:17,  4.98it/s]

{'loss': 0.0003, 'learning_rate': 4.2e-05, 'epoch': 0.01}


  1%|▏         | 221/15547 [00:44<51:23,  4.97it/s]

{'loss': 0.0003, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.01}


  1%|▏         | 231/15547 [00:46<51:06,  5.00it/s]

{'loss': 0.0003, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.01}


  2%|▏         | 241/15547 [00:48<50:53,  5.01it/s]

{'loss': 0.0003, 'learning_rate': 4.8e-05, 'epoch': 0.02}


  2%|▏         | 250/15547 [00:50<50:43,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0002, 'learning_rate': 5e-05, 'epoch': 0.02}


                                                   
  2%|▏         | 251/15547 [00:53<4:43:30,  1.11s/it]

{'eval_loss': 0.00016992616292554885, 'eval_runtime': 3.0303, 'eval_samples_per_second': 825.01, 'eval_steps_per_second': 13.2, 'epoch': 0.02}


  2%|▏         | 261/15547 [00:55<57:45,  4.41it/s]  

{'loss': 0.0002, 'learning_rate': 5.2000000000000004e-05, 'epoch': 0.02}


  2%|▏         | 271/15547 [00:57<51:21,  4.96it/s]

{'loss': 0.0002, 'learning_rate': 5.4000000000000005e-05, 'epoch': 0.02}


  2%|▏         | 281/15547 [00:59<51:00,  4.99it/s]

{'loss': 0.0002, 'learning_rate': 5.6000000000000006e-05, 'epoch': 0.02}


  2%|▏         | 291/15547 [01:01<50:41,  5.02it/s]

{'loss': 0.0002, 'learning_rate': 5.8e-05, 'epoch': 0.02}


  2%|▏         | 301/15547 [01:03<55:36,  4.57it/s]

{'loss': 0.0002, 'learning_rate': 6e-05, 'epoch': 0.02}


  2%|▏         | 311/15547 [01:05<51:12,  4.96it/s]

{'loss': 0.0001, 'learning_rate': 6.2e-05, 'epoch': 0.02}


  2%|▏         | 321/15547 [01:07<50:54,  4.98it/s]

{'loss': 0.0001, 'learning_rate': 6.400000000000001e-05, 'epoch': 0.02}


  2%|▏         | 331/15547 [01:09<50:41,  5.00it/s]

{'loss': 0.0001, 'learning_rate': 6.6e-05, 'epoch': 0.02}


  2%|▏         | 341/15547 [01:11<50:45,  4.99it/s]

{'loss': 0.0001, 'learning_rate': 6.800000000000001e-05, 'epoch': 0.02}


  2%|▏         | 351/15547 [01:13<50:40,  5.00it/s]

{'loss': 0.0001, 'learning_rate': 7e-05, 'epoch': 0.02}


  2%|▏         | 361/15547 [01:15<50:26,  5.02it/s]

{'loss': 0.0001, 'learning_rate': 7.2e-05, 'epoch': 0.02}


  2%|▏         | 371/15547 [01:17<50:34,  5.00it/s]

{'loss': 0.0001, 'learning_rate': 7.4e-05, 'epoch': 0.02}


  2%|▏         | 381/15547 [01:19<50:24,  5.01it/s]

{'loss': 0.0001, 'learning_rate': 7.6e-05, 'epoch': 0.02}


  3%|▎         | 391/15547 [01:21<50:25,  5.01it/s]

{'loss': 0.0001, 'learning_rate': 7.800000000000001e-05, 'epoch': 0.03}


  3%|▎         | 401/15547 [01:23<56:08,  4.50it/s]

{'loss': 0.0001, 'learning_rate': 8e-05, 'epoch': 0.03}


  3%|▎         | 411/15547 [01:25<50:33,  4.99it/s]

{'loss': 0.0001, 'learning_rate': 8.2e-05, 'epoch': 0.03}


  3%|▎         | 421/15547 [01:27<51:27,  4.90it/s]

{'loss': 0.0001, 'learning_rate': 8.4e-05, 'epoch': 0.03}


  3%|▎         | 431/15547 [01:29<50:20,  5.01it/s]

{'loss': 0.0001, 'learning_rate': 8.6e-05, 'epoch': 0.03}


  3%|▎         | 441/15547 [01:31<50:12,  5.01it/s]

{'loss': 0.0001, 'learning_rate': 8.800000000000001e-05, 'epoch': 0.03}


  3%|▎         | 451/15547 [01:33<50:25,  4.99it/s]

{'loss': 0.0001, 'learning_rate': 9e-05, 'epoch': 0.03}


  3%|▎         | 461/15547 [01:35<50:50,  4.95it/s]

{'loss': 0.0001, 'learning_rate': 9.200000000000001e-05, 'epoch': 0.03}


  3%|▎         | 471/15547 [01:37<50:47,  4.95it/s]

{'loss': 0.0001, 'learning_rate': 9.4e-05, 'epoch': 0.03}


  3%|▎         | 481/15547 [01:39<51:05,  4.91it/s]

{'loss': 0.0001, 'learning_rate': 9.6e-05, 'epoch': 0.03}


  3%|▎         | 491/15547 [01:41<50:20,  4.98it/s]

{'loss': 0.0001, 'learning_rate': 9.8e-05, 'epoch': 0.03}


  3%|▎         | 500/15547 [01:43<58:28,  4.29it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0001, 'learning_rate': 0.0001, 'epoch': 0.03}


                                                   
  3%|▎         | 500/15547 [01:46<58:28,  4.29it/s]Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'eval_loss': 3.67025968444068e-05, 'eval_runtime': 3.0367, 'eval_samples_per_second': 823.271, 'eval_steps_per_second': 13.172, 'epoch': 0.03}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  3%|▎         | 511/15547 [01:50<59:23,  4.22it/s]  

{'loss': 0.0, 'learning_rate': 9.993354156974812e-05, 'epoch': 0.03}


  3%|▎         | 521/15547 [01:52<50:30,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.986708313949625e-05, 'epoch': 0.03}


  3%|▎         | 531/15547 [01:54<50:22,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.980062470924438e-05, 'epoch': 0.03}


  3%|▎         | 541/15547 [01:56<50:16,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.973416627899249e-05, 'epoch': 0.03}


  4%|▎         | 551/15547 [01:58<50:14,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.966770784874062e-05, 'epoch': 0.04}


  4%|▎         | 561/15547 [02:00<50:10,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.960124941848874e-05, 'epoch': 0.04}


  4%|▎         | 571/15547 [02:02<49:54,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.953479098823686e-05, 'epoch': 0.04}


  4%|▎         | 581/15547 [02:04<49:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.946833255798499e-05, 'epoch': 0.04}


  4%|▍         | 591/15547 [02:06<50:02,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.940187412773311e-05, 'epoch': 0.04}


  4%|▍         | 601/15547 [02:08<54:21,  4.58it/s]

{'loss': 0.0, 'learning_rate': 9.933541569748122e-05, 'epoch': 0.04}


  4%|▍         | 611/15547 [02:10<49:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.926895726722935e-05, 'epoch': 0.04}


  4%|▍         | 621/15547 [02:12<50:04,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.920249883697748e-05, 'epoch': 0.04}


  4%|▍         | 631/15547 [02:14<50:09,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.91360404067256e-05, 'epoch': 0.04}


  4%|▍         | 641/15547 [02:16<49:42,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.906958197647372e-05, 'epoch': 0.04}


  4%|▍         | 651/15547 [02:18<50:02,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.900312354622184e-05, 'epoch': 0.04}


  4%|▍         | 661/15547 [02:20<49:53,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.893666511596997e-05, 'epoch': 0.04}


  4%|▍         | 671/15547 [02:22<49:44,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.88702066857181e-05, 'epoch': 0.04}


  4%|▍         | 681/15547 [02:24<49:47,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.880374825546621e-05, 'epoch': 0.04}


  4%|▍         | 691/15547 [02:26<49:47,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.873728982521432e-05, 'epoch': 0.04}


  5%|▍         | 701/15547 [02:28<55:01,  4.50it/s]

{'loss': 0.0, 'learning_rate': 9.867083139496245e-05, 'epoch': 0.05}


  5%|▍         | 711/15547 [02:30<49:44,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.860437296471058e-05, 'epoch': 0.05}


  5%|▍         | 721/15547 [02:32<49:39,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.85379145344587e-05, 'epoch': 0.05}


  5%|▍         | 731/15547 [02:34<49:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.847145610420682e-05, 'epoch': 0.05}


  5%|▍         | 741/15547 [02:36<49:13,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.840499767395494e-05, 'epoch': 0.05}


  5%|▍         | 750/15547 [02:38<49:09,  5.02it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.833853924370307e-05, 'epoch': 0.05}


                                                   
  5%|▍         | 751/15547 [02:41<4:32:01,  1.10s/it]

{'eval_loss': 1.6296453395625576e-05, 'eval_runtime': 3.0096, 'eval_samples_per_second': 830.687, 'eval_steps_per_second': 13.291, 'epoch': 0.05}


  5%|▍         | 761/15547 [02:43<55:50,  4.41it/s]  

{'loss': 0.0, 'learning_rate': 9.82720808134512e-05, 'epoch': 0.05}


  5%|▍         | 771/15547 [02:45<49:35,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.820562238319931e-05, 'epoch': 0.05}


  5%|▌         | 781/15547 [02:47<49:32,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.813916395294743e-05, 'epoch': 0.05}


  5%|▌         | 791/15547 [02:49<49:24,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.807270552269557e-05, 'epoch': 0.05}


  5%|▌         | 801/15547 [02:51<53:24,  4.60it/s]

{'loss': 0.0, 'learning_rate': 9.800624709244368e-05, 'epoch': 0.05}


  5%|▌         | 811/15547 [02:53<49:28,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.79397886621918e-05, 'epoch': 0.05}


  5%|▌         | 821/15547 [02:55<49:02,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.787333023193993e-05, 'epoch': 0.05}


  5%|▌         | 831/15547 [02:57<49:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.780687180168806e-05, 'epoch': 0.05}


  5%|▌         | 841/15547 [02:59<48:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.774041337143617e-05, 'epoch': 0.05}


  5%|▌         | 851/15547 [03:01<49:21,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.76739549411843e-05, 'epoch': 0.05}


  6%|▌         | 861/15547 [03:03<48:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.760749651093241e-05, 'epoch': 0.06}


  6%|▌         | 871/15547 [03:05<49:00,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.754103808068053e-05, 'epoch': 0.06}


  6%|▌         | 881/15547 [03:07<48:56,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.747457965042867e-05, 'epoch': 0.06}


  6%|▌         | 891/15547 [03:09<49:06,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.740812122017679e-05, 'epoch': 0.06}


  6%|▌         | 901/15547 [03:11<52:41,  4.63it/s]

{'loss': 0.0, 'learning_rate': 9.73416627899249e-05, 'epoch': 0.06}


  6%|▌         | 911/15547 [03:13<48:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.727520435967303e-05, 'epoch': 0.06}


  6%|▌         | 921/15547 [03:15<48:42,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.720874592942116e-05, 'epoch': 0.06}


  6%|▌         | 931/15547 [03:17<48:40,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.714228749916927e-05, 'epoch': 0.06}


  6%|▌         | 941/15547 [03:19<48:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.70758290689174e-05, 'epoch': 0.06}


  6%|▌         | 951/15547 [03:21<48:44,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.700937063866552e-05, 'epoch': 0.06}


  6%|▌         | 961/15547 [03:23<48:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.694291220841365e-05, 'epoch': 0.06}


  6%|▌         | 971/15547 [03:25<48:20,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.687645377816177e-05, 'epoch': 0.06}


  6%|▋         | 981/15547 [03:27<48:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.680999534790989e-05, 'epoch': 0.06}


  6%|▋         | 991/15547 [03:29<48:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.6743536917658e-05, 'epoch': 0.06}


  6%|▋         | 1000/15547 [03:31<53:51,  4.50it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.667707848740613e-05, 'epoch': 0.06}


                                                    
  6%|▋         | 1000/15547 [03:34<53:51,  4.50it/s]Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'eval_loss': 9.75561579252826e-06, 'eval_runtime': 3.0069, 'eval_samples_per_second': 831.423, 'eval_steps_per_second': 13.303, 'epoch': 0.06}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  7%|▋         | 1011/15547 [03:37<56:22,  4.30it/s]  

{'loss': 0.0, 'learning_rate': 9.661062005715426e-05, 'epoch': 0.06}


  7%|▋         | 1021/15547 [03:39<49:09,  4.92it/s]

{'loss': 0.0, 'learning_rate': 9.654416162690238e-05, 'epoch': 0.07}


  7%|▋         | 1031/15547 [03:41<48:31,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.64777031966505e-05, 'epoch': 0.07}


  7%|▋         | 1041/15547 [03:43<48:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.641124476639862e-05, 'epoch': 0.07}


  7%|▋         | 1051/15547 [03:45<48:13,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.634478633614675e-05, 'epoch': 0.07}


  7%|▋         | 1061/15547 [03:47<48:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.627832790589488e-05, 'epoch': 0.07}


  7%|▋         | 1071/15547 [03:49<48:24,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.621186947564299e-05, 'epoch': 0.07}


  7%|▋         | 1081/15547 [03:51<48:36,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.61454110453911e-05, 'epoch': 0.07}


  7%|▋         | 1091/15547 [03:53<48:21,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.607895261513923e-05, 'epoch': 0.07}


  7%|▋         | 1101/15547 [03:55<52:16,  4.61it/s]

{'loss': 0.0, 'learning_rate': 9.601249418488736e-05, 'epoch': 0.07}


  7%|▋         | 1111/15547 [03:57<48:31,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.594603575463548e-05, 'epoch': 0.07}


  7%|▋         | 1121/15547 [03:59<48:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.58795773243836e-05, 'epoch': 0.07}


  7%|▋         | 1131/15547 [04:01<48:12,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.581311889413172e-05, 'epoch': 0.07}


  7%|▋         | 1141/15547 [04:03<48:17,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.574666046387985e-05, 'epoch': 0.07}


  7%|▋         | 1151/15547 [04:05<48:02,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.568020203362798e-05, 'epoch': 0.07}


  7%|▋         | 1161/15547 [04:07<48:06,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.56137436033761e-05, 'epoch': 0.07}


  8%|▊         | 1171/15547 [04:09<48:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.554728517312421e-05, 'epoch': 0.08}


  8%|▊         | 1181/15547 [04:11<47:55,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.548082674287234e-05, 'epoch': 0.08}


  8%|▊         | 1191/15547 [04:13<47:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.541436831262047e-05, 'epoch': 0.08}


  8%|▊         | 1201/15547 [04:15<52:21,  4.57it/s]

{'loss': 0.0, 'learning_rate': 9.534790988236858e-05, 'epoch': 0.08}


  8%|▊         | 1211/15547 [04:17<48:11,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.528145145211671e-05, 'epoch': 0.08}


  8%|▊         | 1221/15547 [04:19<47:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.521499302186482e-05, 'epoch': 0.08}


  8%|▊         | 1231/15547 [04:21<47:55,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.514853459161295e-05, 'epoch': 0.08}


  8%|▊         | 1241/15547 [04:23<47:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.508207616136108e-05, 'epoch': 0.08}


  8%|▊         | 1250/15547 [04:25<46:36,  5.11it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.50156177311092e-05, 'epoch': 0.08}


                                                    
  8%|▊         | 1251/15547 [04:28<4:23:00,  1.10s/it]

{'eval_loss': 6.448341082432307e-06, 'eval_runtime': 3.0102, 'eval_samples_per_second': 830.509, 'eval_steps_per_second': 13.288, 'epoch': 0.08}


  8%|▊         | 1261/15547 [04:30<53:41,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 9.494915930085731e-05, 'epoch': 0.08}


  8%|▊         | 1271/15547 [04:32<47:36,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.488270087060544e-05, 'epoch': 0.08}


  8%|▊         | 1281/15547 [04:34<47:21,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.481624244035357e-05, 'epoch': 0.08}


  8%|▊         | 1291/15547 [04:36<47:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.474978401010168e-05, 'epoch': 0.08}


  8%|▊         | 1301/15547 [04:38<51:20,  4.62it/s]

{'loss': 0.0, 'learning_rate': 9.468332557984981e-05, 'epoch': 0.08}


  8%|▊         | 1311/15547 [04:40<47:21,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.461686714959793e-05, 'epoch': 0.08}


  8%|▊         | 1321/15547 [04:42<47:26,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.455040871934605e-05, 'epoch': 0.08}


  9%|▊         | 1331/15547 [04:44<47:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.448395028909418e-05, 'epoch': 0.09}


  9%|▊         | 1341/15547 [04:46<48:06,  4.92it/s]

{'loss': 0.0, 'learning_rate': 9.44174918588423e-05, 'epoch': 0.09}


  9%|▊         | 1351/15547 [04:48<47:25,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.435103342859041e-05, 'epoch': 0.09}


  9%|▉         | 1361/15547 [04:50<47:24,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.428457499833856e-05, 'epoch': 0.09}


  9%|▉         | 1371/15547 [04:52<47:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.421811656808667e-05, 'epoch': 0.09}


  9%|▉         | 1381/15547 [04:54<47:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.415165813783479e-05, 'epoch': 0.09}


  9%|▉         | 1391/15547 [04:56<47:27,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.408519970758291e-05, 'epoch': 0.09}


  9%|▉         | 1401/15547 [04:58<51:29,  4.58it/s]

{'loss': 0.0, 'learning_rate': 9.401874127733103e-05, 'epoch': 0.09}


  9%|▉         | 1411/15547 [05:00<46:35,  5.06it/s]

{'loss': 0.0, 'learning_rate': 9.395228284707916e-05, 'epoch': 0.09}


  9%|▉         | 1421/15547 [05:02<47:21,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.388582441682729e-05, 'epoch': 0.09}


  9%|▉         | 1431/15547 [05:04<46:50,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.38193659865754e-05, 'epoch': 0.09}


  9%|▉         | 1441/15547 [05:06<46:43,  5.03it/s]

{'loss': 0.0, 'learning_rate': 9.375290755632352e-05, 'epoch': 0.09}


  9%|▉         | 1451/15547 [05:08<47:01,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.368644912607166e-05, 'epoch': 0.09}


  9%|▉         | 1461/15547 [05:10<46:47,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.361999069581977e-05, 'epoch': 0.09}


  9%|▉         | 1471/15547 [05:12<46:52,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.355353226556789e-05, 'epoch': 0.09}


 10%|▉         | 1481/15547 [05:14<46:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.348707383531602e-05, 'epoch': 0.1}


 10%|▉         | 1491/15547 [05:16<47:09,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.342061540506414e-05, 'epoch': 0.1}


 10%|▉         | 1500/15547 [05:18<53:18,  4.39it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.335415697481226e-05, 'epoch': 0.1}


                                                    
 10%|▉         | 1500/15547 [05:21<53:18,  4.39it/s]Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json


{'eval_loss': 4.833160346606746e-06, 'eval_runtime': 3.0356, 'eval_samples_per_second': 823.556, 'eval_steps_per_second': 13.177, 'epoch': 0.1}


Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 10%|▉         | 1511/15547 [05:24<55:22,  4.22it/s]  

{'loss': 0.0, 'learning_rate': 9.328769854456039e-05, 'epoch': 0.1}


 10%|▉         | 1521/15547 [05:26<46:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.32212401143085e-05, 'epoch': 0.1}


 10%|▉         | 1531/15547 [05:28<46:06,  5.07it/s]

{'loss': 0.0, 'learning_rate': 9.315478168405662e-05, 'epoch': 0.1}


 10%|▉         | 1541/15547 [05:30<46:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.308832325380476e-05, 'epoch': 0.1}


 10%|▉         | 1551/15547 [05:32<46:46,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.302186482355287e-05, 'epoch': 0.1}


 10%|█         | 1561/15547 [05:34<46:18,  5.03it/s]

{'loss': 0.0, 'learning_rate': 9.295540639330099e-05, 'epoch': 0.1}


 10%|█         | 1571/15547 [05:36<46:58,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.288894796304912e-05, 'epoch': 0.1}


 10%|█         | 1581/15547 [05:38<46:48,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.282248953279725e-05, 'epoch': 0.1}


 10%|█         | 1591/15547 [05:40<46:49,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.275603110254536e-05, 'epoch': 0.1}


 10%|█         | 1601/15547 [05:42<50:43,  4.58it/s]

{'loss': 0.0, 'learning_rate': 9.268957267229349e-05, 'epoch': 0.1}


 10%|█         | 1611/15547 [05:44<46:59,  4.94it/s]

{'loss': 0.0, 'learning_rate': 9.26231142420416e-05, 'epoch': 0.1}


 10%|█         | 1621/15547 [05:46<46:46,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.255665581178973e-05, 'epoch': 0.1}


 10%|█         | 1631/15547 [05:48<46:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.249019738153786e-05, 'epoch': 0.1}


 11%|█         | 1641/15547 [05:50<46:06,  5.03it/s]

{'loss': 0.0, 'learning_rate': 9.242373895128598e-05, 'epoch': 0.11}


 11%|█         | 1651/15547 [05:52<46:55,  4.93it/s]

{'loss': 0.0, 'learning_rate': 9.235728052103409e-05, 'epoch': 0.11}


 11%|█         | 1661/15547 [05:54<46:27,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.229082209078222e-05, 'epoch': 0.11}


 11%|█         | 1671/15547 [05:56<46:25,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.222436366053035e-05, 'epoch': 0.11}


 11%|█         | 1681/15547 [05:58<46:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.215790523027846e-05, 'epoch': 0.11}


 11%|█         | 1691/15547 [06:00<46:21,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.209144680002659e-05, 'epoch': 0.11}


 11%|█         | 1701/15547 [06:02<50:04,  4.61it/s]

{'loss': 0.0, 'learning_rate': 9.202498836977471e-05, 'epoch': 0.11}


 11%|█         | 1711/15547 [06:04<45:45,  5.04it/s]

{'loss': 0.0, 'learning_rate': 9.195852993952284e-05, 'epoch': 0.11}


 11%|█         | 1721/15547 [06:06<45:52,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.189207150927096e-05, 'epoch': 0.11}


 11%|█         | 1731/15547 [06:08<45:58,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.182561307901908e-05, 'epoch': 0.11}


 11%|█         | 1741/15547 [06:10<46:08,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.17591546487672e-05, 'epoch': 0.11}


 11%|█▏        | 1750/15547 [06:12<45:57,  5.00it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.169269621851532e-05, 'epoch': 0.11}


                                                    
 11%|█▏        | 1751/15547 [06:15<4:15:13,  1.11s/it]

{'eval_loss': 3.747941946130595e-06, 'eval_runtime': 3.0222, 'eval_samples_per_second': 827.216, 'eval_steps_per_second': 13.235, 'epoch': 0.11}


 11%|█▏        | 1761/15547 [06:17<52:27,  4.38it/s]  

{'loss': 0.0, 'learning_rate': 9.162623778826345e-05, 'epoch': 0.11}


 11%|█▏        | 1771/15547 [06:19<46:16,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.155977935801157e-05, 'epoch': 0.11}


 11%|█▏        | 1781/15547 [06:21<46:17,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.14933209277597e-05, 'epoch': 0.11}


 12%|█▏        | 1791/15547 [06:23<46:15,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.142686249750781e-05, 'epoch': 0.12}


 12%|█▏        | 1801/15547 [06:25<50:13,  4.56it/s]

{'loss': 0.0, 'learning_rate': 9.136040406725594e-05, 'epoch': 0.12}


 12%|█▏        | 1811/15547 [06:27<46:03,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.129394563700407e-05, 'epoch': 0.12}


 12%|█▏        | 1821/15547 [06:29<46:02,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.122748720675218e-05, 'epoch': 0.12}


 12%|█▏        | 1831/15547 [06:31<46:02,  4.96it/s]

{'loss': 0.0, 'learning_rate': 9.11610287765003e-05, 'epoch': 0.12}


 12%|█▏        | 1841/15547 [06:33<45:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.109457034624843e-05, 'epoch': 0.12}


 12%|█▏        | 1851/15547 [06:35<45:48,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.102811191599655e-05, 'epoch': 0.12}


 12%|█▏        | 1861/15547 [06:37<44:59,  5.07it/s]

{'loss': 0.0, 'learning_rate': 9.096165348574467e-05, 'epoch': 0.12}


 12%|█▏        | 1871/15547 [06:39<45:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.08951950554928e-05, 'epoch': 0.12}


 12%|█▏        | 1881/15547 [06:41<45:11,  5.04it/s]

{'loss': 0.0, 'learning_rate': 9.082873662524091e-05, 'epoch': 0.12}


 12%|█▏        | 1891/15547 [06:43<45:24,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.076227819498904e-05, 'epoch': 0.12}


 12%|█▏        | 1901/15547 [06:45<50:43,  4.48it/s]

{'loss': 0.0, 'learning_rate': 9.069581976473717e-05, 'epoch': 0.12}


 12%|█▏        | 1911/15547 [06:47<45:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.062936133448528e-05, 'epoch': 0.12}


 12%|█▏        | 1921/15547 [06:49<45:20,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.05629029042334e-05, 'epoch': 0.12}


 12%|█▏        | 1931/15547 [06:51<45:22,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.049644447398153e-05, 'epoch': 0.12}


 12%|█▏        | 1941/15547 [06:53<45:07,  5.03it/s]

{'loss': 0.0, 'learning_rate': 9.042998604372966e-05, 'epoch': 0.12}


 13%|█▎        | 1951/15547 [06:55<45:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.036352761347777e-05, 'epoch': 0.13}


 13%|█▎        | 1961/15547 [06:57<45:23,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.02970691832259e-05, 'epoch': 0.13}


 13%|█▎        | 1971/15547 [06:59<45:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.023061075297401e-05, 'epoch': 0.13}


 13%|█▎        | 1981/15547 [07:01<45:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.016415232272214e-05, 'epoch': 0.13}


 13%|█▎        | 1991/15547 [07:03<45:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.009769389247027e-05, 'epoch': 0.13}


 13%|█▎        | 2000/15547 [07:05<49:54,  4.52it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 9.003123546221839e-05, 'epoch': 0.13}


                                                    
 13%|█▎        | 2000/15547 [07:08<49:54,  4.52it/s]Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json


{'eval_loss': 3.0426838293351466e-06, 'eval_runtime': 3.0558, 'eval_samples_per_second': 818.107, 'eval_steps_per_second': 13.09, 'epoch': 0.13}


Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 13%|█▎        | 2011/15547 [07:11<52:44,  4.28it/s]  

{'loss': 0.0, 'learning_rate': 8.99647770319665e-05, 'epoch': 0.13}


 13%|█▎        | 2021/15547 [07:13<45:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.989831860171464e-05, 'epoch': 0.13}


 13%|█▎        | 2031/15547 [07:15<45:22,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.983186017146276e-05, 'epoch': 0.13}


 13%|█▎        | 2041/15547 [07:17<45:55,  4.90it/s]

{'loss': 0.0, 'learning_rate': 8.976540174121087e-05, 'epoch': 0.13}


 13%|█▎        | 2051/15547 [07:19<44:41,  5.03it/s]

{'loss': 0.0, 'learning_rate': 8.9698943310959e-05, 'epoch': 0.13}


 13%|█▎        | 2061/15547 [07:21<44:57,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.963248488070712e-05, 'epoch': 0.13}


 13%|█▎        | 2071/15547 [07:23<44:45,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.956602645045525e-05, 'epoch': 0.13}


 13%|█▎        | 2081/15547 [07:25<44:48,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.949956802020337e-05, 'epoch': 0.13}


 13%|█▎        | 2091/15547 [07:27<44:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.943310958995149e-05, 'epoch': 0.13}


 14%|█▎        | 2101/15547 [07:29<49:39,  4.51it/s]

{'loss': 0.0, 'learning_rate': 8.93666511596996e-05, 'epoch': 0.14}


 14%|█▎        | 2111/15547 [07:31<44:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.930019272944773e-05, 'epoch': 0.14}


 14%|█▎        | 2121/15547 [07:33<44:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.923373429919586e-05, 'epoch': 0.14}


 14%|█▎        | 2131/15547 [07:35<44:48,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.916727586894398e-05, 'epoch': 0.14}


 14%|█▍        | 2141/15547 [07:37<44:45,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.910081743869209e-05, 'epoch': 0.14}


 14%|█▍        | 2151/15547 [07:39<44:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.903435900844023e-05, 'epoch': 0.14}


 14%|█▍        | 2161/15547 [07:41<44:36,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.896790057818835e-05, 'epoch': 0.14}


 14%|█▍        | 2171/15547 [07:43<44:28,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.890144214793646e-05, 'epoch': 0.14}


 14%|█▍        | 2181/15547 [07:45<45:18,  4.92it/s]

{'loss': 0.0, 'learning_rate': 8.883498371768459e-05, 'epoch': 0.14}


 14%|█▍        | 2191/15547 [07:47<44:56,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.876852528743272e-05, 'epoch': 0.14}


 14%|█▍        | 2201/15547 [07:49<48:32,  4.58it/s]

{'loss': 0.0, 'learning_rate': 8.870206685718083e-05, 'epoch': 0.14}


 14%|█▍        | 2211/15547 [07:51<44:46,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.863560842692896e-05, 'epoch': 0.14}


 14%|█▍        | 2221/15547 [07:53<44:34,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.856914999667708e-05, 'epoch': 0.14}


 14%|█▍        | 2231/15547 [07:55<44:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.85026915664252e-05, 'epoch': 0.14}


 14%|█▍        | 2241/15547 [07:57<44:28,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.843623313617334e-05, 'epoch': 0.14}


 14%|█▍        | 2250/15547 [07:59<44:13,  5.01it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.836977470592145e-05, 'epoch': 0.14}


                                                    
 14%|█▍        | 2251/15547 [08:02<4:05:33,  1.11s/it]

{'eval_loss': 2.1167520571907517e-06, 'eval_runtime': 3.0162, 'eval_samples_per_second': 828.853, 'eval_steps_per_second': 13.262, 'epoch': 0.14}


 15%|█▍        | 2261/15547 [08:04<50:11,  4.41it/s]  

{'loss': 0.0, 'learning_rate': 8.830331627566957e-05, 'epoch': 0.15}


 15%|█▍        | 2271/15547 [08:06<44:34,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.82368578454177e-05, 'epoch': 0.15}


 15%|█▍        | 2281/15547 [08:08<44:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.817039941516582e-05, 'epoch': 0.15}


 15%|█▍        | 2291/15547 [08:10<44:09,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.810394098491394e-05, 'epoch': 0.15}


 15%|█▍        | 2301/15547 [08:12<48:08,  4.59it/s]

{'loss': 0.0, 'learning_rate': 8.803748255466207e-05, 'epoch': 0.15}


 15%|█▍        | 2311/15547 [08:14<44:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.797102412441018e-05, 'epoch': 0.15}


 15%|█▍        | 2321/15547 [08:16<43:58,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.790456569415831e-05, 'epoch': 0.15}


 15%|█▍        | 2331/15547 [08:18<44:14,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.783810726390644e-05, 'epoch': 0.15}


 15%|█▌        | 2341/15547 [08:20<44:00,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.777164883365455e-05, 'epoch': 0.15}


 15%|█▌        | 2351/15547 [08:22<43:58,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.770519040340267e-05, 'epoch': 0.15}


 15%|█▌        | 2361/15547 [08:24<44:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.76387319731508e-05, 'epoch': 0.15}


 15%|█▌        | 2371/15547 [08:26<44:21,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.757227354289892e-05, 'epoch': 0.15}


 15%|█▌        | 2381/15547 [08:28<44:08,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.750581511264704e-05, 'epoch': 0.15}


 15%|█▌        | 2391/15547 [08:30<44:05,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.743935668239517e-05, 'epoch': 0.15}


 15%|█▌        | 2401/15547 [08:32<47:26,  4.62it/s]

{'loss': 0.0, 'learning_rate': 8.737289825214328e-05, 'epoch': 0.15}


 16%|█▌        | 2411/15547 [08:34<44:24,  4.93it/s]

{'loss': 0.0, 'learning_rate': 8.730643982189141e-05, 'epoch': 0.16}


 16%|█▌        | 2421/15547 [08:36<44:05,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.723998139163954e-05, 'epoch': 0.16}


 16%|█▌        | 2431/15547 [08:38<44:08,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.717352296138766e-05, 'epoch': 0.16}


 16%|█▌        | 2441/15547 [08:40<43:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.710706453113577e-05, 'epoch': 0.16}


 16%|█▌        | 2451/15547 [08:42<44:00,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.70406061008839e-05, 'epoch': 0.16}


 16%|█▌        | 2461/15547 [08:44<43:54,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.697414767063203e-05, 'epoch': 0.16}


 16%|█▌        | 2471/15547 [08:46<43:45,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.690768924038014e-05, 'epoch': 0.16}


 16%|█▌        | 2481/15547 [08:48<43:57,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.684123081012827e-05, 'epoch': 0.16}


 16%|█▌        | 2491/15547 [08:50<43:49,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.677477237987639e-05, 'epoch': 0.16}


 16%|█▌        | 2500/15547 [08:52<50:13,  4.33it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.670831394962451e-05, 'epoch': 0.16}


                                                    
 16%|█▌        | 2500/15547 [08:55<50:13,  4.33it/s]Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json


{'eval_loss': 1.673006295277446e-06, 'eval_runtime': 3.0244, 'eval_samples_per_second': 826.602, 'eval_steps_per_second': 13.226, 'epoch': 0.16}


Model weights saved in ./results/checkpoint-2500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 16%|█▌        | 2511/15547 [08:58<51:14,  4.24it/s]  

{'loss': 0.0, 'learning_rate': 8.664185551937264e-05, 'epoch': 0.16}


 16%|█▌        | 2521/15547 [09:00<43:43,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.657539708912076e-05, 'epoch': 0.16}


 16%|█▋        | 2531/15547 [09:02<43:30,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.650893865886887e-05, 'epoch': 0.16}


 16%|█▋        | 2541/15547 [09:04<43:27,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.6442480228617e-05, 'epoch': 0.16}


 16%|█▋        | 2551/15547 [09:06<43:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.637602179836513e-05, 'epoch': 0.16}


 16%|█▋        | 2561/15547 [09:08<43:17,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.630956336811324e-05, 'epoch': 0.16}


 17%|█▋        | 2571/15547 [09:10<43:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.624310493786137e-05, 'epoch': 0.17}


 17%|█▋        | 2581/15547 [09:12<43:20,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.617664650760949e-05, 'epoch': 0.17}


 17%|█▋        | 2591/15547 [09:14<43:15,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.611018807735762e-05, 'epoch': 0.17}


 17%|█▋        | 2601/15547 [09:16<46:48,  4.61it/s]

{'loss': 0.0, 'learning_rate': 8.604372964710574e-05, 'epoch': 0.17}


 17%|█▋        | 2611/15547 [09:18<43:11,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.597727121685386e-05, 'epoch': 0.17}


 17%|█▋        | 2621/15547 [09:20<43:16,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.591081278660197e-05, 'epoch': 0.17}


 17%|█▋        | 2631/15547 [09:22<43:18,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.58443543563501e-05, 'epoch': 0.17}


 17%|█▋        | 2641/15547 [09:24<43:17,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.577789592609823e-05, 'epoch': 0.17}


 17%|█▋        | 2651/15547 [09:26<43:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.571143749584635e-05, 'epoch': 0.17}


 17%|█▋        | 2661/15547 [09:28<43:04,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.564497906559448e-05, 'epoch': 0.17}


 17%|█▋        | 2671/15547 [09:30<42:58,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.557852063534259e-05, 'epoch': 0.17}


 17%|█▋        | 2681/15547 [09:32<43:06,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.551206220509072e-05, 'epoch': 0.17}


 17%|█▋        | 2691/15547 [09:34<42:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.544560377483885e-05, 'epoch': 0.17}


 17%|█▋        | 2701/15547 [09:36<46:00,  4.65it/s]

{'loss': 0.0, 'learning_rate': 8.537914534458696e-05, 'epoch': 0.17}


 17%|█▋        | 2711/15547 [09:38<42:59,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.531268691433508e-05, 'epoch': 0.17}


 18%|█▊        | 2721/15547 [09:40<43:01,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.524622848408322e-05, 'epoch': 0.17}


 18%|█▊        | 2731/15547 [09:42<42:46,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.517977005383133e-05, 'epoch': 0.18}


 18%|█▊        | 2741/15547 [09:44<42:46,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.511331162357945e-05, 'epoch': 0.18}


 18%|█▊        | 2750/15547 [09:46<42:32,  5.01it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.504685319332758e-05, 'epoch': 0.18}


                                                    
 18%|█▊        | 2751/15547 [09:49<3:56:21,  1.11s/it]

{'eval_loss': 1.4885349628457334e-06, 'eval_runtime': 3.0148, 'eval_samples_per_second': 829.239, 'eval_steps_per_second': 13.268, 'epoch': 0.18}


 18%|█▊        | 2761/15547 [09:51<48:03,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 8.498039476307569e-05, 'epoch': 0.18}


 18%|█▊        | 2771/15547 [09:53<42:44,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.491393633282382e-05, 'epoch': 0.18}


 18%|█▊        | 2781/15547 [09:55<42:47,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.484747790257195e-05, 'epoch': 0.18}


 18%|█▊        | 2791/15547 [09:57<42:50,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.478101947232006e-05, 'epoch': 0.18}


 18%|█▊        | 2801/15547 [09:59<46:52,  4.53it/s]

{'loss': 0.0, 'learning_rate': 8.471456104206818e-05, 'epoch': 0.18}


 18%|█▊        | 2811/15547 [10:01<42:53,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.464810261181632e-05, 'epoch': 0.18}


 18%|█▊        | 2821/15547 [10:03<42:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.458164418156444e-05, 'epoch': 0.18}


 18%|█▊        | 2831/15547 [10:05<42:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.451518575131255e-05, 'epoch': 0.18}


 18%|█▊        | 2841/15547 [10:07<43:00,  4.92it/s]

{'loss': 0.0, 'learning_rate': 8.444872732106068e-05, 'epoch': 0.18}


 18%|█▊        | 2851/15547 [10:09<42:32,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.438226889080881e-05, 'epoch': 0.18}


 18%|█▊        | 2861/15547 [10:11<42:31,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.431581046055692e-05, 'epoch': 0.18}


 18%|█▊        | 2871/15547 [10:13<42:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.424935203030505e-05, 'epoch': 0.18}


 19%|█▊        | 2881/15547 [10:15<42:28,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.418289360005317e-05, 'epoch': 0.19}


 19%|█▊        | 2891/15547 [10:17<42:26,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.411643516980128e-05, 'epoch': 0.19}


 19%|█▊        | 2901/15547 [10:20<45:36,  4.62it/s]

{'loss': 0.0, 'learning_rate': 8.404997673954942e-05, 'epoch': 0.19}


 19%|█▊        | 2911/15547 [10:22<42:23,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.398351830929754e-05, 'epoch': 0.19}


 19%|█▉        | 2921/15547 [10:24<42:15,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.391705987904565e-05, 'epoch': 0.19}


 19%|█▉        | 2931/15547 [10:26<41:33,  5.06it/s]

{'loss': 0.0, 'learning_rate': 8.385060144879378e-05, 'epoch': 0.19}


 19%|█▉        | 2941/15547 [10:28<41:37,  5.05it/s]

{'loss': 0.0, 'learning_rate': 8.378414301854191e-05, 'epoch': 0.19}


 19%|█▉        | 2951/15547 [10:29<41:36,  5.05it/s]

{'loss': 0.0, 'learning_rate': 8.371768458829003e-05, 'epoch': 0.19}


 19%|█▉        | 2961/15547 [10:31<41:34,  5.05it/s]

{'loss': 0.0, 'learning_rate': 8.365122615803815e-05, 'epoch': 0.19}


 19%|█▉        | 2971/15547 [10:33<41:46,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.358476772778627e-05, 'epoch': 0.19}


 19%|█▉        | 2981/15547 [10:35<41:43,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.35183092975344e-05, 'epoch': 0.19}


 19%|█▉        | 2991/15547 [10:37<41:45,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.345185086728253e-05, 'epoch': 0.19}


 19%|█▉        | 3000/15547 [10:39<46:15,  4.52it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.338539243703064e-05, 'epoch': 0.19}


                                                    
 19%|█▉        | 3000/15547 [10:42<46:15,  4.52it/s]Saving model checkpoint to ./results/checkpoint-3000
Configuration saved in ./results/checkpoint-3000/config.json


{'eval_loss': 1.2355488934190362e-06, 'eval_runtime': 3.0299, 'eval_samples_per_second': 825.098, 'eval_steps_per_second': 13.202, 'epoch': 0.19}


Model weights saved in ./results/checkpoint-3000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 19%|█▉        | 3011/15547 [10:45<48:49,  4.28it/s]  

{'loss': 0.0, 'learning_rate': 8.331893400677876e-05, 'epoch': 0.19}


 19%|█▉        | 3021/15547 [10:47<41:54,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.325247557652688e-05, 'epoch': 0.19}


 19%|█▉        | 3031/15547 [10:49<41:33,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.318601714627501e-05, 'epoch': 0.19}


 20%|█▉        | 3041/15547 [10:51<41:53,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.311955871602313e-05, 'epoch': 0.2}


 20%|█▉        | 3051/15547 [10:53<41:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.305310028577126e-05, 'epoch': 0.2}


 20%|█▉        | 3061/15547 [10:55<41:37,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.298664185551937e-05, 'epoch': 0.2}


 20%|█▉        | 3071/15547 [10:57<41:24,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.29201834252675e-05, 'epoch': 0.2}


 20%|█▉        | 3081/15547 [10:59<41:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.285372499501563e-05, 'epoch': 0.2}


 20%|█▉        | 3091/15547 [11:01<41:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.278726656476374e-05, 'epoch': 0.2}


 20%|█▉        | 3101/15547 [11:03<44:33,  4.65it/s]

{'loss': 0.0, 'learning_rate': 8.272080813451186e-05, 'epoch': 0.2}


 20%|██        | 3111/15547 [11:05<41:53,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.265434970425999e-05, 'epoch': 0.2}


 20%|██        | 3121/15547 [11:07<41:34,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.258789127400812e-05, 'epoch': 0.2}


 20%|██        | 3131/15547 [11:09<41:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.252143284375623e-05, 'epoch': 0.2}


 20%|██        | 3141/15547 [11:11<41:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.245497441350436e-05, 'epoch': 0.2}


 20%|██        | 3151/15547 [11:13<41:20,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.238851598325247e-05, 'epoch': 0.2}


 20%|██        | 3161/15547 [11:15<41:24,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.23220575530006e-05, 'epoch': 0.2}


 20%|██        | 3171/15547 [11:17<41:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.225559912274873e-05, 'epoch': 0.2}


 20%|██        | 3181/15547 [11:19<41:34,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.218914069249685e-05, 'epoch': 0.2}


 21%|██        | 3191/15547 [11:21<41:58,  4.91it/s]

{'loss': 0.0, 'learning_rate': 8.212268226224496e-05, 'epoch': 0.21}


 21%|██        | 3201/15547 [11:23<44:53,  4.58it/s]

{'loss': 0.0, 'learning_rate': 8.205622383199309e-05, 'epoch': 0.21}


 21%|██        | 3211/15547 [11:25<41:35,  4.94it/s]

{'loss': 0.0, 'learning_rate': 8.198976540174122e-05, 'epoch': 0.21}


 21%|██        | 3221/15547 [11:27<41:18,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.192330697148933e-05, 'epoch': 0.21}


 21%|██        | 3231/15547 [11:29<41:27,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.185684854123746e-05, 'epoch': 0.21}


 21%|██        | 3241/15547 [11:31<41:17,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.179039011098558e-05, 'epoch': 0.21}


 21%|██        | 3250/15547 [11:33<40:55,  5.01it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.17239316807337e-05, 'epoch': 0.21}


                                                    
 21%|██        | 3251/15547 [11:37<3:48:33,  1.12s/it]

{'eval_loss': 9.565964091962087e-07, 'eval_runtime': 3.0388, 'eval_samples_per_second': 822.704, 'eval_steps_per_second': 13.163, 'epoch': 0.21}


 21%|██        | 3261/15547 [11:39<46:26,  4.41it/s]  

{'loss': 0.0, 'learning_rate': 8.165747325048183e-05, 'epoch': 0.21}


 21%|██        | 3271/15547 [11:41<41:16,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.159101482022995e-05, 'epoch': 0.21}


 21%|██        | 3281/15547 [11:43<41:09,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.152455638997806e-05, 'epoch': 0.21}


 21%|██        | 3291/15547 [11:45<41:03,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.145809795972619e-05, 'epoch': 0.21}


 21%|██        | 3301/15547 [11:47<44:25,  4.59it/s]

{'loss': 0.0, 'learning_rate': 8.139163952947432e-05, 'epoch': 0.21}


 21%|██▏       | 3311/15547 [11:49<41:11,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.132518109922244e-05, 'epoch': 0.21}


 21%|██▏       | 3321/15547 [11:51<40:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.125872266897056e-05, 'epoch': 0.21}


 21%|██▏       | 3331/15547 [11:53<40:39,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.119226423871868e-05, 'epoch': 0.21}


 21%|██▏       | 3341/15547 [11:55<40:57,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.112580580846681e-05, 'epoch': 0.21}


 22%|██▏       | 3351/15547 [11:57<41:02,  4.95it/s]

{'loss': 0.0, 'learning_rate': 8.105934737821494e-05, 'epoch': 0.22}


 22%|██▏       | 3361/15547 [11:59<40:53,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.099288894796305e-05, 'epoch': 0.22}


 22%|██▏       | 3371/15547 [12:01<40:45,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.092643051771117e-05, 'epoch': 0.22}


 22%|██▏       | 3381/15547 [12:03<40:34,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.085997208745931e-05, 'epoch': 0.22}


 22%|██▏       | 3391/15547 [12:05<40:39,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.079351365720742e-05, 'epoch': 0.22}


 22%|██▏       | 3401/15547 [12:07<43:09,  4.69it/s]

{'loss': 0.0, 'learning_rate': 8.072705522695554e-05, 'epoch': 0.22}


 22%|██▏       | 3411/15547 [12:09<40:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.066059679670367e-05, 'epoch': 0.22}


 22%|██▏       | 3421/15547 [12:11<40:39,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.059413836645178e-05, 'epoch': 0.22}


 22%|██▏       | 3431/15547 [12:13<40:18,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.052767993619991e-05, 'epoch': 0.22}


 22%|██▏       | 3441/15547 [12:15<40:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.046122150594804e-05, 'epoch': 0.22}


 22%|██▏       | 3451/15547 [12:17<40:08,  5.02it/s]

{'loss': 0.0, 'learning_rate': 8.039476307569615e-05, 'epoch': 0.22}


 22%|██▏       | 3461/15547 [12:19<40:24,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.032830464544427e-05, 'epoch': 0.22}


 22%|██▏       | 3471/15547 [12:21<40:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.026184621519241e-05, 'epoch': 0.22}


 22%|██▏       | 3481/15547 [12:23<40:07,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.019538778494053e-05, 'epoch': 0.22}


 22%|██▏       | 3491/15547 [12:25<40:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.012892935468864e-05, 'epoch': 0.22}


 23%|██▎       | 3500/15547 [12:26<44:40,  4.49it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.006247092443677e-05, 'epoch': 0.23}


                                                    
 23%|██▎       | 3500/15547 [12:29<44:40,  4.49it/s]Saving model checkpoint to ./results/checkpoint-3500
Configuration saved in ./results/checkpoint-3500/config.json


{'eval_loss': 7.933505798973783e-07, 'eval_runtime': 3.0423, 'eval_samples_per_second': 821.751, 'eval_steps_per_second': 13.148, 'epoch': 0.23}


Model weights saved in ./results/checkpoint-3500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 23%|██▎       | 3511/15547 [12:33<47:36,  4.21it/s]  

{'loss': 0.0, 'learning_rate': 7.99960124941849e-05, 'epoch': 0.23}


 23%|██▎       | 3521/15547 [12:35<40:30,  4.95it/s]

{'loss': 0.0, 'learning_rate': 7.992955406393301e-05, 'epoch': 0.23}


 23%|██▎       | 3531/15547 [12:37<40:07,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.986309563368114e-05, 'epoch': 0.23}


 23%|██▎       | 3541/15547 [12:39<40:11,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.979663720342926e-05, 'epoch': 0.23}


 23%|██▎       | 3551/15547 [12:41<40:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.973017877317738e-05, 'epoch': 0.23}


 23%|██▎       | 3561/15547 [12:43<40:16,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.966372034292551e-05, 'epoch': 0.23}


 23%|██▎       | 3571/15547 [12:45<40:10,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.959726191267363e-05, 'epoch': 0.23}


 23%|██▎       | 3581/15547 [12:47<40:05,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.953080348242174e-05, 'epoch': 0.23}


 23%|██▎       | 3591/15547 [12:49<40:08,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.946434505216987e-05, 'epoch': 0.23}


 23%|██▎       | 3601/15547 [12:51<43:05,  4.62it/s]

{'loss': 0.0, 'learning_rate': 7.9397886621918e-05, 'epoch': 0.23}


 23%|██▎       | 3611/15547 [12:53<40:19,  4.93it/s]

{'loss': 0.0, 'learning_rate': 7.933142819166611e-05, 'epoch': 0.23}


 23%|██▎       | 3621/15547 [12:55<39:54,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.926496976141424e-05, 'epoch': 0.23}


 23%|██▎       | 3631/15547 [12:57<39:55,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.919851133116236e-05, 'epoch': 0.23}


 23%|██▎       | 3641/15547 [12:59<40:03,  4.95it/s]

{'loss': 0.0, 'learning_rate': 7.913205290091049e-05, 'epoch': 0.23}


 23%|██▎       | 3651/15547 [13:01<39:47,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.906559447065862e-05, 'epoch': 0.23}


 24%|██▎       | 3661/15547 [13:03<39:34,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.899913604040673e-05, 'epoch': 0.24}


 24%|██▎       | 3671/15547 [13:05<39:44,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.893267761015484e-05, 'epoch': 0.24}


 24%|██▎       | 3681/15547 [13:07<39:43,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.886621917990297e-05, 'epoch': 0.24}


 24%|██▎       | 3691/15547 [13:09<39:36,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.87997607496511e-05, 'epoch': 0.24}


 24%|██▍       | 3701/15547 [13:11<42:46,  4.62it/s]

{'loss': 0.0, 'learning_rate': 7.873330231939922e-05, 'epoch': 0.24}


 24%|██▍       | 3711/15547 [13:13<39:52,  4.95it/s]

{'loss': 0.0, 'learning_rate': 7.866684388914735e-05, 'epoch': 0.24}


 24%|██▍       | 3721/15547 [13:15<39:27,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.860038545889546e-05, 'epoch': 0.24}


 24%|██▍       | 3731/15547 [13:17<39:28,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.853392702864359e-05, 'epoch': 0.24}


 24%|██▍       | 3741/15547 [13:19<39:32,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.846746859839172e-05, 'epoch': 0.24}


 24%|██▍       | 3750/15547 [13:21<39:25,  4.99it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.840101016813983e-05, 'epoch': 0.24}


                                                    
 24%|██▍       | 3751/15547 [13:24<3:39:03,  1.11s/it]

{'eval_loss': 6.117085717960435e-07, 'eval_runtime': 3.0304, 'eval_samples_per_second': 824.964, 'eval_steps_per_second': 13.199, 'epoch': 0.24}


 24%|██▍       | 3761/15547 [13:26<44:53,  4.38it/s]  

{'loss': 0.0, 'learning_rate': 7.833455173788795e-05, 'epoch': 0.24}


 24%|██▍       | 3771/15547 [13:28<39:27,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.826809330763608e-05, 'epoch': 0.24}


 24%|██▍       | 3781/15547 [13:30<39:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.82016348773842e-05, 'epoch': 0.24}


 24%|██▍       | 3791/15547 [13:32<39:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.813517644713232e-05, 'epoch': 0.24}


 24%|██▍       | 3801/15547 [13:34<42:17,  4.63it/s]

{'loss': 0.0, 'learning_rate': 7.806871801688045e-05, 'epoch': 0.24}


 25%|██▍       | 3811/15547 [13:36<39:20,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.800225958662856e-05, 'epoch': 0.25}


 25%|██▍       | 3821/15547 [13:38<38:57,  5.02it/s]

{'loss': 0.0, 'learning_rate': 7.793580115637669e-05, 'epoch': 0.25}


 25%|██▍       | 3831/15547 [13:40<39:00,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.786934272612482e-05, 'epoch': 0.25}


 25%|██▍       | 3841/15547 [13:42<39:09,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.780288429587293e-05, 'epoch': 0.25}


 25%|██▍       | 3851/15547 [13:44<39:09,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.773642586562105e-05, 'epoch': 0.25}


 25%|██▍       | 3861/15547 [13:46<39:00,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.766996743536918e-05, 'epoch': 0.25}


 25%|██▍       | 3871/15547 [13:48<39:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.76035090051173e-05, 'epoch': 0.25}


 25%|██▍       | 3881/15547 [13:50<39:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.753705057486542e-05, 'epoch': 0.25}


 25%|██▌       | 3891/15547 [13:52<39:09,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.747059214461355e-05, 'epoch': 0.25}


 25%|██▌       | 3901/15547 [13:54<42:25,  4.58it/s]

{'loss': 0.0, 'learning_rate': 7.740413371436167e-05, 'epoch': 0.25}


 25%|██▌       | 3911/15547 [13:56<39:01,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.73376752841098e-05, 'epoch': 0.25}


 25%|██▌       | 3921/15547 [13:58<38:48,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.727121685385792e-05, 'epoch': 0.25}


 25%|██▌       | 3931/15547 [14:00<38:51,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.720475842360604e-05, 'epoch': 0.25}


 25%|██▌       | 3941/15547 [14:02<38:57,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.713829999335415e-05, 'epoch': 0.25}


 25%|██▌       | 3951/15547 [14:04<38:42,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.707184156310228e-05, 'epoch': 0.25}


 25%|██▌       | 3961/15547 [14:06<38:40,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.700538313285041e-05, 'epoch': 0.25}


 26%|██▌       | 3971/15547 [14:08<38:56,  4.95it/s]

{'loss': 0.0, 'learning_rate': 7.693892470259852e-05, 'epoch': 0.26}


 26%|██▌       | 3981/15547 [14:10<38:52,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.687246627234665e-05, 'epoch': 0.26}


 26%|██▌       | 3991/15547 [14:12<38:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.680600784209477e-05, 'epoch': 0.26}


 26%|██▌       | 4000/15547 [14:14<42:43,  4.50it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.67395494118429e-05, 'epoch': 0.26}


                                                    
 26%|██▌       | 4000/15547 [14:17<42:43,  4.50it/s]Saving model checkpoint to ./results/checkpoint-4000
Configuration saved in ./results/checkpoint-4000/config.json


{'eval_loss': 4.3942225147475256e-07, 'eval_runtime': 3.0225, 'eval_samples_per_second': 827.124, 'eval_steps_per_second': 13.234, 'epoch': 0.26}


Model weights saved in ./results/checkpoint-4000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 26%|██▌       | 4011/15547 [14:20<45:23,  4.24it/s]  

{'loss': 0.0, 'learning_rate': 7.667309098159102e-05, 'epoch': 0.26}


 26%|██▌       | 4021/15547 [14:22<38:35,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.660663255133914e-05, 'epoch': 0.26}


 26%|██▌       | 4031/15547 [14:24<38:39,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.654017412108725e-05, 'epoch': 0.26}


 26%|██▌       | 4041/15547 [14:26<38:22,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.64737156908354e-05, 'epoch': 0.26}


 26%|██▌       | 4051/15547 [14:28<38:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.640725726058351e-05, 'epoch': 0.26}


 26%|██▌       | 4061/15547 [14:30<38:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.634079883033163e-05, 'epoch': 0.26}


 26%|██▌       | 4071/15547 [14:32<38:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.627434040007975e-05, 'epoch': 0.26}


 26%|██▌       | 4081/15547 [14:34<38:23,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.620788196982788e-05, 'epoch': 0.26}


 26%|██▋       | 4091/15547 [14:36<38:09,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.6141423539576e-05, 'epoch': 0.26}


 26%|██▋       | 4101/15547 [14:38<41:51,  4.56it/s]

{'loss': 0.0, 'learning_rate': 7.607496510932413e-05, 'epoch': 0.26}


 26%|██▋       | 4111/15547 [14:40<38:29,  4.95it/s]

{'loss': 0.0, 'learning_rate': 7.600850667907224e-05, 'epoch': 0.26}


 27%|██▋       | 4121/15547 [14:42<38:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.594204824882036e-05, 'epoch': 0.27}


 27%|██▋       | 4131/15547 [14:44<37:57,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.58755898185685e-05, 'epoch': 0.27}


 27%|██▋       | 4141/15547 [14:46<38:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.580913138831661e-05, 'epoch': 0.27}


 27%|██▋       | 4151/15547 [14:48<37:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.574267295806473e-05, 'epoch': 0.27}


 27%|██▋       | 4161/15547 [14:50<37:57,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.567621452781286e-05, 'epoch': 0.27}


 27%|██▋       | 4171/15547 [14:52<37:50,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.560975609756099e-05, 'epoch': 0.27}


 27%|██▋       | 4181/15547 [14:54<37:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.55432976673091e-05, 'epoch': 0.27}


 27%|██▋       | 4191/15547 [14:56<37:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.547683923705723e-05, 'epoch': 0.27}


 27%|██▋       | 4201/15547 [14:58<40:49,  4.63it/s]

{'loss': 0.0, 'learning_rate': 7.541038080680534e-05, 'epoch': 0.27}


 27%|██▋       | 4211/15547 [15:00<37:53,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.534392237655347e-05, 'epoch': 0.27}


 27%|██▋       | 4221/15547 [15:02<37:49,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.52774639463016e-05, 'epoch': 0.27}


 27%|██▋       | 4231/15547 [15:04<37:33,  5.02it/s]

{'loss': 0.0, 'learning_rate': 7.521100551604972e-05, 'epoch': 0.27}


 27%|██▋       | 4241/15547 [15:06<37:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.514454708579783e-05, 'epoch': 0.27}


 27%|██▋       | 4250/15547 [15:08<37:23,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.507808865554596e-05, 'epoch': 0.27}


                                                    
 27%|██▋       | 4251/15547 [15:11<3:27:57,  1.10s/it]

{'eval_loss': 3.3959153711293766e-07, 'eval_runtime': 3.0052, 'eval_samples_per_second': 831.888, 'eval_steps_per_second': 13.31, 'epoch': 0.27}


 27%|██▋       | 4261/15547 [15:13<42:43,  4.40it/s]  

{'loss': 0.0, 'learning_rate': 7.501163022529409e-05, 'epoch': 0.27}


 27%|██▋       | 4271/15547 [15:15<37:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.49451717950422e-05, 'epoch': 0.27}


 28%|██▊       | 4281/15547 [15:17<37:31,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.487871336479033e-05, 'epoch': 0.28}


 28%|██▊       | 4291/15547 [15:19<37:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.481225493453845e-05, 'epoch': 0.28}


 28%|██▊       | 4301/15547 [15:21<40:55,  4.58it/s]

{'loss': 0.0, 'learning_rate': 7.474579650428658e-05, 'epoch': 0.28}


 28%|██▊       | 4311/15547 [15:23<37:39,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.46793380740347e-05, 'epoch': 0.28}


 28%|██▊       | 4321/15547 [15:25<37:22,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.461287964378282e-05, 'epoch': 0.28}


 28%|██▊       | 4331/15547 [15:27<37:18,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.454642121353093e-05, 'epoch': 0.28}


 28%|██▊       | 4341/15547 [15:29<37:08,  5.03it/s]

{'loss': 0.0, 'learning_rate': 7.447996278327906e-05, 'epoch': 0.28}


 28%|██▊       | 4351/15547 [15:31<37:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.441350435302719e-05, 'epoch': 0.28}


 28%|██▊       | 4361/15547 [15:33<37:12,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.43470459227753e-05, 'epoch': 0.28}


 28%|██▊       | 4371/15547 [15:35<37:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.428058749252343e-05, 'epoch': 0.28}


 28%|██▊       | 4381/15547 [15:37<37:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.421412906227155e-05, 'epoch': 0.28}


 28%|██▊       | 4391/15547 [15:39<37:23,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.414767063201968e-05, 'epoch': 0.28}


 28%|██▊       | 4401/15547 [15:41<39:55,  4.65it/s]

{'loss': 0.0, 'learning_rate': 7.40812122017678e-05, 'epoch': 0.28}


 28%|██▊       | 4411/15547 [15:43<37:10,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.401475377151592e-05, 'epoch': 0.28}


 28%|██▊       | 4421/15547 [15:45<37:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.394829534126404e-05, 'epoch': 0.28}


 29%|██▊       | 4431/15547 [15:47<37:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.388183691101216e-05, 'epoch': 0.28}


 29%|██▊       | 4441/15547 [15:49<37:02,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.381537848076029e-05, 'epoch': 0.29}


 29%|██▊       | 4451/15547 [15:51<37:02,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.374892005050841e-05, 'epoch': 0.29}


 29%|██▊       | 4461/15547 [15:53<37:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.368246162025654e-05, 'epoch': 0.29}


 29%|██▉       | 4471/15547 [15:55<37:03,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.361600319000465e-05, 'epoch': 0.29}


 29%|██▉       | 4481/15547 [15:57<37:04,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.354954475975278e-05, 'epoch': 0.29}


 29%|██▉       | 4491/15547 [15:59<36:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.348308632950091e-05, 'epoch': 0.29}


 29%|██▉       | 4500/15547 [16:01<41:25,  4.44it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.341662789924902e-05, 'epoch': 0.29}


                                                    
 29%|██▉       | 4500/15547 [16:04<41:25,  4.44it/s]Saving model checkpoint to ./results/checkpoint-4500
Configuration saved in ./results/checkpoint-4500/config.json


{'eval_loss': 3.177656537900475e-07, 'eval_runtime': 3.0157, 'eval_samples_per_second': 828.984, 'eval_steps_per_second': 13.264, 'epoch': 0.29}


Model weights saved in ./results/checkpoint-4500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 29%|██▉       | 4511/15547 [16:07<43:24,  4.24it/s]  

{'loss': 0.0, 'learning_rate': 7.335016946899714e-05, 'epoch': 0.29}


 29%|██▉       | 4521/15547 [16:09<36:51,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.328371103874527e-05, 'epoch': 0.29}


 29%|██▉       | 4531/15547 [16:11<36:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.32172526084934e-05, 'epoch': 0.29}


 29%|██▉       | 4541/15547 [16:13<36:31,  5.02it/s]

{'loss': 0.0, 'learning_rate': 7.315079417824151e-05, 'epoch': 0.29}


 29%|██▉       | 4551/15547 [16:15<36:35,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.308433574798964e-05, 'epoch': 0.29}


 29%|██▉       | 4561/15547 [16:17<36:41,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.301787731773775e-05, 'epoch': 0.29}


 29%|██▉       | 4571/15547 [16:19<36:34,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.295141888748588e-05, 'epoch': 0.29}


 29%|██▉       | 4581/15547 [16:21<36:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.288496045723401e-05, 'epoch': 0.29}


 30%|██▉       | 4591/15547 [16:23<36:41,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.281850202698213e-05, 'epoch': 0.3}


 30%|██▉       | 4601/15547 [16:25<39:41,  4.60it/s]

{'loss': 0.0, 'learning_rate': 7.275204359673024e-05, 'epoch': 0.3}


 30%|██▉       | 4611/15547 [16:27<36:26,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.268558516647838e-05, 'epoch': 0.3}


 30%|██▉       | 4621/15547 [16:29<36:38,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.26191267362265e-05, 'epoch': 0.3}


 30%|██▉       | 4631/15547 [16:31<36:14,  5.02it/s]

{'loss': 0.0, 'learning_rate': 7.255266830597461e-05, 'epoch': 0.3}


 30%|██▉       | 4641/15547 [16:33<36:29,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.248620987572274e-05, 'epoch': 0.3}


 30%|██▉       | 4651/15547 [16:35<36:23,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.241975144547086e-05, 'epoch': 0.3}


 30%|██▉       | 4661/15547 [16:37<36:19,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.235329301521898e-05, 'epoch': 0.3}


 30%|███       | 4671/15547 [16:39<36:13,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.228683458496711e-05, 'epoch': 0.3}


 30%|███       | 4681/15547 [16:41<36:15,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.222037615471523e-05, 'epoch': 0.3}


 30%|███       | 4691/15547 [16:43<36:20,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.215391772446334e-05, 'epoch': 0.3}


 30%|███       | 4701/15547 [16:45<38:48,  4.66it/s]

{'loss': 0.0, 'learning_rate': 7.208745929421149e-05, 'epoch': 0.3}


 30%|███       | 4711/15547 [16:47<36:21,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.20210008639596e-05, 'epoch': 0.3}


 30%|███       | 4721/15547 [16:49<36:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.195454243370772e-05, 'epoch': 0.3}


 30%|███       | 4731/15547 [16:51<36:10,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.188808400345584e-05, 'epoch': 0.3}


 30%|███       | 4741/15547 [16:53<36:04,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.182162557320397e-05, 'epoch': 0.3}


 31%|███       | 4750/15547 [16:55<35:45,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.175516714295209e-05, 'epoch': 0.31}


                                                    
 31%|███       | 4751/15547 [16:58<3:18:25,  1.10s/it]

{'eval_loss': 3.061294364670175e-07, 'eval_runtime': 2.9993, 'eval_samples_per_second': 833.522, 'eval_steps_per_second': 13.336, 'epoch': 0.31}


 31%|███       | 4761/15547 [17:00<40:34,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 7.168870871270022e-05, 'epoch': 0.31}


 31%|███       | 4771/15547 [17:02<36:12,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.162225028244833e-05, 'epoch': 0.31}


 31%|███       | 4781/15547 [17:04<35:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.155579185219645e-05, 'epoch': 0.31}


 31%|███       | 4791/15547 [17:06<35:52,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.148933342194459e-05, 'epoch': 0.31}


 31%|███       | 4801/15547 [17:08<38:38,  4.64it/s]

{'loss': 0.0, 'learning_rate': 7.14228749916927e-05, 'epoch': 0.31}


 31%|███       | 4811/15547 [17:10<36:03,  4.96it/s]

{'loss': 0.0, 'learning_rate': 7.135641656144082e-05, 'epoch': 0.31}


 31%|███       | 4821/15547 [17:12<35:50,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.128995813118895e-05, 'epoch': 0.31}


 31%|███       | 4831/15547 [17:14<35:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.122349970093707e-05, 'epoch': 0.31}


 31%|███       | 4841/15547 [17:16<35:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.115704127068519e-05, 'epoch': 0.31}


 31%|███       | 4851/15547 [17:18<35:44,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.109058284043332e-05, 'epoch': 0.31}


 31%|███▏      | 4861/15547 [17:20<35:44,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.102412441018143e-05, 'epoch': 0.31}


 31%|███▏      | 4871/15547 [17:22<35:03,  5.08it/s]

{'loss': 0.0, 'learning_rate': 7.095766597992956e-05, 'epoch': 0.31}


 31%|███▏      | 4881/15547 [17:24<35:14,  5.04it/s]

{'loss': 0.0, 'learning_rate': 7.089120754967769e-05, 'epoch': 0.31}


 31%|███▏      | 4891/15547 [17:26<35:29,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.08247491194258e-05, 'epoch': 0.31}


 32%|███▏      | 4901/15547 [17:28<38:03,  4.66it/s]

{'loss': 0.0, 'learning_rate': 7.075829068917392e-05, 'epoch': 0.32}


 32%|███▏      | 4911/15547 [17:30<35:24,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.069183225892205e-05, 'epoch': 0.32}


 32%|███▏      | 4921/15547 [17:32<35:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.062537382867018e-05, 'epoch': 0.32}


 32%|███▏      | 4931/15547 [17:34<35:05,  5.04it/s]

{'loss': 0.0, 'learning_rate': 7.055891539841829e-05, 'epoch': 0.32}


 32%|███▏      | 4941/15547 [17:36<35:09,  5.03it/s]

{'loss': 0.0, 'learning_rate': 7.049245696816642e-05, 'epoch': 0.32}


 32%|███▏      | 4951/15547 [17:38<35:16,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.042599853791454e-05, 'epoch': 0.32}


 32%|███▏      | 4961/15547 [17:40<35:02,  5.03it/s]

{'loss': 0.0, 'learning_rate': 7.035954010766266e-05, 'epoch': 0.32}


 32%|███▏      | 4971/15547 [17:42<35:01,  5.03it/s]

{'loss': 0.0, 'learning_rate': 7.029308167741079e-05, 'epoch': 0.32}


 32%|███▏      | 4981/15547 [17:44<34:59,  5.03it/s]

{'loss': 0.0, 'learning_rate': 7.022662324715891e-05, 'epoch': 0.32}


 32%|███▏      | 4991/15547 [17:46<35:02,  5.02it/s]

{'loss': 0.0, 'learning_rate': 7.016016481690702e-05, 'epoch': 0.32}


 32%|███▏      | 5000/15547 [17:48<38:54,  4.52it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 7.009370638665515e-05, 'epoch': 0.32}


                                                    
 32%|███▏      | 5000/15547 [17:51<38:54,  4.52it/s]Saving model checkpoint to ./results/checkpoint-5000
Configuration saved in ./results/checkpoint-5000/config.json


{'eval_loss': 2.830009577792225e-07, 'eval_runtime': 3.0231, 'eval_samples_per_second': 826.954, 'eval_steps_per_second': 13.231, 'epoch': 0.32}


Model weights saved in ./results/checkpoint-5000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 32%|███▏      | 5011/15547 [17:54<40:48,  4.30it/s]  

{'loss': 0.0, 'learning_rate': 7.002724795640328e-05, 'epoch': 0.32}


 32%|███▏      | 5021/15547 [17:56<35:01,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.99607895261514e-05, 'epoch': 0.32}


 32%|███▏      | 5031/15547 [17:58<35:08,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.989433109589952e-05, 'epoch': 0.32}


 32%|███▏      | 5041/15547 [18:00<35:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.982787266564764e-05, 'epoch': 0.32}


 32%|███▏      | 5051/15547 [18:02<34:42,  5.04it/s]

{'loss': 0.0, 'learning_rate': 6.976141423539577e-05, 'epoch': 0.32}


 33%|███▎      | 5061/15547 [18:04<34:45,  5.03it/s]

{'loss': 0.0, 'learning_rate': 6.96949558051439e-05, 'epoch': 0.33}


 33%|███▎      | 5071/15547 [18:06<34:44,  5.03it/s]

{'loss': 0.0, 'learning_rate': 6.962849737489201e-05, 'epoch': 0.33}


 33%|███▎      | 5081/15547 [18:08<34:47,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.956203894464012e-05, 'epoch': 0.33}


 33%|███▎      | 5091/15547 [18:10<34:34,  5.04it/s]

{'loss': 0.0, 'learning_rate': 6.949558051438825e-05, 'epoch': 0.33}


 33%|███▎      | 5101/15547 [18:12<38:16,  4.55it/s]

{'loss': 0.0, 'learning_rate': 6.942912208413638e-05, 'epoch': 0.33}


 33%|███▎      | 5111/15547 [18:14<34:40,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.93626636538845e-05, 'epoch': 0.33}


 33%|███▎      | 5121/15547 [18:16<34:32,  5.03it/s]

{'loss': 0.0, 'learning_rate': 6.929620522363263e-05, 'epoch': 0.33}


 33%|███▎      | 5131/15547 [18:18<34:35,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.922974679338074e-05, 'epoch': 0.33}


 33%|███▎      | 5141/15547 [18:20<34:32,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.916328836312887e-05, 'epoch': 0.33}


 33%|███▎      | 5151/15547 [18:22<34:28,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.9096829932877e-05, 'epoch': 0.33}


 33%|███▎      | 5161/15547 [18:24<34:25,  5.03it/s]

{'loss': 0.0, 'learning_rate': 6.903037150262511e-05, 'epoch': 0.33}


 33%|███▎      | 5171/15547 [18:26<34:31,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.896391307237323e-05, 'epoch': 0.33}


 33%|███▎      | 5181/15547 [18:28<34:29,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.889745464212136e-05, 'epoch': 0.33}


 33%|███▎      | 5191/15547 [18:30<34:23,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.883099621186948e-05, 'epoch': 0.33}


 33%|███▎      | 5201/15547 [18:32<37:32,  4.59it/s]

{'loss': 0.0, 'learning_rate': 6.87645377816176e-05, 'epoch': 0.33}


 34%|███▎      | 5211/15547 [18:34<34:36,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.869807935136573e-05, 'epoch': 0.34}


 34%|███▎      | 5221/15547 [18:36<34:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.863162092111384e-05, 'epoch': 0.34}


 34%|███▎      | 5231/15547 [18:38<34:15,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.856516249086197e-05, 'epoch': 0.34}


 34%|███▎      | 5241/15547 [18:40<34:20,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.84987040606101e-05, 'epoch': 0.34}


 34%|███▍      | 5250/15547 [18:41<33:59,  5.05it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.843224563035821e-05, 'epoch': 0.34}


                                                    
 34%|███▍      | 5251/15547 [18:45<3:09:56,  1.11s/it]

{'eval_loss': 2.6256884666508995e-07, 'eval_runtime': 3.0175, 'eval_samples_per_second': 828.494, 'eval_steps_per_second': 13.256, 'epoch': 0.34}


 34%|███▍      | 5261/15547 [18:47<38:37,  4.44it/s]  

{'loss': 0.0, 'learning_rate': 6.836578720010633e-05, 'epoch': 0.34}


 34%|███▍      | 5271/15547 [18:49<34:25,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.829932876985447e-05, 'epoch': 0.34}


 34%|███▍      | 5281/15547 [18:51<34:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.823287033960259e-05, 'epoch': 0.34}


 34%|███▍      | 5291/15547 [18:53<34:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.81664119093507e-05, 'epoch': 0.34}


 34%|███▍      | 5301/15547 [18:55<36:39,  4.66it/s]

{'loss': 0.0, 'learning_rate': 6.809995347909883e-05, 'epoch': 0.34}


 34%|███▍      | 5311/15547 [18:57<34:14,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.803349504884694e-05, 'epoch': 0.34}


 34%|███▍      | 5321/15547 [18:59<34:23,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.796703661859507e-05, 'epoch': 0.34}


 34%|███▍      | 5331/15547 [19:01<34:17,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.790057818834319e-05, 'epoch': 0.34}


 34%|███▍      | 5341/15547 [19:03<34:16,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.783411975809132e-05, 'epoch': 0.34}


 34%|███▍      | 5351/15547 [19:05<34:12,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.776766132783943e-05, 'epoch': 0.34}


 34%|███▍      | 5361/15547 [19:07<33:59,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.770120289758756e-05, 'epoch': 0.34}


 35%|███▍      | 5371/15547 [19:09<33:52,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.763474446733569e-05, 'epoch': 0.35}


 35%|███▍      | 5381/15547 [19:11<33:52,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.75682860370838e-05, 'epoch': 0.35}


 35%|███▍      | 5391/15547 [19:13<34:14,  4.94it/s]

{'loss': 0.0, 'learning_rate': 6.750182760683192e-05, 'epoch': 0.35}


 35%|███▍      | 5401/15547 [19:15<36:54,  4.58it/s]

{'loss': 0.0, 'learning_rate': 6.743536917658006e-05, 'epoch': 0.35}


 35%|███▍      | 5411/15547 [19:17<34:08,  4.95it/s]

{'loss': 0.0, 'learning_rate': 6.736891074632818e-05, 'epoch': 0.35}


 35%|███▍      | 5421/15547 [19:19<34:00,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.730245231607629e-05, 'epoch': 0.35}


 35%|███▍      | 5431/15547 [19:21<33:58,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.723599388582442e-05, 'epoch': 0.35}


 35%|███▍      | 5441/15547 [19:23<33:43,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.716953545557255e-05, 'epoch': 0.35}


 35%|███▌      | 5451/15547 [19:25<33:44,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.710307702532066e-05, 'epoch': 0.35}


 35%|███▌      | 5461/15547 [19:27<33:33,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.703661859506879e-05, 'epoch': 0.35}


 35%|███▌      | 5471/15547 [19:29<33:41,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.69701601648169e-05, 'epoch': 0.35}


 35%|███▌      | 5481/15547 [19:31<33:31,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.690370173456502e-05, 'epoch': 0.35}


 35%|███▌      | 5491/15547 [19:33<33:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.683724330431316e-05, 'epoch': 0.35}


 35%|███▌      | 5500/15547 [19:35<36:38,  4.57it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.677078487406128e-05, 'epoch': 0.35}


                                                    
 35%|███▌      | 5500/15547 [19:38<36:38,  4.57it/s]Saving model checkpoint to ./results/checkpoint-5500
Configuration saved in ./results/checkpoint-5500/config.json


{'eval_loss': 2.450479996696231e-07, 'eval_runtime': 3.0025, 'eval_samples_per_second': 832.633, 'eval_steps_per_second': 13.322, 'epoch': 0.35}


Model weights saved in ./results/checkpoint-5500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 35%|███▌      | 5511/15547 [19:41<39:32,  4.23it/s]  

{'loss': 0.0, 'learning_rate': 6.670432644380939e-05, 'epoch': 0.35}


 36%|███▌      | 5521/15547 [19:43<33:30,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.663786801355752e-05, 'epoch': 0.36}


 36%|███▌      | 5531/15547 [19:45<33:17,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.657140958330565e-05, 'epoch': 0.36}


 36%|███▌      | 5541/15547 [19:47<33:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.650495115305376e-05, 'epoch': 0.36}


 36%|███▌      | 5551/15547 [19:49<33:09,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.64384927228019e-05, 'epoch': 0.36}


 36%|███▌      | 5561/15547 [19:51<33:17,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.637203429255001e-05, 'epoch': 0.36}


 36%|███▌      | 5571/15547 [19:53<33:08,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.630557586229814e-05, 'epoch': 0.36}


 36%|███▌      | 5581/15547 [19:55<33:10,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.623911743204627e-05, 'epoch': 0.36}


 36%|███▌      | 5591/15547 [19:57<33:27,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.617265900179438e-05, 'epoch': 0.36}


 36%|███▌      | 5601/15547 [19:59<36:02,  4.60it/s]

{'loss': 0.0, 'learning_rate': 6.61062005715425e-05, 'epoch': 0.36}


 36%|███▌      | 5611/15547 [20:01<33:19,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.603974214129062e-05, 'epoch': 0.36}


 36%|███▌      | 5621/15547 [20:03<33:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.597328371103875e-05, 'epoch': 0.36}


 36%|███▌      | 5631/15547 [20:05<33:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.590682528078687e-05, 'epoch': 0.36}


 36%|███▋      | 5641/15547 [20:07<32:59,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.5840366850535e-05, 'epoch': 0.36}


 36%|███▋      | 5651/15547 [20:09<33:06,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.577390842028311e-05, 'epoch': 0.36}


 36%|███▋      | 5661/15547 [20:11<32:53,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.570744999003124e-05, 'epoch': 0.36}


 36%|███▋      | 5671/15547 [20:13<32:48,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.564099155977937e-05, 'epoch': 0.36}


 37%|███▋      | 5681/15547 [20:15<32:49,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.557453312952748e-05, 'epoch': 0.37}


 37%|███▋      | 5691/15547 [20:17<32:47,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.55080746992756e-05, 'epoch': 0.37}


 37%|███▋      | 5701/15547 [20:19<35:16,  4.65it/s]

{'loss': 0.0, 'learning_rate': 6.544161626902373e-05, 'epoch': 0.37}


 37%|███▋      | 5711/15547 [20:21<33:02,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.537515783877185e-05, 'epoch': 0.37}


 37%|███▋      | 5721/15547 [20:23<32:49,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.530869940851997e-05, 'epoch': 0.37}


 37%|███▋      | 5731/15547 [20:25<32:44,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.52422409782681e-05, 'epoch': 0.37}


 37%|███▋      | 5741/15547 [20:27<32:37,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.517578254801621e-05, 'epoch': 0.37}


 37%|███▋      | 5750/15547 [20:29<32:20,  5.05it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.510932411776434e-05, 'epoch': 0.37}


                                                    
 37%|███▋      | 5751/15547 [20:32<3:00:41,  1.11s/it]

{'eval_loss': 2.272363985866832e-07, 'eval_runtime': 3.0134, 'eval_samples_per_second': 829.621, 'eval_steps_per_second': 13.274, 'epoch': 0.37}


 37%|███▋      | 5761/15547 [20:34<36:51,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 6.504286568751247e-05, 'epoch': 0.37}


 37%|███▋      | 5771/15547 [20:36<32:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.497640725726059e-05, 'epoch': 0.37}


 37%|███▋      | 5781/15547 [20:38<32:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.49099488270087e-05, 'epoch': 0.37}


 37%|███▋      | 5791/15547 [20:40<32:41,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.484349039675683e-05, 'epoch': 0.37}


 37%|███▋      | 5801/15547 [20:42<34:55,  4.65it/s]

{'loss': 0.0, 'learning_rate': 6.477703196650496e-05, 'epoch': 0.37}


 37%|███▋      | 5811/15547 [20:44<32:28,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.471057353625307e-05, 'epoch': 0.37}


 37%|███▋      | 5821/15547 [20:46<32:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.46441151060012e-05, 'epoch': 0.37}


 38%|███▊      | 5831/15547 [20:48<32:20,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.457765667574932e-05, 'epoch': 0.37}


 38%|███▊      | 5841/15547 [20:50<32:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.451119824549744e-05, 'epoch': 0.38}


 38%|███▊      | 5851/15547 [20:52<32:22,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.444473981524557e-05, 'epoch': 0.38}


 38%|███▊      | 5861/15547 [20:54<32:14,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.437828138499369e-05, 'epoch': 0.38}


 38%|███▊      | 5871/15547 [20:56<32:10,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.43118229547418e-05, 'epoch': 0.38}


 38%|███▊      | 5881/15547 [20:58<32:11,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.424536452448993e-05, 'epoch': 0.38}


 38%|███▊      | 5891/15547 [21:00<32:05,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.417890609423806e-05, 'epoch': 0.38}


 38%|███▊      | 5901/15547 [21:02<34:42,  4.63it/s]

{'loss': 0.0, 'learning_rate': 6.411244766398617e-05, 'epoch': 0.38}


 38%|███▊      | 5911/15547 [21:04<32:26,  4.95it/s]

{'loss': 0.0, 'learning_rate': 6.40459892337343e-05, 'epoch': 0.38}


 38%|███▊      | 5921/15547 [21:06<32:09,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.397953080348242e-05, 'epoch': 0.38}


 38%|███▊      | 5931/15547 [21:08<32:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.391307237323055e-05, 'epoch': 0.38}


 38%|███▊      | 5941/15547 [21:10<32:04,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.384661394297867e-05, 'epoch': 0.38}


 38%|███▊      | 5951/15547 [21:12<31:50,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.378015551272679e-05, 'epoch': 0.38}


 38%|███▊      | 5961/15547 [21:14<31:54,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.37136970824749e-05, 'epoch': 0.38}


 38%|███▊      | 5971/15547 [21:16<31:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.364723865222305e-05, 'epoch': 0.38}


 38%|███▊      | 5981/15547 [21:18<31:49,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.358078022197116e-05, 'epoch': 0.38}


 39%|███▊      | 5991/15547 [21:20<31:45,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.351432179171928e-05, 'epoch': 0.39}


 39%|███▊      | 6000/15547 [21:22<35:09,  4.53it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.34478633614674e-05, 'epoch': 0.39}


                                                    
 39%|███▊      | 6000/15547 [21:25<35:09,  4.53it/s]Saving model checkpoint to ./results/checkpoint-6000
Configuration saved in ./results/checkpoint-6000/config.json


{'eval_loss': 1.9675549367548228e-07, 'eval_runtime': 3.0022, 'eval_samples_per_second': 832.728, 'eval_steps_per_second': 13.324, 'epoch': 0.39}


Model weights saved in ./results/checkpoint-6000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 39%|███▊      | 6011/15547 [21:28<37:34,  4.23it/s]  

{'loss': 0.0, 'learning_rate': 6.338140493121552e-05, 'epoch': 0.39}


 39%|███▊      | 6021/15547 [21:30<31:55,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.331494650096365e-05, 'epoch': 0.39}


 39%|███▉      | 6031/15547 [21:32<31:47,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.324848807071178e-05, 'epoch': 0.39}


 39%|███▉      | 6041/15547 [21:34<31:35,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.318202964045989e-05, 'epoch': 0.39}


 39%|███▉      | 6051/15547 [21:36<31:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.311557121020801e-05, 'epoch': 0.39}


 39%|███▉      | 6061/15547 [21:38<31:33,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.304911277995615e-05, 'epoch': 0.39}


 39%|███▉      | 6071/15547 [21:40<31:34,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.298265434970426e-05, 'epoch': 0.39}


 39%|███▉      | 6081/15547 [21:42<31:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.291619591945238e-05, 'epoch': 0.39}


 39%|███▉      | 6091/15547 [21:44<31:38,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.284973748920051e-05, 'epoch': 0.39}


 39%|███▉      | 6101/15547 [21:46<33:59,  4.63it/s]

{'loss': 0.0, 'learning_rate': 6.278327905894864e-05, 'epoch': 0.39}


 39%|███▉      | 6111/15547 [21:48<31:34,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.271682062869675e-05, 'epoch': 0.39}


 39%|███▉      | 6121/15547 [21:50<31:20,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.265036219844488e-05, 'epoch': 0.39}


 39%|███▉      | 6131/15547 [21:52<31:15,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.2583903768193e-05, 'epoch': 0.39}


 39%|███▉      | 6141/15547 [21:54<31:20,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.251744533794111e-05, 'epoch': 0.39}


 40%|███▉      | 6151/15547 [21:56<31:37,  4.95it/s]

{'loss': 0.0, 'learning_rate': 6.245098690768925e-05, 'epoch': 0.4}


 40%|███▉      | 6161/15547 [21:58<31:17,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.238452847743737e-05, 'epoch': 0.4}


 40%|███▉      | 6171/15547 [22:00<31:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.231807004718548e-05, 'epoch': 0.4}


 40%|███▉      | 6181/15547 [22:02<31:19,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.225161161693361e-05, 'epoch': 0.4}


 40%|███▉      | 6191/15547 [22:04<31:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.218515318668174e-05, 'epoch': 0.4}


 40%|███▉      | 6201/15547 [22:06<33:16,  4.68it/s]

{'loss': 0.0, 'learning_rate': 6.211869475642985e-05, 'epoch': 0.4}


 40%|███▉      | 6211/15547 [22:08<31:07,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.205223632617798e-05, 'epoch': 0.4}


 40%|████      | 6221/15547 [22:10<30:58,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.19857778959261e-05, 'epoch': 0.4}


 40%|████      | 6231/15547 [22:12<31:17,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.191931946567423e-05, 'epoch': 0.4}


 40%|████      | 6241/15547 [22:14<31:12,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.185286103542235e-05, 'epoch': 0.4}


 40%|████      | 6250/15547 [22:16<30:36,  5.06it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.178640260517047e-05, 'epoch': 0.4}


                                                    
 40%|████      | 6251/15547 [22:19<2:50:52,  1.10s/it]

{'eval_loss': 1.8156548264869343e-07, 'eval_runtime': 3.0056, 'eval_samples_per_second': 831.788, 'eval_steps_per_second': 13.309, 'epoch': 0.4}


 40%|████      | 6261/15547 [22:21<34:51,  4.44it/s]  

{'loss': 0.0, 'learning_rate': 6.171994417491858e-05, 'epoch': 0.4}


 40%|████      | 6271/15547 [22:23<31:08,  4.96it/s]

{'loss': 0.0, 'learning_rate': 6.165348574466671e-05, 'epoch': 0.4}


 40%|████      | 6281/15547 [22:25<30:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.158702731441484e-05, 'epoch': 0.4}


 40%|████      | 6291/15547 [22:27<30:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.152056888416296e-05, 'epoch': 0.4}


 41%|████      | 6301/15547 [22:29<34:27,  4.47it/s]

{'loss': 0.0, 'learning_rate': 6.145411045391108e-05, 'epoch': 0.41}


 41%|████      | 6311/15547 [22:31<30:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.13876520236592e-05, 'epoch': 0.41}


 41%|████      | 6321/15547 [22:33<30:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.132119359340733e-05, 'epoch': 0.41}


 41%|████      | 6331/15547 [22:35<30:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.125473516315546e-05, 'epoch': 0.41}


 41%|████      | 6341/15547 [22:37<30:42,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.118827673290357e-05, 'epoch': 0.41}


 41%|████      | 6351/15547 [22:39<30:45,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.112181830265169e-05, 'epoch': 0.41}


 41%|████      | 6361/15547 [22:41<30:35,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.105535987239981e-05, 'epoch': 0.41}


 41%|████      | 6371/15547 [22:43<30:32,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.0988901442147937e-05, 'epoch': 0.41}


 41%|████      | 6381/15547 [22:45<30:29,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.092244301189606e-05, 'epoch': 0.41}


 41%|████      | 6391/15547 [22:47<30:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.085598458164419e-05, 'epoch': 0.41}


 41%|████      | 6401/15547 [22:49<33:32,  4.54it/s]

{'loss': 0.0, 'learning_rate': 6.078952615139231e-05, 'epoch': 0.41}


 41%|████      | 6411/15547 [22:51<30:39,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.0723067721140424e-05, 'epoch': 0.41}


 41%|████▏     | 6421/15547 [22:53<30:32,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.065660929088855e-05, 'epoch': 0.41}


 41%|████▏     | 6431/15547 [22:55<30:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.0590150860636674e-05, 'epoch': 0.41}


 41%|████▏     | 6441/15547 [22:57<30:15,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.0523692430384795e-05, 'epoch': 0.41}


 41%|████▏     | 6451/15547 [22:59<30:10,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.0457234000132924e-05, 'epoch': 0.41}


 42%|████▏     | 6461/15547 [23:01<30:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.039077556988104e-05, 'epoch': 0.42}


 42%|████▏     | 6471/15547 [23:03<30:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.032431713962916e-05, 'epoch': 0.42}


 42%|████▏     | 6481/15547 [23:05<30:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.025785870937729e-05, 'epoch': 0.42}


 42%|████▏     | 6491/15547 [23:07<30:14,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.019140027912541e-05, 'epoch': 0.42}


 42%|████▏     | 6500/15547 [23:09<33:40,  4.48it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.0124941848873526e-05, 'epoch': 0.42}


                                                    
 42%|████▏     | 6500/15547 [23:12<33:40,  4.48it/s]Saving model checkpoint to ./results/checkpoint-6500
Configuration saved in ./results/checkpoint-6500/config.json


{'eval_loss': 1.7502181037798437e-07, 'eval_runtime': 3.0037, 'eval_samples_per_second': 832.304, 'eval_steps_per_second': 13.317, 'epoch': 0.42}


Model weights saved in ./results/checkpoint-6500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 42%|████▏     | 6511/15547 [23:15<35:26,  4.25it/s]  

{'loss': 0.0, 'learning_rate': 6.005848341862166e-05, 'epoch': 0.42}


 42%|████▏     | 6521/15547 [23:17<30:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.9992024988369776e-05, 'epoch': 0.42}


 42%|████▏     | 6531/15547 [23:19<29:49,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.99255665581179e-05, 'epoch': 0.42}


 42%|████▏     | 6541/15547 [23:21<29:59,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.9859108127866026e-05, 'epoch': 0.42}


 42%|████▏     | 6551/15547 [23:23<29:54,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.979264969761415e-05, 'epoch': 0.42}


 42%|████▏     | 6561/15547 [23:25<29:50,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.972619126736226e-05, 'epoch': 0.42}


 42%|████▏     | 6571/15547 [23:27<29:48,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.965973283711039e-05, 'epoch': 0.42}


 42%|████▏     | 6581/15547 [23:29<29:44,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.959327440685851e-05, 'epoch': 0.42}


 42%|████▏     | 6591/15547 [23:31<29:53,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.952681597660663e-05, 'epoch': 0.42}


 42%|████▏     | 6601/15547 [23:33<32:34,  4.58it/s]

{'loss': 0.0, 'learning_rate': 5.9460357546354764e-05, 'epoch': 0.42}


 43%|████▎     | 6611/15547 [23:35<29:52,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.939389911610288e-05, 'epoch': 0.43}


 43%|████▎     | 6621/15547 [23:37<29:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.9327440685851e-05, 'epoch': 0.43}


 43%|████▎     | 6631/15547 [23:39<29:40,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.926098225559913e-05, 'epoch': 0.43}


 43%|████▎     | 6641/15547 [23:41<29:31,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.919452382534725e-05, 'epoch': 0.43}


 43%|████▎     | 6651/15547 [23:43<29:30,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.9128065395095365e-05, 'epoch': 0.43}


 43%|████▎     | 6661/15547 [23:45<29:37,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.9061606964843494e-05, 'epoch': 0.43}


 43%|████▎     | 6671/15547 [23:47<29:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.8995148534591616e-05, 'epoch': 0.43}


 43%|████▎     | 6681/15547 [23:49<29:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.892869010433974e-05, 'epoch': 0.43}


 43%|████▎     | 6691/15547 [23:51<29:28,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.8862231674087866e-05, 'epoch': 0.43}


 43%|████▎     | 6701/15547 [23:53<31:30,  4.68it/s]

{'loss': 0.0, 'learning_rate': 5.879577324383598e-05, 'epoch': 0.43}


 43%|████▎     | 6711/15547 [23:55<29:19,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.87293148135841e-05, 'epoch': 0.43}


 43%|████▎     | 6721/15547 [23:57<29:10,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.866285638333223e-05, 'epoch': 0.43}


 43%|████▎     | 6731/15547 [23:59<29:09,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.859639795308035e-05, 'epoch': 0.43}


 43%|████▎     | 6741/15547 [24:01<29:41,  4.94it/s]

{'loss': 0.0, 'learning_rate': 5.852993952282847e-05, 'epoch': 0.43}


 43%|████▎     | 6750/15547 [24:03<29:08,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.8463481092576596e-05, 'epoch': 0.43}


                                                    
 43%|████▎     | 6751/15547 [24:06<2:41:26,  1.10s/it]

{'eval_loss': 1.7128766671703488e-07, 'eval_runtime': 3.0039, 'eval_samples_per_second': 832.247, 'eval_steps_per_second': 13.316, 'epoch': 0.43}


 43%|████▎     | 6761/15547 [24:08<32:51,  4.46it/s]  

{'loss': 0.0, 'learning_rate': 5.839702266232472e-05, 'epoch': 0.43}


 44%|████▎     | 6771/15547 [24:10<29:34,  4.94it/s]

{'loss': 0.0, 'learning_rate': 5.833056423207284e-05, 'epoch': 0.44}


 44%|████▎     | 6781/15547 [24:12<29:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.826410580182097e-05, 'epoch': 0.44}


 44%|████▎     | 6791/15547 [24:14<29:17,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.819764737156908e-05, 'epoch': 0.44}


 44%|████▎     | 6801/15547 [24:16<31:37,  4.61it/s]

{'loss': 0.0, 'learning_rate': 5.8131188941317205e-05, 'epoch': 0.44}


 44%|████▍     | 6811/15547 [24:18<29:25,  4.95it/s]

{'loss': 0.0, 'learning_rate': 5.8064730511065333e-05, 'epoch': 0.44}


 44%|████▍     | 6821/15547 [24:20<29:17,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.7998272080813455e-05, 'epoch': 0.44}


 44%|████▍     | 6831/15547 [24:22<29:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.793181365056157e-05, 'epoch': 0.44}


 44%|████▍     | 6841/15547 [24:24<29:10,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.7865355220309705e-05, 'epoch': 0.44}


 44%|████▍     | 6851/15547 [24:26<28:58,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.779889679005782e-05, 'epoch': 0.44}


 44%|████▍     | 6861/15547 [24:28<28:51,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.773243835980594e-05, 'epoch': 0.44}


 44%|████▍     | 6871/15547 [24:30<28:49,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.766597992955407e-05, 'epoch': 0.44}


 44%|████▍     | 6881/15547 [24:32<28:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.759952149930219e-05, 'epoch': 0.44}


 44%|████▍     | 6891/15547 [24:34<28:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.753306306905031e-05, 'epoch': 0.44}


 44%|████▍     | 6901/15547 [24:36<31:08,  4.63it/s]

{'loss': 0.0, 'learning_rate': 5.7466604638798436e-05, 'epoch': 0.44}


 44%|████▍     | 6911/15547 [24:38<28:39,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.740014620854656e-05, 'epoch': 0.44}


 45%|████▍     | 6921/15547 [24:40<28:30,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.733368777829467e-05, 'epoch': 0.45}


 45%|████▍     | 6931/15547 [24:42<28:32,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.726722934804281e-05, 'epoch': 0.45}


 45%|████▍     | 6941/15547 [24:44<28:30,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.720077091779092e-05, 'epoch': 0.45}


 45%|████▍     | 6951/15547 [24:46<28:31,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.7134312487539045e-05, 'epoch': 0.45}


 45%|████▍     | 6961/15547 [24:48<28:23,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.706785405728717e-05, 'epoch': 0.45}


 45%|████▍     | 6971/15547 [24:50<28:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.7001395627035295e-05, 'epoch': 0.45}


 45%|████▍     | 6981/15547 [24:52<28:24,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.693493719678341e-05, 'epoch': 0.45}


 45%|████▍     | 6991/15547 [24:54<28:20,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.686847876653154e-05, 'epoch': 0.45}


 45%|████▌     | 7000/15547 [24:56<31:42,  4.49it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.680202033627966e-05, 'epoch': 0.45}


                                                    
 45%|████▌     | 7000/15547 [24:59<31:42,  4.49it/s]Saving model checkpoint to ./results/checkpoint-7000
Configuration saved in ./results/checkpoint-7000/config.json


{'eval_loss': 1.5987514245807688e-07, 'eval_runtime': 3.0166, 'eval_samples_per_second': 828.736, 'eval_steps_per_second': 13.26, 'epoch': 0.45}


Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 45%|████▌     | 7011/15547 [25:02<33:16,  4.28it/s]  

{'loss': 0.0, 'learning_rate': 5.673556190602778e-05, 'epoch': 0.45}


 45%|████▌     | 7021/15547 [25:04<28:32,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.666910347577591e-05, 'epoch': 0.45}


 45%|████▌     | 7031/15547 [25:06<28:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.6602645045524025e-05, 'epoch': 0.45}


 45%|████▌     | 7041/15547 [25:08<28:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.653618661527215e-05, 'epoch': 0.45}


 45%|████▌     | 7051/15547 [25:10<28:29,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.6469728185020275e-05, 'epoch': 0.45}


 45%|████▌     | 7061/15547 [25:12<28:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.64032697547684e-05, 'epoch': 0.45}


 45%|████▌     | 7071/15547 [25:14<28:12,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.633681132451651e-05, 'epoch': 0.45}


 46%|████▌     | 7081/15547 [25:16<28:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.627035289426465e-05, 'epoch': 0.46}


 46%|████▌     | 7091/15547 [25:18<28:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.620389446401276e-05, 'epoch': 0.46}


 46%|████▌     | 7101/15547 [25:20<30:43,  4.58it/s]

{'loss': 0.0, 'learning_rate': 5.6137436033760884e-05, 'epoch': 0.46}


 46%|████▌     | 7111/15547 [25:22<28:18,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.607097760350901e-05, 'epoch': 0.46}


 46%|████▌     | 7121/15547 [25:24<27:59,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.600451917325713e-05, 'epoch': 0.46}


 46%|████▌     | 7131/15547 [25:26<28:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.593806074300525e-05, 'epoch': 0.46}


 46%|████▌     | 7141/15547 [25:28<28:01,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.587160231275338e-05, 'epoch': 0.46}


 46%|████▌     | 7151/15547 [25:30<27:51,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.58051438825015e-05, 'epoch': 0.46}


 46%|████▌     | 7161/15547 [25:32<27:48,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.5738685452249614e-05, 'epoch': 0.46}


 46%|████▌     | 7171/15547 [25:34<27:52,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.567222702199775e-05, 'epoch': 0.46}


 46%|████▌     | 7181/15547 [25:36<28:03,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.5605768591745865e-05, 'epoch': 0.46}


 46%|████▋     | 7191/15547 [25:38<27:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.5539310161493986e-05, 'epoch': 0.46}


 46%|████▋     | 7201/15547 [25:40<30:38,  4.54it/s]

{'loss': 0.0, 'learning_rate': 5.5472851731242115e-05, 'epoch': 0.46}


 46%|████▋     | 7211/15547 [25:42<27:52,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.540639330099024e-05, 'epoch': 0.46}


 46%|████▋     | 7221/15547 [25:44<27:49,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.533993487073835e-05, 'epoch': 0.46}


 47%|████▋     | 7231/15547 [25:46<27:45,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.527347644048648e-05, 'epoch': 0.47}


 47%|████▋     | 7241/15547 [25:48<27:48,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.52070180102346e-05, 'epoch': 0.47}


 47%|████▋     | 7250/15547 [25:50<27:24,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.514055957998272e-05, 'epoch': 0.47}


                                                    
 47%|████▋     | 7251/15547 [25:53<2:32:45,  1.10s/it]

{'eval_loss': 1.471959762966435e-07, 'eval_runtime': 3.0077, 'eval_samples_per_second': 831.213, 'eval_steps_per_second': 13.299, 'epoch': 0.47}


 47%|████▋     | 7261/15547 [25:55<31:12,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 5.507410114973085e-05, 'epoch': 0.47}


 47%|████▋     | 7271/15547 [25:57<27:36,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.500764271947897e-05, 'epoch': 0.47}


 47%|████▋     | 7281/15547 [25:59<27:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.494118428922709e-05, 'epoch': 0.47}


 47%|████▋     | 7291/15547 [26:01<27:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.487472585897522e-05, 'epoch': 0.47}


 47%|████▋     | 7301/15547 [26:03<29:48,  4.61it/s]

{'loss': 0.0, 'learning_rate': 5.480826742872334e-05, 'epoch': 0.47}


 47%|████▋     | 7311/15547 [26:05<27:38,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.4741808998471454e-05, 'epoch': 0.47}


 47%|████▋     | 7321/15547 [26:07<27:24,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.467535056821958e-05, 'epoch': 0.47}


 47%|████▋     | 7331/15547 [26:09<27:24,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.4608892137967704e-05, 'epoch': 0.47}


 47%|████▋     | 7341/15547 [26:11<27:09,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.4542433707715826e-05, 'epoch': 0.47}


 47%|████▋     | 7351/15547 [26:13<27:13,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.4475975277463955e-05, 'epoch': 0.47}


 47%|████▋     | 7361/15547 [26:15<27:19,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.440951684721207e-05, 'epoch': 0.47}


 47%|████▋     | 7371/15547 [26:17<27:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.434305841696019e-05, 'epoch': 0.47}


 47%|████▋     | 7381/15547 [26:19<27:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.427659998670832e-05, 'epoch': 0.47}


 48%|████▊     | 7391/15547 [26:21<27:09,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.421014155645644e-05, 'epoch': 0.48}


 48%|████▊     | 7401/15547 [26:23<29:07,  4.66it/s]

{'loss': 0.0, 'learning_rate': 5.4143683126204556e-05, 'epoch': 0.48}


 48%|████▊     | 7411/15547 [26:25<27:29,  4.93it/s]

{'loss': 0.0, 'learning_rate': 5.407722469595269e-05, 'epoch': 0.48}


 48%|████▊     | 7421/15547 [26:27<27:00,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.401076626570081e-05, 'epoch': 0.48}


 48%|████▊     | 7431/15547 [26:29<27:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.394430783544893e-05, 'epoch': 0.48}


 48%|████▊     | 7441/15547 [26:31<27:03,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.387784940519706e-05, 'epoch': 0.48}


 48%|████▊     | 7451/15547 [26:33<27:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.381139097494517e-05, 'epoch': 0.48}


 48%|████▊     | 7461/15547 [26:35<26:56,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.3744932544693294e-05, 'epoch': 0.48}


 48%|████▊     | 7471/15547 [26:37<26:55,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.367847411444142e-05, 'epoch': 0.48}


 48%|████▊     | 7481/15547 [26:39<26:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.3612015684189544e-05, 'epoch': 0.48}


 48%|████▊     | 7491/15547 [26:41<26:52,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.354555725393766e-05, 'epoch': 0.48}


 48%|████▊     | 7500/15547 [26:43<29:59,  4.47it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.3479098823685794e-05, 'epoch': 0.48}


                                                    
 48%|████▊     | 7500/15547 [26:46<29:59,  4.47it/s]Saving model checkpoint to ./results/checkpoint-7500
Configuration saved in ./results/checkpoint-7500/config.json


{'eval_loss': 1.3147651145573036e-07, 'eval_runtime': 3.0079, 'eval_samples_per_second': 831.138, 'eval_steps_per_second': 13.298, 'epoch': 0.48}


Model weights saved in ./results/checkpoint-7500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 48%|████▊     | 7511/15547 [26:49<31:28,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 5.341264039343391e-05, 'epoch': 0.48}


 48%|████▊     | 7521/15547 [26:51<26:52,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.334618196318203e-05, 'epoch': 0.48}


 48%|████▊     | 7531/15547 [26:53<26:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.327972353293016e-05, 'epoch': 0.48}


 49%|████▊     | 7541/15547 [26:55<26:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.321326510267828e-05, 'epoch': 0.48}


 49%|████▊     | 7551/15547 [26:57<26:36,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.3146806672426396e-05, 'epoch': 0.49}


 49%|████▊     | 7561/15547 [26:59<26:30,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.3080348242174524e-05, 'epoch': 0.49}


 49%|████▊     | 7571/15547 [27:01<26:27,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.3013889811922646e-05, 'epoch': 0.49}


 49%|████▉     | 7581/15547 [27:03<26:29,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.294743138167076e-05, 'epoch': 0.49}


 49%|████▉     | 7591/15547 [27:05<26:29,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.2880972951418896e-05, 'epoch': 0.49}


 49%|████▉     | 7601/15547 [27:07<28:31,  4.64it/s]

{'loss': 0.0, 'learning_rate': 5.281451452116701e-05, 'epoch': 0.49}


 49%|████▉     | 7611/15547 [27:09<26:36,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.274805609091513e-05, 'epoch': 0.49}


 49%|████▉     | 7621/15547 [27:11<26:23,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.268159766066326e-05, 'epoch': 0.49}


 49%|████▉     | 7631/15547 [27:13<26:21,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.2615139230411383e-05, 'epoch': 0.49}


 49%|████▉     | 7641/15547 [27:15<26:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.25486808001595e-05, 'epoch': 0.49}


 49%|████▉     | 7651/15547 [27:17<26:16,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.248222236990763e-05, 'epoch': 0.49}


 49%|████▉     | 7661/15547 [27:19<26:15,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.241576393965575e-05, 'epoch': 0.49}


 49%|████▉     | 7671/15547 [27:21<26:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.234930550940387e-05, 'epoch': 0.49}


 49%|████▉     | 7681/15547 [27:23<26:11,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.2282847079152e-05, 'epoch': 0.49}


 49%|████▉     | 7691/15547 [27:25<26:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.2216388648900114e-05, 'epoch': 0.49}


 50%|████▉     | 7701/15547 [27:27<28:04,  4.66it/s]

{'loss': 0.0, 'learning_rate': 5.2149930218648236e-05, 'epoch': 0.5}


 50%|████▉     | 7711/15547 [27:29<26:07,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.2083471788396364e-05, 'epoch': 0.5}


 50%|████▉     | 7721/15547 [27:31<25:58,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.2017013358144486e-05, 'epoch': 0.5}


 50%|████▉     | 7731/15547 [27:33<25:58,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.19505549278926e-05, 'epoch': 0.5}


 50%|████▉     | 7741/15547 [27:35<25:53,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.1884096497640736e-05, 'epoch': 0.5}


 50%|████▉     | 7750/15547 [27:37<25:41,  5.06it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.181763806738885e-05, 'epoch': 0.5}


                                                    
 50%|████▉     | 7751/15547 [27:40<2:23:24,  1.10s/it]

{'eval_loss': 1.189211857877126e-07, 'eval_runtime': 3.0057, 'eval_samples_per_second': 831.74, 'eval_steps_per_second': 13.308, 'epoch': 0.5}


 50%|████▉     | 7761/15547 [27:42<29:08,  4.45it/s]  

{'loss': 0.0, 'learning_rate': 5.175117963713697e-05, 'epoch': 0.5}


 50%|████▉     | 7771/15547 [27:44<25:51,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.16847212068851e-05, 'epoch': 0.5}


 50%|█████     | 7781/15547 [27:46<25:37,  5.05it/s]

{'loss': 0.0, 'learning_rate': 5.1618262776633216e-05, 'epoch': 0.5}


 50%|█████     | 7791/15547 [27:48<25:43,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.155180434638134e-05, 'epoch': 0.5}


 50%|█████     | 7801/15547 [27:50<27:31,  4.69it/s]

{'loss': 0.0, 'learning_rate': 5.1485345916129466e-05, 'epoch': 0.5}


 50%|█████     | 7811/15547 [27:52<25:46,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.141888748587759e-05, 'epoch': 0.5}


 50%|█████     | 7821/15547 [27:54<25:39,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.13524290556257e-05, 'epoch': 0.5}


 50%|█████     | 7831/15547 [27:56<25:38,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.128597062537384e-05, 'epoch': 0.5}


 50%|█████     | 7841/15547 [27:58<25:32,  5.03it/s]

{'loss': 0.0, 'learning_rate': 5.121951219512195e-05, 'epoch': 0.5}


 50%|█████     | 7851/15547 [28:00<25:52,  4.96it/s]

{'loss': 0.0, 'learning_rate': 5.1153053764870075e-05, 'epoch': 0.5}


 51%|█████     | 7861/15547 [28:02<25:47,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.1086595334618204e-05, 'epoch': 0.51}


 51%|█████     | 7871/15547 [28:04<25:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.1020136904366325e-05, 'epoch': 0.51}


 51%|█████     | 7881/15547 [28:06<25:46,  4.96it/s]

{'loss': 0.0, 'learning_rate': 5.095367847411444e-05, 'epoch': 0.51}


 51%|█████     | 7891/15547 [28:08<25:40,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.088722004386257e-05, 'epoch': 0.51}


 51%|█████     | 7901/15547 [28:10<28:03,  4.54it/s]

{'loss': 0.0, 'learning_rate': 5.082076161361069e-05, 'epoch': 0.51}


 51%|█████     | 7911/15547 [28:12<25:43,  4.95it/s]

{'loss': 0.0, 'learning_rate': 5.075430318335881e-05, 'epoch': 0.51}


 51%|█████     | 7921/15547 [28:14<25:36,  4.96it/s]

{'loss': 0.0, 'learning_rate': 5.068784475310694e-05, 'epoch': 0.51}


 51%|█████     | 7931/15547 [28:16<25:34,  4.96it/s]

{'loss': 0.0, 'learning_rate': 5.0621386322855056e-05, 'epoch': 0.51}


 51%|█████     | 7941/15547 [28:18<25:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.055492789260318e-05, 'epoch': 0.51}


 51%|█████     | 7951/15547 [28:20<25:21,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.0488469462351306e-05, 'epoch': 0.51}


 51%|█████     | 7961/15547 [28:22<25:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.042201103209943e-05, 'epoch': 0.51}


 51%|█████▏    | 7971/15547 [28:24<25:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.035555260184754e-05, 'epoch': 0.51}


 51%|█████▏    | 7981/15547 [28:26<25:08,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.028909417159567e-05, 'epoch': 0.51}


 51%|█████▏    | 7991/15547 [28:28<25:10,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.022263574134379e-05, 'epoch': 0.51}


 51%|█████▏    | 8000/15547 [28:30<28:33,  4.40it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.0156177311091915e-05, 'epoch': 0.51}


                                                    
 51%|█████▏    | 8000/15547 [28:33<28:33,  4.40it/s]Saving model checkpoint to ./results/checkpoint-8000
Configuration saved in ./results/checkpoint-8000/config.json


{'eval_loss': 9.899881092678697e-08, 'eval_runtime': 3.0034, 'eval_samples_per_second': 832.4, 'eval_steps_per_second': 13.318, 'epoch': 0.51}


Model weights saved in ./results/checkpoint-8000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 52%|█████▏    | 8011/15547 [28:36<29:30,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 5.008971888084004e-05, 'epoch': 0.52}


 52%|█████▏    | 8021/15547 [28:38<25:13,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.002326045058816e-05, 'epoch': 0.52}


 52%|█████▏    | 8031/15547 [28:40<25:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.995680202033628e-05, 'epoch': 0.52}


 52%|█████▏    | 8041/15547 [28:42<25:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.98903435900844e-05, 'epoch': 0.52}


 52%|█████▏    | 8051/15547 [28:44<25:00,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.982388515983253e-05, 'epoch': 0.52}


 52%|█████▏    | 8061/15547 [28:46<24:56,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.9757426729580645e-05, 'epoch': 0.52}


 52%|█████▏    | 8071/15547 [28:48<24:54,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.9690968299328774e-05, 'epoch': 0.52}


 52%|█████▏    | 8081/15547 [28:50<24:50,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.9624509869076895e-05, 'epoch': 0.52}


 52%|█████▏    | 8091/15547 [28:52<24:44,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.955805143882502e-05, 'epoch': 0.52}


 52%|█████▏    | 8101/15547 [28:54<27:11,  4.57it/s]

{'loss': 0.0, 'learning_rate': 4.949159300857314e-05, 'epoch': 0.52}


 52%|█████▏    | 8111/15547 [28:56<24:54,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.942513457832126e-05, 'epoch': 0.52}


 52%|█████▏    | 8121/15547 [28:58<24:44,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.935867614806938e-05, 'epoch': 0.52}


 52%|█████▏    | 8131/15547 [29:00<24:44,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.9292217717817504e-05, 'epoch': 0.52}


 52%|█████▏    | 8141/15547 [29:02<24:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.922575928756563e-05, 'epoch': 0.52}


 52%|█████▏    | 8151/15547 [29:04<24:38,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.915930085731375e-05, 'epoch': 0.52}


 52%|█████▏    | 8161/15547 [29:06<24:32,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.9092842427061876e-05, 'epoch': 0.52}


 53%|█████▎    | 8171/15547 [29:08<24:35,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.902638399681e-05, 'epoch': 0.53}


 53%|█████▎    | 8181/15547 [29:10<24:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.895992556655812e-05, 'epoch': 0.53}


 53%|█████▎    | 8191/15547 [29:12<24:29,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.889346713630624e-05, 'epoch': 0.53}


 53%|█████▎    | 8201/15547 [29:14<26:41,  4.59it/s]

{'loss': 0.0, 'learning_rate': 4.882700870605437e-05, 'epoch': 0.53}


 53%|█████▎    | 8211/15547 [29:16<24:31,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.8760550275802485e-05, 'epoch': 0.53}


 53%|█████▎    | 8221/15547 [29:18<24:21,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.869409184555061e-05, 'epoch': 0.53}


 53%|█████▎    | 8231/15547 [29:20<24:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.8627633415298735e-05, 'epoch': 0.53}


 53%|█████▎    | 8241/15547 [29:22<24:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.8561174985046857e-05, 'epoch': 0.53}


 53%|█████▎    | 8250/15547 [29:23<24:12,  5.02it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.849471655479498e-05, 'epoch': 0.53}


                                                    
 53%|█████▎    | 8251/15547 [29:27<2:14:04,  1.10s/it]

{'eval_loss': 6.741712610391914e-08, 'eval_runtime': 2.9983, 'eval_samples_per_second': 833.798, 'eval_steps_per_second': 13.341, 'epoch': 0.53}


 53%|█████▎    | 8261/15547 [29:29<27:24,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 4.84282581245431e-05, 'epoch': 0.53}


 53%|█████▎    | 8271/15547 [29:31<24:22,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.836179969429122e-05, 'epoch': 0.53}


 53%|█████▎    | 8281/15547 [29:33<24:06,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.8295341264039343e-05, 'epoch': 0.53}


 53%|█████▎    | 8291/15547 [29:35<24:14,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.822888283378747e-05, 'epoch': 0.53}


 53%|█████▎    | 8301/15547 [29:37<26:28,  4.56it/s]

{'loss': 0.0, 'learning_rate': 4.816242440353559e-05, 'epoch': 0.53}


 53%|█████▎    | 8311/15547 [29:39<24:17,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.8095965973283715e-05, 'epoch': 0.53}


 54%|█████▎    | 8321/15547 [29:41<24:00,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.802950754303184e-05, 'epoch': 0.54}


 54%|█████▎    | 8331/15547 [29:43<24:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.796304911277996e-05, 'epoch': 0.54}


 54%|█████▎    | 8341/15547 [29:45<24:04,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.789659068252808e-05, 'epoch': 0.54}


 54%|█████▎    | 8351/15547 [29:47<23:56,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.78301322522762e-05, 'epoch': 0.54}


 54%|█████▍    | 8361/15547 [29:49<23:49,  5.03it/s]

{'loss': 0.0, 'learning_rate': 4.7763673822024324e-05, 'epoch': 0.54}


 54%|█████▍    | 8371/15547 [29:51<23:59,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.7697215391772446e-05, 'epoch': 0.54}


 54%|█████▍    | 8381/15547 [29:53<23:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.7630756961520574e-05, 'epoch': 0.54}


 54%|█████▍    | 8391/15547 [29:55<23:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.756429853126869e-05, 'epoch': 0.54}


 54%|█████▍    | 8401/15547 [29:57<25:36,  4.65it/s]

{'loss': 0.0, 'learning_rate': 4.749784010101682e-05, 'epoch': 0.54}


 54%|█████▍    | 8411/15547 [29:59<23:44,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.743138167076494e-05, 'epoch': 0.54}


 54%|█████▍    | 8421/15547 [30:01<23:50,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.736492324051306e-05, 'epoch': 0.54}


 54%|█████▍    | 8431/15547 [30:03<23:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.729846481026118e-05, 'epoch': 0.54}


 54%|█████▍    | 8441/15547 [30:05<23:37,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.723200638000931e-05, 'epoch': 0.54}


 54%|█████▍    | 8451/15547 [30:07<23:43,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.7165547949757427e-05, 'epoch': 0.54}


 54%|█████▍    | 8461/15547 [30:09<23:37,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.709908951950555e-05, 'epoch': 0.54}


 54%|█████▍    | 8471/15547 [30:11<23:33,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.703263108925368e-05, 'epoch': 0.54}


 55%|█████▍    | 8481/15547 [30:13<23:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.696617265900179e-05, 'epoch': 0.55}


 55%|█████▍    | 8491/15547 [30:15<23:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.689971422874992e-05, 'epoch': 0.55}


 55%|█████▍    | 8500/15547 [30:16<26:15,  4.47it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.683325579849804e-05, 'epoch': 0.55}


                                                    
 55%|█████▍    | 8500/15547 [30:19<26:15,  4.47it/s]Saving model checkpoint to ./results/checkpoint-8500
Configuration saved in ./results/checkpoint-8500/config.json


{'eval_loss': 4.1482358881239634e-08, 'eval_runtime': 3.0161, 'eval_samples_per_second': 828.89, 'eval_steps_per_second': 13.262, 'epoch': 0.55}


Model weights saved in ./results/checkpoint-8500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 55%|█████▍    | 8511/15547 [30:23<27:30,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 4.6766797368246164e-05, 'epoch': 0.55}


 55%|█████▍    | 8521/15547 [30:25<23:29,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.6700338937994285e-05, 'epoch': 0.55}


 55%|█████▍    | 8531/15547 [30:27<23:17,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.6633880507742414e-05, 'epoch': 0.55}


 55%|█████▍    | 8541/15547 [30:29<23:30,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.656742207749053e-05, 'epoch': 0.55}


 55%|█████▌    | 8551/15547 [30:31<23:21,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.650096364723866e-05, 'epoch': 0.55}


 55%|█████▌    | 8561/15547 [30:33<23:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.643450521698678e-05, 'epoch': 0.55}


 55%|█████▌    | 8571/15547 [30:35<23:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.63680467867349e-05, 'epoch': 0.55}


 55%|█████▌    | 8581/15547 [30:37<23:18,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.630158835648302e-05, 'epoch': 0.55}


 55%|█████▌    | 8591/15547 [30:39<23:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.6235129926231144e-05, 'epoch': 0.55}


 55%|█████▌    | 8601/15547 [30:41<25:11,  4.59it/s]

{'loss': 0.0, 'learning_rate': 4.6168671495979266e-05, 'epoch': 0.55}


 55%|█████▌    | 8611/15547 [30:43<23:10,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.610221306572739e-05, 'epoch': 0.55}


 55%|█████▌    | 8621/15547 [30:45<23:10,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.6035754635475516e-05, 'epoch': 0.55}


 56%|█████▌    | 8631/15547 [30:47<23:10,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.596929620522363e-05, 'epoch': 0.56}


 56%|█████▌    | 8641/15547 [30:49<22:54,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.590283777497176e-05, 'epoch': 0.56}


 56%|█████▌    | 8651/15547 [30:51<23:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.583637934471988e-05, 'epoch': 0.56}


 56%|█████▌    | 8661/15547 [30:53<23:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.5769920914468e-05, 'epoch': 0.56}


 56%|█████▌    | 8671/15547 [30:55<22:55,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.5703462484216125e-05, 'epoch': 0.56}


 56%|█████▌    | 8681/15547 [30:57<22:59,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.563700405396425e-05, 'epoch': 0.56}


 56%|█████▌    | 8691/15547 [30:59<22:58,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.557054562371237e-05, 'epoch': 0.56}


 56%|█████▌    | 8701/15547 [31:01<24:46,  4.61it/s]

{'loss': 0.0, 'learning_rate': 4.550408719346049e-05, 'epoch': 0.56}


 56%|█████▌    | 8711/15547 [31:03<22:51,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.543762876320862e-05, 'epoch': 0.56}


 56%|█████▌    | 8721/15547 [31:05<22:45,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.5371170332956734e-05, 'epoch': 0.56}


 56%|█████▌    | 8731/15547 [31:07<22:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.530471190270486e-05, 'epoch': 0.56}


 56%|█████▌    | 8741/15547 [31:09<22:52,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.5238253472452984e-05, 'epoch': 0.56}


 56%|█████▋    | 8750/15547 [31:10<22:27,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.5171795042201106e-05, 'epoch': 0.56}


                                                    
 56%|█████▋    | 8751/15547 [31:14<2:05:04,  1.10s/it]

{'eval_loss': 1.971326923921879e-08, 'eval_runtime': 3.0053, 'eval_samples_per_second': 831.861, 'eval_steps_per_second': 13.31, 'epoch': 0.56}


 56%|█████▋    | 8761/15547 [31:16<25:34,  4.42it/s]  

{'loss': 0.0, 'learning_rate': 4.510533661194923e-05, 'epoch': 0.56}


 56%|█████▋    | 8771/15547 [31:18<22:45,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.5038878181697356e-05, 'epoch': 0.56}


 56%|█████▋    | 8781/15547 [31:20<22:31,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.497241975144547e-05, 'epoch': 0.56}


 57%|█████▋    | 8791/15547 [31:22<22:31,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.490596132119359e-05, 'epoch': 0.57}


 57%|█████▋    | 8801/15547 [31:24<24:21,  4.62it/s]

{'loss': 0.0, 'learning_rate': 4.483950289094172e-05, 'epoch': 0.57}


 57%|█████▋    | 8811/15547 [31:26<22:36,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.4773044460689836e-05, 'epoch': 0.57}


 57%|█████▋    | 8821/15547 [31:28<22:29,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.4706586030437965e-05, 'epoch': 0.57}


 57%|█████▋    | 8831/15547 [31:30<22:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.4640127600186086e-05, 'epoch': 0.57}


 57%|█████▋    | 8841/15547 [31:32<22:24,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.457366916993421e-05, 'epoch': 0.57}


 57%|█████▋    | 8851/15547 [31:34<22:15,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.450721073968233e-05, 'epoch': 0.57}


 57%|█████▋    | 8861/15547 [31:36<22:28,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.444075230943046e-05, 'epoch': 0.57}


 57%|█████▋    | 8871/15547 [31:38<22:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.437429387917857e-05, 'epoch': 0.57}


 57%|█████▋    | 8881/15547 [31:40<22:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.43078354489267e-05, 'epoch': 0.57}


 57%|█████▋    | 8891/15547 [31:42<22:18,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.4241377018674823e-05, 'epoch': 0.57}


 57%|█████▋    | 8901/15547 [31:44<24:25,  4.53it/s]

{'loss': 0.0, 'learning_rate': 4.4174918588422945e-05, 'epoch': 0.57}


 57%|█████▋    | 8911/15547 [31:46<22:13,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.410846015817107e-05, 'epoch': 0.57}


 57%|█████▋    | 8921/15547 [31:48<22:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.404200172791919e-05, 'epoch': 0.57}


 57%|█████▋    | 8931/15547 [31:50<22:09,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.397554329766731e-05, 'epoch': 0.57}


 58%|█████▊    | 8941/15547 [31:52<22:03,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.390908486741543e-05, 'epoch': 0.58}


 58%|█████▊    | 8951/15547 [31:54<21:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.384262643716356e-05, 'epoch': 0.58}


 58%|█████▊    | 8961/15547 [31:56<22:03,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.3776168006911676e-05, 'epoch': 0.58}


 58%|█████▊    | 8971/15547 [31:58<21:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.3709709576659804e-05, 'epoch': 0.58}


 58%|█████▊    | 8981/15547 [32:00<21:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.3643251146407926e-05, 'epoch': 0.58}


 58%|█████▊    | 8991/15547 [32:02<21:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.357679271615605e-05, 'epoch': 0.58}


 58%|█████▊    | 9000/15547 [32:04<24:14,  4.50it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.351033428590417e-05, 'epoch': 0.58}


                                                    
 58%|█████▊    | 9000/15547 [32:07<24:14,  4.50it/s]Saving model checkpoint to ./results/checkpoint-9000
Configuration saved in ./results/checkpoint-9000/config.json


{'eval_loss': 1.540928273868758e-08, 'eval_runtime': 3.0178, 'eval_samples_per_second': 828.413, 'eval_steps_per_second': 13.255, 'epoch': 0.58}


Model weights saved in ./results/checkpoint-9000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 58%|█████▊    | 9011/15547 [32:10<25:44,  4.23it/s]  

{'loss': 0.0, 'learning_rate': 4.344387585565229e-05, 'epoch': 0.58}


 58%|█████▊    | 9021/15547 [32:12<21:55,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.337741742540041e-05, 'epoch': 0.58}


 58%|█████▊    | 9031/15547 [32:14<21:47,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.3310958995148534e-05, 'epoch': 0.58}


 58%|█████▊    | 9041/15547 [32:16<21:45,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.324450056489666e-05, 'epoch': 0.58}


 58%|█████▊    | 9051/15547 [32:18<21:33,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.317804213464478e-05, 'epoch': 0.58}


 58%|█████▊    | 9061/15547 [32:20<21:44,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.3111583704392906e-05, 'epoch': 0.58}


 58%|█████▊    | 9071/15547 [32:22<21:38,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.304512527414103e-05, 'epoch': 0.58}


 58%|█████▊    | 9081/15547 [32:24<21:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.297866684388915e-05, 'epoch': 0.58}


 58%|█████▊    | 9091/15547 [32:26<21:32,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.291220841363727e-05, 'epoch': 0.58}


 59%|█████▊    | 9101/15547 [32:28<23:28,  4.58it/s]

{'loss': 0.0, 'learning_rate': 4.28457499833854e-05, 'epoch': 0.59}


 59%|█████▊    | 9111/15547 [32:30<21:31,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.2779291553133515e-05, 'epoch': 0.59}


 59%|█████▊    | 9121/15547 [32:32<21:28,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.2712833122881644e-05, 'epoch': 0.59}


 59%|█████▊    | 9131/15547 [32:34<21:20,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.2646374692629765e-05, 'epoch': 0.59}


 59%|█████▉    | 9141/15547 [32:36<21:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.257991626237788e-05, 'epoch': 0.59}


 59%|█████▉    | 9151/15547 [32:38<21:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.251345783212601e-05, 'epoch': 0.59}


 59%|█████▉    | 9161/15547 [32:40<21:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.244699940187413e-05, 'epoch': 0.59}


 59%|█████▉    | 9171/15547 [32:42<21:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.238054097162225e-05, 'epoch': 0.59}


 59%|█████▉    | 9181/15547 [32:44<21:12,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.2314082541370374e-05, 'epoch': 0.59}


 59%|█████▉    | 9191/15547 [32:46<21:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.22476241111185e-05, 'epoch': 0.59}


 59%|█████▉    | 9201/15547 [32:48<23:08,  4.57it/s]

{'loss': 0.0, 'learning_rate': 4.218116568086662e-05, 'epoch': 0.59}


 59%|█████▉    | 9211/15547 [32:50<21:20,  4.95it/s]

{'loss': 0.0, 'learning_rate': 4.2114707250614746e-05, 'epoch': 0.59}


 59%|█████▉    | 9221/15547 [32:52<21:09,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.204824882036287e-05, 'epoch': 0.59}


 59%|█████▉    | 9231/15547 [32:54<21:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.198179039011099e-05, 'epoch': 0.59}


 59%|█████▉    | 9241/15547 [32:56<21:02,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.191533195985911e-05, 'epoch': 0.59}


 59%|█████▉    | 9250/15547 [32:58<20:58,  5.01it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.184887352960723e-05, 'epoch': 0.59}


                                                    
 60%|█████▉    | 9251/15547 [33:01<1:56:02,  1.11s/it]

{'eval_loss': 4.241554218964438e-09, 'eval_runtime': 3.0062, 'eval_samples_per_second': 831.613, 'eval_steps_per_second': 13.306, 'epoch': 0.59}


 60%|█████▉    | 9261/15547 [33:03<23:34,  4.44it/s]  

{'loss': 0.0, 'learning_rate': 4.1782415099355355e-05, 'epoch': 0.6}


 60%|█████▉    | 9271/15547 [33:05<20:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.1715956669103476e-05, 'epoch': 0.6}


 60%|█████▉    | 9281/15547 [33:07<20:49,  5.02it/s]

{'loss': 0.0, 'learning_rate': 4.1649498238851605e-05, 'epoch': 0.6}


 60%|█████▉    | 9291/15547 [33:09<20:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.158303980859972e-05, 'epoch': 0.6}


 60%|█████▉    | 9301/15547 [33:11<22:22,  4.65it/s]

{'loss': 0.0, 'learning_rate': 4.151658137834785e-05, 'epoch': 0.6}


 60%|█████▉    | 9311/15547 [33:13<20:51,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.145012294809596e-05, 'epoch': 0.6}


 60%|█████▉    | 9321/15547 [33:15<20:47,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.138366451784409e-05, 'epoch': 0.6}


 60%|██████    | 9331/15547 [33:17<20:44,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.1317206087592214e-05, 'epoch': 0.6}


 60%|██████    | 9341/15547 [33:19<20:42,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.1250747657340335e-05, 'epoch': 0.6}


 60%|██████    | 9351/15547 [33:21<20:37,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.118428922708846e-05, 'epoch': 0.6}


 60%|██████    | 9361/15547 [33:23<20:39,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.111783079683658e-05, 'epoch': 0.6}


 60%|██████    | 9371/15547 [33:25<20:27,  5.03it/s]

{'loss': 0.0, 'learning_rate': 4.10513723665847e-05, 'epoch': 0.6}


 60%|██████    | 9381/15547 [33:27<20:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.098491393633282e-05, 'epoch': 0.6}


 60%|██████    | 9391/15547 [33:29<20:36,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.091845550608095e-05, 'epoch': 0.6}


 60%|██████    | 9401/15547 [33:31<22:08,  4.63it/s]

{'loss': 0.0, 'learning_rate': 4.0851997075829066e-05, 'epoch': 0.6}


 61%|██████    | 9411/15547 [33:33<20:39,  4.95it/s]

{'loss': 0.0, 'learning_rate': 4.0785538645577194e-05, 'epoch': 0.61}


 61%|██████    | 9421/15547 [33:35<20:23,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.0719080215325316e-05, 'epoch': 0.61}


 61%|██████    | 9431/15547 [33:37<20:20,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.065262178507344e-05, 'epoch': 0.61}


 61%|██████    | 9441/15547 [33:39<20:19,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.058616335482156e-05, 'epoch': 0.61}


 61%|██████    | 9451/15547 [33:41<20:12,  5.03it/s]

{'loss': 0.0, 'learning_rate': 4.051970492456969e-05, 'epoch': 0.61}


 61%|██████    | 9461/15547 [33:43<20:09,  5.03it/s]

{'loss': 0.0, 'learning_rate': 4.04532464943178e-05, 'epoch': 0.61}


 61%|██████    | 9471/15547 [33:45<20:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.0386788064065925e-05, 'epoch': 0.61}


 61%|██████    | 9481/15547 [33:47<20:17,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.032032963381405e-05, 'epoch': 0.61}


 61%|██████    | 9491/15547 [33:49<20:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 4.025387120356217e-05, 'epoch': 0.61}


 61%|██████    | 9500/15547 [33:51<22:39,  4.45it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 4.0187412773310297e-05, 'epoch': 0.61}


                                                    
 61%|██████    | 9500/15547 [33:54<22:39,  4.45it/s]Saving model checkpoint to ./results/checkpoint-9500
Configuration saved in ./results/checkpoint-9500/config.json


{'eval_loss': 2.129446619036912e-09, 'eval_runtime': 3.0206, 'eval_samples_per_second': 827.651, 'eval_steps_per_second': 13.242, 'epoch': 0.61}


Model weights saved in ./results/checkpoint-9500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 61%|██████    | 9511/15547 [33:57<23:39,  4.25it/s]  

{'loss': 0.0, 'learning_rate': 4.012095434305842e-05, 'epoch': 0.61}


 61%|██████    | 9521/15547 [33:59<20:13,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.005449591280654e-05, 'epoch': 0.61}


 61%|██████▏   | 9531/15547 [34:01<20:11,  4.96it/s]

{'loss': 0.0, 'learning_rate': 3.998803748255466e-05, 'epoch': 0.61}


 61%|██████▏   | 9541/15547 [34:03<19:57,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.992157905230279e-05, 'epoch': 0.61}


 61%|██████▏   | 9551/15547 [34:05<19:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.9855120622050905e-05, 'epoch': 0.61}


 61%|██████▏   | 9561/15547 [34:07<19:57,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.9788662191799034e-05, 'epoch': 0.61}


 62%|██████▏   | 9571/15547 [34:09<19:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.9722203761547156e-05, 'epoch': 0.62}


 62%|██████▏   | 9581/15547 [34:11<19:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.965574533129528e-05, 'epoch': 0.62}


 62%|██████▏   | 9591/15547 [34:13<19:56,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.95892869010434e-05, 'epoch': 0.62}


 62%|██████▏   | 9601/15547 [34:15<21:26,  4.62it/s]

{'loss': 0.0, 'learning_rate': 3.952282847079152e-05, 'epoch': 0.62}


 62%|██████▏   | 9611/15547 [34:17<19:52,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.945637004053964e-05, 'epoch': 0.62}


 62%|██████▏   | 9621/15547 [34:19<19:44,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.9389911610287764e-05, 'epoch': 0.62}


 62%|██████▏   | 9631/15547 [34:21<19:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.932345318003589e-05, 'epoch': 0.62}


 62%|██████▏   | 9641/15547 [34:23<19:46,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.925699474978401e-05, 'epoch': 0.62}


 62%|██████▏   | 9651/15547 [34:25<19:46,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.9190536319532136e-05, 'epoch': 0.62}


 62%|██████▏   | 9661/15547 [34:27<19:44,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.912407788928026e-05, 'epoch': 0.62}


 62%|██████▏   | 9671/15547 [34:29<19:34,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.905761945902838e-05, 'epoch': 0.62}


 62%|██████▏   | 9681/15547 [34:31<19:36,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.89911610287765e-05, 'epoch': 0.62}


 62%|██████▏   | 9691/15547 [34:33<19:28,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.892470259852462e-05, 'epoch': 0.62}


 62%|██████▏   | 9701/15547 [34:35<21:35,  4.51it/s]

{'loss': 0.0, 'learning_rate': 3.8858244168272745e-05, 'epoch': 0.62}


 62%|██████▏   | 9711/15547 [34:37<19:34,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.8791785738020867e-05, 'epoch': 0.62}


 63%|██████▎   | 9721/15547 [34:39<19:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.8725327307768995e-05, 'epoch': 0.63}


 63%|██████▎   | 9731/15547 [34:41<19:31,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.865886887751711e-05, 'epoch': 0.63}


 63%|██████▎   | 9741/15547 [34:43<19:24,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.859241044726524e-05, 'epoch': 0.63}


 63%|██████▎   | 9750/15547 [34:45<19:10,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.852595201701336e-05, 'epoch': 0.63}


                                                    
 63%|██████▎   | 9751/15547 [34:48<1:46:59,  1.11s/it]

{'eval_loss': 1.0448747422842075e-09, 'eval_runtime': 3.0126, 'eval_samples_per_second': 829.853, 'eval_steps_per_second': 13.278, 'epoch': 0.63}


 63%|██████▎   | 9761/15547 [34:50<21:50,  4.41it/s]  

{'loss': 0.0, 'learning_rate': 3.845949358676148e-05, 'epoch': 0.63}


 63%|██████▎   | 9771/15547 [34:52<19:20,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.8393035156509604e-05, 'epoch': 0.63}


 63%|██████▎   | 9781/15547 [34:54<19:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.832657672625773e-05, 'epoch': 0.63}


 63%|██████▎   | 9791/15547 [34:56<19:08,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.826011829600585e-05, 'epoch': 0.63}


 63%|██████▎   | 9801/15547 [34:58<21:04,  4.54it/s]

{'loss': 0.0, 'learning_rate': 3.8193659865753976e-05, 'epoch': 0.63}


 63%|██████▎   | 9811/15547 [35:00<19:12,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.81272014355021e-05, 'epoch': 0.63}


 63%|██████▎   | 9821/15547 [35:02<19:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.806074300525021e-05, 'epoch': 0.63}


 63%|██████▎   | 9831/15547 [35:04<19:06,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.799428457499834e-05, 'epoch': 0.63}


 63%|██████▎   | 9841/15547 [35:06<19:04,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.792782614474646e-05, 'epoch': 0.63}


 63%|██████▎   | 9851/15547 [35:08<19:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.7861367714494584e-05, 'epoch': 0.63}


 63%|██████▎   | 9861/15547 [35:10<18:55,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.7794909284242706e-05, 'epoch': 0.63}


 63%|██████▎   | 9871/15547 [35:12<19:00,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.7728450853990835e-05, 'epoch': 0.63}


 64%|██████▎   | 9881/15547 [35:14<18:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.766199242373895e-05, 'epoch': 0.64}


 64%|██████▎   | 9891/15547 [35:16<18:55,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.759553399348708e-05, 'epoch': 0.64}


 64%|██████▎   | 9901/15547 [35:18<20:18,  4.63it/s]

{'loss': 0.0, 'learning_rate': 3.75290755632352e-05, 'epoch': 0.64}


 64%|██████▎   | 9911/15547 [35:20<18:55,  4.96it/s]

{'loss': 0.0, 'learning_rate': 3.746261713298332e-05, 'epoch': 0.64}


 64%|██████▍   | 9921/15547 [35:22<18:51,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.739615870273144e-05, 'epoch': 0.64}


 64%|██████▍   | 9931/15547 [35:24<18:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.7329700272479565e-05, 'epoch': 0.64}


 64%|██████▍   | 9941/15547 [35:26<18:48,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.726324184222769e-05, 'epoch': 0.64}


 64%|██████▍   | 9951/15547 [35:28<18:43,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.719678341197581e-05, 'epoch': 0.64}


 64%|██████▍   | 9961/15547 [35:30<18:43,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.713032498172394e-05, 'epoch': 0.64}


 64%|██████▍   | 9971/15547 [35:32<18:39,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.706386655147205e-05, 'epoch': 0.64}


 64%|██████▍   | 9981/15547 [35:34<18:29,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.699740812122018e-05, 'epoch': 0.64}


 64%|██████▍   | 9991/15547 [35:36<18:28,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.69309496909683e-05, 'epoch': 0.64}


 64%|██████▍   | 10000/15547 [35:38<21:16,  4.35it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.6864491260716424e-05, 'epoch': 0.64}


                                                     
 64%|██████▍   | 10000/15547 [35:41<21:16,  4.35it/s]Saving model checkpoint to ./results/checkpoint-10000
Configuration saved in ./results/checkpoint-10000/config.json


{'eval_loss': 6.707456812193868e-10, 'eval_runtime': 3.0167, 'eval_samples_per_second': 828.719, 'eval_steps_per_second': 13.26, 'epoch': 0.64}


Model weights saved in ./results/checkpoint-10000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 64%|██████▍   | 10011/15547 [35:44<21:40,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 3.6798032830464546e-05, 'epoch': 0.64}


 64%|██████▍   | 10021/15547 [35:46<18:36,  4.95it/s]

{'loss': 0.0, 'learning_rate': 3.673157440021267e-05, 'epoch': 0.64}


 65%|██████▍   | 10031/15547 [35:48<18:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.666511596996079e-05, 'epoch': 0.65}


 65%|██████▍   | 10041/15547 [35:50<18:23,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.659865753970891e-05, 'epoch': 0.65}


 65%|██████▍   | 10051/15547 [35:52<18:21,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.653219910945704e-05, 'epoch': 0.65}


 65%|██████▍   | 10061/15547 [35:54<18:16,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.6465740679205154e-05, 'epoch': 0.65}


 65%|██████▍   | 10071/15547 [35:56<18:21,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.639928224895328e-05, 'epoch': 0.65}


 65%|██████▍   | 10081/15547 [35:58<18:16,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.6332823818701405e-05, 'epoch': 0.65}


 65%|██████▍   | 10091/15547 [36:00<18:08,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.6266365388449526e-05, 'epoch': 0.65}


 65%|██████▍   | 10101/15547 [36:02<19:51,  4.57it/s]

{'loss': 0.0, 'learning_rate': 3.619990695819765e-05, 'epoch': 0.65}


 65%|██████▌   | 10111/15547 [36:04<18:12,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.6133448527945777e-05, 'epoch': 0.65}


 65%|██████▌   | 10121/15547 [36:06<18:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.606699009769389e-05, 'epoch': 0.65}


 65%|██████▌   | 10131/15547 [36:08<18:07,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.600053166744202e-05, 'epoch': 0.65}


 65%|██████▌   | 10141/15547 [36:10<18:01,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.593407323719014e-05, 'epoch': 0.65}


 65%|██████▌   | 10151/15547 [36:12<18:00,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.586761480693826e-05, 'epoch': 0.65}


 65%|██████▌   | 10161/15547 [36:14<18:02,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.5801156376686385e-05, 'epoch': 0.65}


 65%|██████▌   | 10171/15547 [36:16<17:54,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.573469794643451e-05, 'epoch': 0.65}


 65%|██████▌   | 10181/15547 [36:18<17:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.566823951618263e-05, 'epoch': 0.65}


 66%|██████▌   | 10191/15547 [36:20<17:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.560178108593075e-05, 'epoch': 0.66}


 66%|██████▌   | 10201/15547 [36:22<19:12,  4.64it/s]

{'loss': 0.0, 'learning_rate': 3.553532265567888e-05, 'epoch': 0.66}


 66%|██████▌   | 10211/15547 [36:24<17:52,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.5468864225426994e-05, 'epoch': 0.66}


 66%|██████▌   | 10221/15547 [36:26<17:45,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.540240579517512e-05, 'epoch': 0.66}


 66%|██████▌   | 10231/15547 [36:28<17:46,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.5335947364923244e-05, 'epoch': 0.66}


 66%|██████▌   | 10241/15547 [36:30<17:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.5269488934671366e-05, 'epoch': 0.66}


 66%|██████▌   | 10250/15547 [36:32<17:35,  5.02it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.520303050441949e-05, 'epoch': 0.66}


                                                     
 66%|██████▌   | 10251/15547 [36:35<1:37:31,  1.10s/it]

{'eval_loss': 9.279205182011196e-11, 'eval_runtime': 3.0026, 'eval_samples_per_second': 832.606, 'eval_steps_per_second': 13.322, 'epoch': 0.66}


 66%|██████▌   | 10261/15547 [36:37<19:57,  4.41it/s]  

{'loss': 0.0, 'learning_rate': 3.513657207416761e-05, 'epoch': 0.66}


 66%|██████▌   | 10271/15547 [36:39<17:45,  4.95it/s]

{'loss': 0.0, 'learning_rate': 3.507011364391573e-05, 'epoch': 0.66}


 66%|██████▌   | 10281/15547 [36:41<17:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.500365521366385e-05, 'epoch': 0.66}


 66%|██████▌   | 10291/15547 [36:43<17:36,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.493719678341198e-05, 'epoch': 0.66}


 66%|██████▋   | 10301/15547 [36:45<19:12,  4.55it/s]

{'loss': 0.0, 'learning_rate': 3.4870738353160096e-05, 'epoch': 0.66}


 66%|██████▋   | 10311/15547 [36:47<17:39,  4.94it/s]

{'loss': 0.0, 'learning_rate': 3.4804279922908225e-05, 'epoch': 0.66}


 66%|██████▋   | 10321/15547 [36:49<17:28,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.4737821492656346e-05, 'epoch': 0.66}


 66%|██████▋   | 10331/15547 [36:51<17:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.467136306240447e-05, 'epoch': 0.66}


 67%|██████▋   | 10341/15547 [36:53<17:24,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.460490463215259e-05, 'epoch': 0.67}


 67%|██████▋   | 10351/15547 [36:55<17:22,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.453844620190071e-05, 'epoch': 0.67}


 67%|██████▋   | 10361/15547 [36:57<17:22,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.4471987771648833e-05, 'epoch': 0.67}


 67%|██████▋   | 10371/15547 [36:59<17:14,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.4405529341396955e-05, 'epoch': 0.67}


 67%|██████▋   | 10381/15547 [37:01<17:18,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.4339070911145084e-05, 'epoch': 0.67}


 67%|██████▋   | 10391/15547 [37:03<17:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.42726124808932e-05, 'epoch': 0.67}


 67%|██████▋   | 10401/15547 [37:05<18:35,  4.61it/s]

{'loss': 0.0, 'learning_rate': 3.420615405064133e-05, 'epoch': 0.67}


 67%|██████▋   | 10411/15547 [37:07<17:14,  4.96it/s]

{'loss': 0.0, 'learning_rate': 3.413969562038945e-05, 'epoch': 0.67}


 67%|██████▋   | 10421/15547 [37:09<17:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.407323719013757e-05, 'epoch': 0.67}


 67%|██████▋   | 10431/15547 [37:11<16:57,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.400677875988569e-05, 'epoch': 0.67}


 67%|██████▋   | 10441/15547 [37:13<16:54,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.394032032963382e-05, 'epoch': 0.67}


 67%|██████▋   | 10451/15547 [37:15<16:54,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.3873861899381936e-05, 'epoch': 0.67}


 67%|██████▋   | 10461/15547 [37:17<16:52,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.3807403469130064e-05, 'epoch': 0.67}


 67%|██████▋   | 10471/15547 [37:19<16:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.3740945038878186e-05, 'epoch': 0.67}


 67%|██████▋   | 10481/15547 [37:21<16:56,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.367448660862631e-05, 'epoch': 0.67}


 67%|██████▋   | 10491/15547 [37:23<16:50,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.360802817837443e-05, 'epoch': 0.67}


 68%|██████▊   | 10500/15547 [37:25<19:05,  4.41it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.354156974812255e-05, 'epoch': 0.68}


                                                     
 68%|██████▊   | 10500/15547 [37:28<19:05,  4.41it/s]Saving model checkpoint to ./results/checkpoint-10500
Configuration saved in ./results/checkpoint-10500/config.json


{'eval_loss': 1.8270074342296994e-10, 'eval_runtime': 3.0148, 'eval_samples_per_second': 829.239, 'eval_steps_per_second': 13.268, 'epoch': 0.68}


Model weights saved in ./results/checkpoint-10500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 68%|██████▊   | 10511/15547 [37:31<19:36,  4.28it/s]  

{'loss': 0.0, 'learning_rate': 3.347511131787067e-05, 'epoch': 0.68}


 68%|██████▊   | 10521/15547 [37:33<16:59,  4.93it/s]

{'loss': 0.0, 'learning_rate': 3.3408652887618795e-05, 'epoch': 0.68}


 68%|██████▊   | 10531/15547 [37:35<16:48,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.334219445736692e-05, 'epoch': 0.68}


 68%|██████▊   | 10541/15547 [37:37<16:34,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.327573602711504e-05, 'epoch': 0.68}


 68%|██████▊   | 10551/15547 [37:39<16:38,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.320927759686317e-05, 'epoch': 0.68}


 68%|██████▊   | 10561/15547 [37:41<16:32,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.314281916661129e-05, 'epoch': 0.68}


 68%|██████▊   | 10571/15547 [37:43<16:31,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.307636073635941e-05, 'epoch': 0.68}


 68%|██████▊   | 10581/15547 [37:45<16:28,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.300990230610753e-05, 'epoch': 0.68}


 68%|██████▊   | 10591/15547 [37:47<16:26,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.2943443875855654e-05, 'epoch': 0.68}


 68%|██████▊   | 10601/15547 [37:49<18:13,  4.52it/s]

{'loss': 0.0, 'learning_rate': 3.2876985445603775e-05, 'epoch': 0.68}


 68%|██████▊   | 10611/15547 [37:51<16:22,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.28105270153519e-05, 'epoch': 0.68}


 68%|██████▊   | 10621/15547 [37:53<16:20,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.2744068585100026e-05, 'epoch': 0.68}


 68%|██████▊   | 10631/15547 [37:55<16:17,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.267761015484814e-05, 'epoch': 0.68}


 68%|██████▊   | 10641/15547 [37:57<16:12,  5.04it/s]

{'loss': 0.0, 'learning_rate': 3.261115172459627e-05, 'epoch': 0.68}


 69%|██████▊   | 10651/15547 [37:59<16:15,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.254469329434439e-05, 'epoch': 0.69}


 69%|██████▊   | 10661/15547 [38:01<16:13,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.247823486409251e-05, 'epoch': 0.69}


 69%|██████▊   | 10671/15547 [38:03<16:08,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.2411776433840634e-05, 'epoch': 0.69}


 69%|██████▊   | 10681/15547 [38:05<16:10,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.2345318003588756e-05, 'epoch': 0.69}


 69%|██████▉   | 10691/15547 [38:07<16:07,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.227885957333688e-05, 'epoch': 0.69}


 69%|██████▉   | 10701/15547 [38:09<17:21,  4.65it/s]

{'loss': 0.0, 'learning_rate': 3.2212401143085e-05, 'epoch': 0.69}


 69%|██████▉   | 10711/15547 [38:11<16:07,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.214594271283313e-05, 'epoch': 0.69}


 69%|██████▉   | 10721/15547 [38:13<15:58,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.207948428258124e-05, 'epoch': 0.69}


 69%|██████▉   | 10731/15547 [38:15<15:58,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.201302585232937e-05, 'epoch': 0.69}


 69%|██████▉   | 10741/15547 [38:17<15:54,  5.04it/s]

{'loss': 0.0, 'learning_rate': 3.194656742207749e-05, 'epoch': 0.69}


 69%|██████▉   | 10750/15547 [38:18<15:54,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.1880108991825615e-05, 'epoch': 0.69}


                                                     
 69%|██████▉   | 10751/15547 [38:22<1:28:22,  1.11s/it]

{'eval_loss': 6.16516698803693e-11, 'eval_runtime': 3.0104, 'eval_samples_per_second': 830.446, 'eval_steps_per_second': 13.287, 'epoch': 0.69}


 69%|██████▉   | 10761/15547 [38:24<17:58,  4.44it/s]  

{'loss': 0.0, 'learning_rate': 3.181365056157374e-05, 'epoch': 0.69}


 69%|██████▉   | 10771/15547 [38:26<15:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.1747192131321865e-05, 'epoch': 0.69}


 69%|██████▉   | 10781/15547 [38:27<15:49,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.168073370106998e-05, 'epoch': 0.69}


 69%|██████▉   | 10791/15547 [38:29<15:53,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.161427527081811e-05, 'epoch': 0.69}


 69%|██████▉   | 10801/15547 [38:32<17:01,  4.64it/s]

{'loss': 0.0, 'learning_rate': 3.154781684056623e-05, 'epoch': 0.69}


 70%|██████▉   | 10811/15547 [38:34<15:49,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.148135841031435e-05, 'epoch': 0.7}


 70%|██████▉   | 10821/15547 [38:36<15:45,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.1414899980062474e-05, 'epoch': 0.7}


 70%|██████▉   | 10831/15547 [38:37<15:39,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.1348441549810596e-05, 'epoch': 0.7}


 70%|██████▉   | 10841/15547 [38:39<15:36,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.128198311955872e-05, 'epoch': 0.7}


 70%|██████▉   | 10851/15547 [38:41<15:34,  5.02it/s]

{'loss': 0.0, 'learning_rate': 3.121552468930684e-05, 'epoch': 0.7}


 70%|██████▉   | 10861/15547 [38:43<15:32,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.114906625905497e-05, 'epoch': 0.7}


 70%|██████▉   | 10871/15547 [38:45<15:33,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.108260782880308e-05, 'epoch': 0.7}


 70%|██████▉   | 10881/15547 [38:47<15:32,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.101614939855121e-05, 'epoch': 0.7}


 70%|███████   | 10891/15547 [38:49<15:29,  5.01it/s]

{'loss': 0.0, 'learning_rate': 3.094969096829933e-05, 'epoch': 0.7}


 70%|███████   | 10901/15547 [38:51<16:46,  4.61it/s]

{'loss': 0.0, 'learning_rate': 3.0883232538047454e-05, 'epoch': 0.7}


 70%|███████   | 10911/15547 [38:53<15:28,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.0816774107795576e-05, 'epoch': 0.7}


 70%|███████   | 10921/15547 [38:55<15:26,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.07503156775437e-05, 'epoch': 0.7}


 70%|███████   | 10931/15547 [38:57<15:28,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.068385724729182e-05, 'epoch': 0.7}


 70%|███████   | 10941/15547 [38:59<15:22,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.061739881703994e-05, 'epoch': 0.7}


 70%|███████   | 10951/15547 [39:01<15:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.055094038678806e-05, 'epoch': 0.7}


 71%|███████   | 10961/15547 [39:03<15:19,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.0484481956536188e-05, 'epoch': 0.7}


 71%|███████   | 10971/15547 [39:05<15:15,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.041802352628431e-05, 'epoch': 0.71}


 71%|███████   | 10981/15547 [39:07<15:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.0351565096032432e-05, 'epoch': 0.71}


 71%|███████   | 10991/15547 [39:09<15:06,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.0285106665780553e-05, 'epoch': 0.71}


 71%|███████   | 11000/15547 [39:11<17:15,  4.39it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.021864823552868e-05, 'epoch': 0.71}


                                                     
 71%|███████   | 11000/15547 [39:14<17:15,  4.39it/s]Saving model checkpoint to ./results/checkpoint-11000
Configuration saved in ./results/checkpoint-11000/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0352, 'eval_samples_per_second': 823.666, 'eval_steps_per_second': 13.179, 'epoch': 0.71}


Model weights saved in ./results/checkpoint-11000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 71%|███████   | 11011/15547 [39:17<17:36,  4.29it/s]  

{'loss': 0.0, 'learning_rate': 3.0152189805276797e-05, 'epoch': 0.71}


 71%|███████   | 11021/15547 [39:19<15:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.0085731375024922e-05, 'epoch': 0.71}


 71%|███████   | 11031/15547 [39:21<15:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.0019272944773047e-05, 'epoch': 0.71}


 71%|███████   | 11041/15547 [39:23<15:01,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.9952814514521165e-05, 'epoch': 0.71}


 71%|███████   | 11051/15547 [39:25<15:00,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.988635608426929e-05, 'epoch': 0.71}


 71%|███████   | 11061/15547 [39:27<14:56,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.9819897654017416e-05, 'epoch': 0.71}


 71%|███████   | 11071/15547 [39:29<14:55,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.9753439223765534e-05, 'epoch': 0.71}


 71%|███████▏  | 11081/15547 [39:31<14:51,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.968698079351366e-05, 'epoch': 0.71}


 71%|███████▏  | 11091/15547 [39:33<14:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.962052236326178e-05, 'epoch': 0.71}


 71%|███████▏  | 11101/15547 [39:35<15:55,  4.65it/s]

{'loss': 0.0, 'learning_rate': 2.9554063933009903e-05, 'epoch': 0.71}


 71%|███████▏  | 11111/15547 [39:37<14:48,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.9487605502758024e-05, 'epoch': 0.71}


 72%|███████▏  | 11121/15547 [39:39<14:48,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.942114707250615e-05, 'epoch': 0.72}


 72%|███████▏  | 11131/15547 [39:41<14:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.9354688642254268e-05, 'epoch': 0.72}


 72%|███████▏  | 11141/15547 [39:43<14:37,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.9288230212002393e-05, 'epoch': 0.72}


 72%|███████▏  | 11151/15547 [39:45<14:37,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.9221771781750518e-05, 'epoch': 0.72}


 72%|███████▏  | 11161/15547 [39:47<14:38,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.9155313351498636e-05, 'epoch': 0.72}


 72%|███████▏  | 11171/15547 [39:49<14:34,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.908885492124676e-05, 'epoch': 0.72}


 72%|███████▏  | 11181/15547 [39:51<14:32,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.9022396490994887e-05, 'epoch': 0.72}


 72%|███████▏  | 11191/15547 [39:53<14:27,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.8955938060743005e-05, 'epoch': 0.72}


 72%|███████▏  | 11201/15547 [39:55<15:28,  4.68it/s]

{'loss': 0.0, 'learning_rate': 2.888947963049113e-05, 'epoch': 0.72}


 72%|███████▏  | 11211/15547 [39:57<14:33,  4.96it/s]

{'loss': 0.0, 'learning_rate': 2.8823021200239252e-05, 'epoch': 0.72}


 72%|███████▏  | 11221/15547 [39:59<14:24,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.875656276998737e-05, 'epoch': 0.72}


 72%|███████▏  | 11231/15547 [40:01<14:20,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.8690104339735495e-05, 'epoch': 0.72}


 72%|███████▏  | 11241/15547 [40:03<14:18,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.862364590948362e-05, 'epoch': 0.72}


 72%|███████▏  | 11250/15547 [40:05<14:14,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.855718747923174e-05, 'epoch': 0.72}


                                                     
 72%|███████▏  | 11251/15547 [40:08<1:19:31,  1.11s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.0273, 'eval_samples_per_second': 825.812, 'eval_steps_per_second': 13.213, 'epoch': 0.72}


 72%|███████▏  | 11261/15547 [40:10<16:06,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 2.8490729048979864e-05, 'epoch': 0.72}


 72%|███████▏  | 11271/15547 [40:12<14:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.842427061872799e-05, 'epoch': 0.72}


 73%|███████▎  | 11281/15547 [40:14<14:12,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.8357812188476107e-05, 'epoch': 0.73}


 73%|███████▎  | 11291/15547 [40:16<14:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.8291353758224233e-05, 'epoch': 0.73}


 73%|███████▎  | 11301/15547 [40:18<15:11,  4.66it/s]

{'loss': 0.0, 'learning_rate': 2.8224895327972358e-05, 'epoch': 0.73}


 73%|███████▎  | 11311/15547 [40:20<14:14,  4.96it/s]

{'loss': 0.0, 'learning_rate': 2.8158436897720476e-05, 'epoch': 0.73}


 73%|███████▎  | 11321/15547 [40:22<14:03,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.8091978467468598e-05, 'epoch': 0.73}


 73%|███████▎  | 11331/15547 [40:24<14:06,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.8025520037216723e-05, 'epoch': 0.73}


 73%|███████▎  | 11341/15547 [40:26<14:02,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.795906160696484e-05, 'epoch': 0.73}


 73%|███████▎  | 11351/15547 [40:28<13:55,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.7892603176712966e-05, 'epoch': 0.73}


 73%|███████▎  | 11361/15547 [40:30<14:00,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.782614474646109e-05, 'epoch': 0.73}


 73%|███████▎  | 11371/15547 [40:32<13:55,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.775968631620921e-05, 'epoch': 0.73}


 73%|███████▎  | 11381/15547 [40:34<13:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.7693227885957335e-05, 'epoch': 0.73}


 73%|███████▎  | 11391/15547 [40:36<13:49,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.762676945570546e-05, 'epoch': 0.73}


 73%|███████▎  | 11401/15547 [40:38<14:49,  4.66it/s]

{'loss': 0.0, 'learning_rate': 2.756031102545358e-05, 'epoch': 0.73}


 73%|███████▎  | 11411/15547 [40:40<13:47,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.7493852595201704e-05, 'epoch': 0.73}


 73%|███████▎  | 11421/15547 [40:42<13:44,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.7427394164949825e-05, 'epoch': 0.73}


 74%|███████▎  | 11431/15547 [40:44<13:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.7360935734697947e-05, 'epoch': 0.74}


 74%|███████▎  | 11441/15547 [40:46<13:35,  5.04it/s]

{'loss': 0.0, 'learning_rate': 2.729447730444607e-05, 'epoch': 0.74}


 74%|███████▎  | 11451/15547 [40:48<13:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.7228018874194194e-05, 'epoch': 0.74}


 74%|███████▎  | 11461/15547 [40:50<13:34,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.7161560443942312e-05, 'epoch': 0.74}


 74%|███████▍  | 11471/15547 [40:52<13:37,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.7095102013690437e-05, 'epoch': 0.74}


 74%|███████▍  | 11481/15547 [40:54<13:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.7028643583438562e-05, 'epoch': 0.74}


 74%|███████▍  | 11491/15547 [40:56<13:31,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.696218515318668e-05, 'epoch': 0.74}


 74%|███████▍  | 11500/15547 [40:58<15:14,  4.43it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.6895726722934806e-05, 'epoch': 0.74}


                                                     
 74%|███████▍  | 11500/15547 [41:01<15:14,  4.43it/s]Saving model checkpoint to ./results/checkpoint-11500
Configuration saved in ./results/checkpoint-11500/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0395, 'eval_samples_per_second': 822.516, 'eval_steps_per_second': 13.16, 'epoch': 0.74}


Model weights saved in ./results/checkpoint-11500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 74%|███████▍  | 11511/15547 [41:04<15:36,  4.31it/s]  

{'loss': 0.0, 'learning_rate': 2.682926829268293e-05, 'epoch': 0.74}


 74%|███████▍  | 11521/15547 [41:06<13:21,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.676280986243105e-05, 'epoch': 0.74}


 74%|███████▍  | 11531/15547 [41:08<13:22,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.6696351432179174e-05, 'epoch': 0.74}


 74%|███████▍  | 11541/15547 [41:10<13:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.6629893001927296e-05, 'epoch': 0.74}


 74%|███████▍  | 11551/15547 [41:12<13:21,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.6563434571675418e-05, 'epoch': 0.74}


 74%|███████▍  | 11561/15547 [41:14<13:17,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.649697614142354e-05, 'epoch': 0.74}


 74%|███████▍  | 11571/15547 [41:16<13:13,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.6430517711171665e-05, 'epoch': 0.74}


 74%|███████▍  | 11581/15547 [41:18<13:11,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.6364059280919783e-05, 'epoch': 0.74}


 75%|███████▍  | 11591/15547 [41:20<13:13,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.6297600850667908e-05, 'epoch': 0.75}


 75%|███████▍  | 11601/15547 [41:22<14:08,  4.65it/s]

{'loss': 0.0, 'learning_rate': 2.6231142420416033e-05, 'epoch': 0.75}


 75%|███████▍  | 11611/15547 [41:24<13:06,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.6164683990164152e-05, 'epoch': 0.75}


 75%|███████▍  | 11621/15547 [41:26<13:01,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.6098225559912277e-05, 'epoch': 0.75}


 75%|███████▍  | 11631/15547 [41:28<13:03,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.6031767129660402e-05, 'epoch': 0.75}


 75%|███████▍  | 11641/15547 [41:30<13:01,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.596530869940852e-05, 'epoch': 0.75}


 75%|███████▍  | 11651/15547 [41:32<12:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.5898850269156642e-05, 'epoch': 0.75}


 75%|███████▌  | 11661/15547 [41:34<12:57,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.5832391838904767e-05, 'epoch': 0.75}


 75%|███████▌  | 11671/15547 [41:36<12:58,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.5765933408652886e-05, 'epoch': 0.75}


 75%|███████▌  | 11681/15547 [41:38<12:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.569947497840101e-05, 'epoch': 0.75}


 75%|███████▌  | 11691/15547 [41:40<12:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.5633016548149136e-05, 'epoch': 0.75}


 75%|███████▌  | 11701/15547 [41:42<13:46,  4.65it/s]

{'loss': 0.0, 'learning_rate': 2.5566558117897254e-05, 'epoch': 0.75}


 75%|███████▌  | 11711/15547 [41:44<12:49,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.550009968764538e-05, 'epoch': 0.75}


 75%|███████▌  | 11721/15547 [41:46<12:48,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.5433641257393504e-05, 'epoch': 0.75}


 75%|███████▌  | 11731/15547 [41:48<12:40,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.5367182827141623e-05, 'epoch': 0.75}


 76%|███████▌  | 11741/15547 [41:50<12:39,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.5300724396889748e-05, 'epoch': 0.76}


 76%|███████▌  | 11750/15547 [41:52<12:40,  4.99it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.523426596663787e-05, 'epoch': 0.76}


                                                     
 76%|███████▌  | 11751/15547 [41:55<1:10:23,  1.11s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.0308, 'eval_samples_per_second': 824.863, 'eval_steps_per_second': 13.198, 'epoch': 0.76}


 76%|███████▌  | 11761/15547 [41:57<14:14,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 2.516780753638599e-05, 'epoch': 0.76}


 76%|███████▌  | 11771/15547 [41:59<12:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.5101349106134113e-05, 'epoch': 0.76}


 76%|███████▌  | 11781/15547 [42:01<12:37,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.5034890675882238e-05, 'epoch': 0.76}


 76%|███████▌  | 11791/15547 [42:03<12:33,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.496843224563036e-05, 'epoch': 0.76}


 76%|███████▌  | 11801/15547 [42:05<13:31,  4.62it/s]

{'loss': 0.0, 'learning_rate': 2.490197381537848e-05, 'epoch': 0.76}


 76%|███████▌  | 11811/15547 [42:07<12:30,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.4835515385126603e-05, 'epoch': 0.76}


 76%|███████▌  | 11821/15547 [42:09<12:28,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.476905695487473e-05, 'epoch': 0.76}


 76%|███████▌  | 11831/15547 [42:11<12:27,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.470259852462285e-05, 'epoch': 0.76}


 76%|███████▌  | 11841/15547 [42:13<12:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.4636140094370972e-05, 'epoch': 0.76}


 76%|███████▌  | 11851/15547 [42:15<12:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.4569681664119097e-05, 'epoch': 0.76}


 76%|███████▋  | 11861/15547 [42:17<12:14,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.450322323386722e-05, 'epoch': 0.76}


 76%|███████▋  | 11871/15547 [42:19<12:16,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.443676480361534e-05, 'epoch': 0.76}


 76%|███████▋  | 11881/15547 [42:21<12:14,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.4370306373363462e-05, 'epoch': 0.76}


 76%|███████▋  | 11891/15547 [42:23<12:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.4303847943111584e-05, 'epoch': 0.76}


 77%|███████▋  | 11901/15547 [42:25<13:13,  4.59it/s]

{'loss': 0.0, 'learning_rate': 2.4237389512859706e-05, 'epoch': 0.77}


 77%|███████▋  | 11911/15547 [42:27<12:09,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.417093108260783e-05, 'epoch': 0.77}


 77%|███████▋  | 11921/15547 [42:29<12:03,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.4104472652355953e-05, 'epoch': 0.77}


 77%|███████▋  | 11931/15547 [42:31<12:01,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.4038014222104074e-05, 'epoch': 0.77}


 77%|███████▋  | 11941/15547 [42:33<11:59,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.39715557918522e-05, 'epoch': 0.77}


 77%|███████▋  | 11951/15547 [42:35<11:56,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.390509736160032e-05, 'epoch': 0.77}


 77%|███████▋  | 11961/15547 [42:37<11:57,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.3838638931348443e-05, 'epoch': 0.77}


 77%|███████▋  | 11971/15547 [42:39<11:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.3772180501096568e-05, 'epoch': 0.77}


 77%|███████▋  | 11981/15547 [42:41<11:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.370572207084469e-05, 'epoch': 0.77}


 77%|███████▋  | 11991/15547 [42:43<11:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.3639263640592808e-05, 'epoch': 0.77}


 77%|███████▋  | 12000/15547 [42:45<13:21,  4.43it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.3572805210340933e-05, 'epoch': 0.77}


                                                     
 77%|███████▋  | 12000/15547 [42:48<13:21,  4.43it/s]Saving model checkpoint to ./results/checkpoint-12000
Configuration saved in ./results/checkpoint-12000/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.054, 'eval_samples_per_second': 818.61, 'eval_steps_per_second': 13.098, 'epoch': 0.77}


Model weights saved in ./results/checkpoint-12000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 77%|███████▋  | 12011/15547 [42:51<13:46,  4.28it/s]  

{'loss': 0.0, 'learning_rate': 2.3506346780089055e-05, 'epoch': 0.77}


 77%|███████▋  | 12021/15547 [42:53<11:50,  4.96it/s]

{'loss': 0.0, 'learning_rate': 2.3439888349837177e-05, 'epoch': 0.77}


 77%|███████▋  | 12031/15547 [42:55<11:41,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.3373429919585302e-05, 'epoch': 0.77}


 77%|███████▋  | 12041/15547 [42:57<11:39,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.3306971489333424e-05, 'epoch': 0.77}


 78%|███████▊  | 12051/15547 [42:59<11:38,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.3240513059081545e-05, 'epoch': 0.78}


 78%|███████▊  | 12061/15547 [43:01<11:36,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.317405462882967e-05, 'epoch': 0.78}


 78%|███████▊  | 12071/15547 [43:03<11:34,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.3107596198577792e-05, 'epoch': 0.78}


 78%|███████▊  | 12081/15547 [43:05<11:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.3041137768325914e-05, 'epoch': 0.78}


 78%|███████▊  | 12091/15547 [43:07<11:29,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.2974679338074036e-05, 'epoch': 0.78}


 78%|███████▊  | 12101/15547 [43:09<12:18,  4.66it/s]

{'loss': 0.0, 'learning_rate': 2.2908220907822157e-05, 'epoch': 0.78}


 78%|███████▊  | 12111/15547 [43:11<11:32,  4.96it/s]

{'loss': 0.0, 'learning_rate': 2.284176247757028e-05, 'epoch': 0.78}


 78%|███████▊  | 12121/15547 [43:13<11:27,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.27753040473184e-05, 'epoch': 0.78}


 78%|███████▊  | 12131/15547 [43:15<11:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.2708845617066526e-05, 'epoch': 0.78}


 78%|███████▊  | 12141/15547 [43:17<11:23,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.2642387186814648e-05, 'epoch': 0.78}


 78%|███████▊  | 12151/15547 [43:19<11:27,  4.94it/s]

{'loss': 0.0, 'learning_rate': 2.257592875656277e-05, 'epoch': 0.78}


 78%|███████▊  | 12161/15547 [43:21<11:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.2509470326310894e-05, 'epoch': 0.78}


 78%|███████▊  | 12171/15547 [43:23<11:14,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.2443011896059016e-05, 'epoch': 0.78}


 78%|███████▊  | 12181/15547 [43:25<11:11,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.2376553465807138e-05, 'epoch': 0.78}


 78%|███████▊  | 12191/15547 [43:27<11:09,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.2310095035555263e-05, 'epoch': 0.78}


 78%|███████▊  | 12201/15547 [43:29<11:58,  4.65it/s]

{'loss': 0.0, 'learning_rate': 2.2243636605303385e-05, 'epoch': 0.78}


 79%|███████▊  | 12211/15547 [43:31<11:09,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.2177178175051507e-05, 'epoch': 0.79}


 79%|███████▊  | 12221/15547 [43:33<11:13,  4.94it/s]

{'loss': 0.0, 'learning_rate': 2.2110719744799628e-05, 'epoch': 0.79}


 79%|███████▊  | 12231/15547 [43:35<10:59,  5.03it/s]

{'loss': 0.0, 'learning_rate': 2.204426131454775e-05, 'epoch': 0.79}


 79%|███████▊  | 12241/15547 [43:37<10:59,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.1977802884295872e-05, 'epoch': 0.79}


 79%|███████▉  | 12250/15547 [43:39<10:57,  5.02it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.1911344454043997e-05, 'epoch': 0.79}


                                                     
 79%|███████▉  | 12251/15547 [43:42<1:00:59,  1.11s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.0225, 'eval_samples_per_second': 827.141, 'eval_steps_per_second': 13.234, 'epoch': 0.79}


 79%|███████▉  | 12261/15547 [43:44<12:21,  4.43it/s]  

{'loss': 0.0, 'learning_rate': 2.184488602379212e-05, 'epoch': 0.79}


 79%|███████▉  | 12271/15547 [43:46<10:58,  4.97it/s]

{'loss': 0.0, 'learning_rate': 2.177842759354024e-05, 'epoch': 0.79}


 79%|███████▉  | 12281/15547 [43:48<10:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.1711969163288365e-05, 'epoch': 0.79}


 79%|███████▉  | 12291/15547 [43:50<10:48,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.1645510733036487e-05, 'epoch': 0.79}


 79%|███████▉  | 12301/15547 [43:52<11:50,  4.57it/s]

{'loss': 0.0, 'learning_rate': 2.157905230278461e-05, 'epoch': 0.79}


 79%|███████▉  | 12311/15547 [43:54<10:46,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.1512593872532734e-05, 'epoch': 0.79}


 79%|███████▉  | 12321/15547 [43:56<10:43,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.1446135442280856e-05, 'epoch': 0.79}


 79%|███████▉  | 12331/15547 [43:58<10:45,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.1379677012028974e-05, 'epoch': 0.79}


 79%|███████▉  | 12341/15547 [44:00<10:39,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.13132185817771e-05, 'epoch': 0.79}


 79%|███████▉  | 12351/15547 [44:02<10:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.124676015152522e-05, 'epoch': 0.79}


 80%|███████▉  | 12361/15547 [44:04<10:43,  4.95it/s]

{'loss': 0.0, 'learning_rate': 2.1180301721273343e-05, 'epoch': 0.8}


 80%|███████▉  | 12371/15547 [44:06<10:30,  5.04it/s]

{'loss': 0.0, 'learning_rate': 2.1113843291021468e-05, 'epoch': 0.8}


 80%|███████▉  | 12381/15547 [44:08<10:29,  5.03it/s]

{'loss': 0.0, 'learning_rate': 2.104738486076959e-05, 'epoch': 0.8}


 80%|███████▉  | 12391/15547 [44:10<10:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.098092643051771e-05, 'epoch': 0.8}


 80%|███████▉  | 12401/15547 [44:12<11:28,  4.57it/s]

{'loss': 0.0, 'learning_rate': 2.0914468000265836e-05, 'epoch': 0.8}


 80%|███████▉  | 12411/15547 [44:14<10:33,  4.95it/s]

{'loss': 0.0, 'learning_rate': 2.0848009570013958e-05, 'epoch': 0.8}


 80%|███████▉  | 12421/15547 [44:16<10:27,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.078155113976208e-05, 'epoch': 0.8}


 80%|███████▉  | 12431/15547 [44:18<10:25,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.07150927095102e-05, 'epoch': 0.8}


 80%|████████  | 12441/15547 [44:20<10:23,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.0648634279258323e-05, 'epoch': 0.8}


 80%|████████  | 12451/15547 [44:22<10:21,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.0582175849006445e-05, 'epoch': 0.8}


 80%|████████  | 12461/15547 [44:24<10:19,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.051571741875457e-05, 'epoch': 0.8}


 80%|████████  | 12471/15547 [44:26<10:14,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.0449258988502692e-05, 'epoch': 0.8}


 80%|████████  | 12481/15547 [44:28<10:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.0382800558250814e-05, 'epoch': 0.8}


 80%|████████  | 12491/15547 [44:30<10:12,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.031634212799894e-05, 'epoch': 0.8}


 80%|████████  | 12500/15547 [44:32<11:13,  4.53it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 2.024988369774706e-05, 'epoch': 0.8}


                                                     
 80%|████████  | 12500/15547 [44:35<11:13,  4.53it/s]Saving model checkpoint to ./results/checkpoint-12500
Configuration saved in ./results/checkpoint-12500/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.014, 'eval_samples_per_second': 829.471, 'eval_steps_per_second': 13.272, 'epoch': 0.8}


Model weights saved in ./results/checkpoint-12500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 80%|████████  | 12511/15547 [44:38<11:52,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 2.0183425267495182e-05, 'epoch': 0.8}


 81%|████████  | 12521/15547 [44:40<10:11,  4.95it/s]

{'loss': 0.0, 'learning_rate': 2.0116966837243307e-05, 'epoch': 0.81}


 81%|████████  | 12531/15547 [44:42<10:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.005050840699143e-05, 'epoch': 0.81}


 81%|████████  | 12541/15547 [44:44<10:02,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.998404997673955e-05, 'epoch': 0.81}


 81%|████████  | 12551/15547 [44:46<10:00,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.9917591546487673e-05, 'epoch': 0.81}


 81%|████████  | 12561/15547 [44:48<09:58,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.9851133116235794e-05, 'epoch': 0.81}


 81%|████████  | 12571/15547 [44:50<09:58,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.9784674685983916e-05, 'epoch': 0.81}


 81%|████████  | 12581/15547 [44:52<09:55,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.971821625573204e-05, 'epoch': 0.81}


 81%|████████  | 12591/15547 [44:54<09:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.9651757825480163e-05, 'epoch': 0.81}


 81%|████████  | 12601/15547 [44:56<10:34,  4.65it/s]

{'loss': 0.0, 'learning_rate': 1.9585299395228285e-05, 'epoch': 0.81}


 81%|████████  | 12611/15547 [44:58<09:51,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.951884096497641e-05, 'epoch': 0.81}


 81%|████████  | 12621/15547 [45:00<09:47,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.945238253472453e-05, 'epoch': 0.81}


 81%|████████  | 12631/15547 [45:02<09:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.9385924104472653e-05, 'epoch': 0.81}


 81%|████████▏ | 12641/15547 [45:04<09:40,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.931946567422078e-05, 'epoch': 0.81}


 81%|████████▏ | 12651/15547 [45:06<09:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.92530072439689e-05, 'epoch': 0.81}


 81%|████████▏ | 12661/15547 [45:08<09:38,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.9186548813717022e-05, 'epoch': 0.81}


 82%|████████▏ | 12671/15547 [45:10<09:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.9120090383465144e-05, 'epoch': 0.81}


 82%|████████▏ | 12681/15547 [45:12<09:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.9053631953213265e-05, 'epoch': 0.82}


 82%|████████▏ | 12691/15547 [45:14<09:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.8987173522961387e-05, 'epoch': 0.82}


 82%|████████▏ | 12701/15547 [45:16<10:13,  4.64it/s]

{'loss': 0.0, 'learning_rate': 1.8920715092709512e-05, 'epoch': 0.82}


 82%|████████▏ | 12711/15547 [45:18<09:32,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.8854256662457634e-05, 'epoch': 0.82}


 82%|████████▏ | 12721/15547 [45:20<09:24,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.8787798232205756e-05, 'epoch': 0.82}


 82%|████████▏ | 12731/15547 [45:22<09:25,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.872133980195388e-05, 'epoch': 0.82}


 82%|████████▏ | 12741/15547 [45:24<09:22,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.8654881371702002e-05, 'epoch': 0.82}


 82%|████████▏ | 12750/15547 [45:26<09:16,  5.02it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.8588422941450124e-05, 'epoch': 0.82}


                                                     
 82%|████████▏ | 12751/15547 [45:29<51:14,  1.10s/it]

{'eval_loss': 0.0, 'eval_runtime': 2.9899, 'eval_samples_per_second': 836.135, 'eval_steps_per_second': 13.378, 'epoch': 0.82}


 82%|████████▏ | 12761/15547 [45:31<10:26,  4.45it/s]

{'loss': 0.0, 'learning_rate': 1.8521964511198246e-05, 'epoch': 0.82}


 82%|████████▏ | 12771/15547 [45:33<09:17,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.8455506080946368e-05, 'epoch': 0.82}


 82%|████████▏ | 12781/15547 [45:35<09:10,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.838904765069449e-05, 'epoch': 0.82}


 82%|████████▏ | 12791/15547 [45:37<09:09,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.8322589220442615e-05, 'epoch': 0.82}


 82%|████████▏ | 12801/15547 [45:39<09:49,  4.66it/s]

{'loss': 0.0, 'learning_rate': 1.8256130790190736e-05, 'epoch': 0.82}


 82%|████████▏ | 12811/15547 [45:41<09:08,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.8189672359938858e-05, 'epoch': 0.82}


 82%|████████▏ | 12821/15547 [45:43<09:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.8123213929686983e-05, 'epoch': 0.82}


 83%|████████▎ | 12831/15547 [45:45<09:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.8056755499435105e-05, 'epoch': 0.83}


 83%|████████▎ | 12841/15547 [45:47<08:59,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.7990297069183227e-05, 'epoch': 0.83}


 83%|████████▎ | 12851/15547 [45:49<08:59,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.792383863893135e-05, 'epoch': 0.83}


 83%|████████▎ | 12861/15547 [45:51<08:55,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.7857380208679473e-05, 'epoch': 0.83}


 83%|████████▎ | 12871/15547 [45:53<08:53,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.7790921778427595e-05, 'epoch': 0.83}


 83%|████████▎ | 12881/15547 [45:55<08:52,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.7724463348175717e-05, 'epoch': 0.83}


 83%|████████▎ | 12891/15547 [45:57<08:50,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.765800491792384e-05, 'epoch': 0.83}


 83%|████████▎ | 12901/15547 [45:59<09:33,  4.61it/s]

{'loss': 0.0, 'learning_rate': 1.759154648767196e-05, 'epoch': 0.83}


 83%|████████▎ | 12911/15547 [46:01<08:49,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.7525088057420082e-05, 'epoch': 0.83}


 83%|████████▎ | 12921/15547 [46:03<08:44,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.7458629627168207e-05, 'epoch': 0.83}


 83%|████████▎ | 12931/15547 [46:05<08:39,  5.03it/s]

{'loss': 0.0, 'learning_rate': 1.739217119691633e-05, 'epoch': 0.83}


 83%|████████▎ | 12941/15547 [46:07<08:40,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.732571276666445e-05, 'epoch': 0.83}


 83%|████████▎ | 12951/15547 [46:09<08:38,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.7259254336412576e-05, 'epoch': 0.83}


 83%|████████▎ | 12961/15547 [46:11<08:37,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.7192795906160698e-05, 'epoch': 0.83}


 83%|████████▎ | 12971/15547 [46:13<08:34,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.712633747590882e-05, 'epoch': 0.83}


 83%|████████▎ | 12981/15547 [46:15<08:34,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.7059879045656944e-05, 'epoch': 0.83}


 84%|████████▎ | 12991/15547 [46:17<08:30,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.6993420615405066e-05, 'epoch': 0.84}


 84%|████████▎ | 13000/15547 [46:19<09:27,  4.49it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.6926962185153188e-05, 'epoch': 0.84}


                                                     
 84%|████████▎ | 13000/15547 [46:22<09:27,  4.49it/s]Saving model checkpoint to ./results/checkpoint-13000
Configuration saved in ./results/checkpoint-13000/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0125, 'eval_samples_per_second': 829.863, 'eval_steps_per_second': 13.278, 'epoch': 0.84}


Model weights saved in ./results/checkpoint-13000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 84%|████████▎ | 13011/15547 [46:25<09:55,  4.26it/s]  

{'loss': 0.0, 'learning_rate': 1.686050375490131e-05, 'epoch': 0.84}


 84%|████████▍ | 13021/15547 [46:27<08:27,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.679404532464943e-05, 'epoch': 0.84}


 84%|████████▍ | 13031/15547 [46:29<08:22,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.6727586894397553e-05, 'epoch': 0.84}


 84%|████████▍ | 13041/15547 [46:31<08:25,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.6661128464145678e-05, 'epoch': 0.84}


 84%|████████▍ | 13051/15547 [46:33<08:24,  4.95it/s]

{'loss': 0.0, 'learning_rate': 1.65946700338938e-05, 'epoch': 0.84}


 84%|████████▍ | 13061/15547 [46:35<08:21,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.652821160364192e-05, 'epoch': 0.84}


 84%|████████▍ | 13071/15547 [46:37<08:15,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.6461753173390047e-05, 'epoch': 0.84}


 84%|████████▍ | 13081/15547 [46:39<08:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.639529474313817e-05, 'epoch': 0.84}


 84%|████████▍ | 13091/15547 [46:41<08:12,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.632883631288629e-05, 'epoch': 0.84}


 84%|████████▍ | 13101/15547 [46:43<08:44,  4.66it/s]

{'loss': 0.0, 'learning_rate': 1.6262377882634412e-05, 'epoch': 0.84}


 84%|████████▍ | 13111/15547 [46:45<08:12,  4.95it/s]

{'loss': 0.0, 'learning_rate': 1.6195919452382534e-05, 'epoch': 0.84}


 84%|████████▍ | 13121/15547 [46:47<08:06,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.6129461022130655e-05, 'epoch': 0.84}


 84%|████████▍ | 13131/15547 [46:49<08:06,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.606300259187878e-05, 'epoch': 0.84}


 85%|████████▍ | 13141/15547 [46:51<08:03,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.5996544161626902e-05, 'epoch': 0.85}


 85%|████████▍ | 13151/15547 [46:53<07:59,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.5930085731375024e-05, 'epoch': 0.85}


 85%|████████▍ | 13161/15547 [46:55<07:59,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.586362730112315e-05, 'epoch': 0.85}


 85%|████████▍ | 13171/15547 [46:57<07:56,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.579716887087127e-05, 'epoch': 0.85}


 85%|████████▍ | 13181/15547 [46:59<07:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.5730710440619393e-05, 'epoch': 0.85}


 85%|████████▍ | 13191/15547 [47:01<07:51,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.5664252010367518e-05, 'epoch': 0.85}


 85%|████████▍ | 13201/15547 [47:03<08:33,  4.57it/s]

{'loss': 0.0, 'learning_rate': 1.559779358011564e-05, 'epoch': 0.85}


 85%|████████▍ | 13211/15547 [47:05<07:51,  4.95it/s]

{'loss': 0.0, 'learning_rate': 1.553133514986376e-05, 'epoch': 0.85}


 85%|████████▌ | 13221/15547 [47:07<07:46,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.5464876719611883e-05, 'epoch': 0.85}


 85%|████████▌ | 13231/15547 [47:09<07:47,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.5398418289360005e-05, 'epoch': 0.85}


 85%|████████▌ | 13241/15547 [47:11<07:41,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.5331959859108126e-05, 'epoch': 0.85}


 85%|████████▌ | 13250/15547 [47:13<07:36,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.526550142885625e-05, 'epoch': 0.85}


                                                     
 85%|████████▌ | 13251/15547 [47:16<42:08,  1.10s/it]

{'eval_loss': 0.0, 'eval_runtime': 2.9927, 'eval_samples_per_second': 835.372, 'eval_steps_per_second': 13.366, 'epoch': 0.85}


 85%|████████▌ | 13261/15547 [47:18<08:35,  4.43it/s]

{'loss': 0.0, 'learning_rate': 1.5199042998604373e-05, 'epoch': 0.85}


 85%|████████▌ | 13271/15547 [47:20<07:35,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.5132584568352495e-05, 'epoch': 0.85}


 85%|████████▌ | 13281/15547 [47:22<07:32,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.506612613810062e-05, 'epoch': 0.85}


 85%|████████▌ | 13291/15547 [47:24<07:32,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.4999667707848742e-05, 'epoch': 0.85}


 86%|████████▌ | 13301/15547 [47:26<08:03,  4.64it/s]

{'loss': 0.0, 'learning_rate': 1.4933209277596864e-05, 'epoch': 0.86}


 86%|████████▌ | 13311/15547 [47:28<07:29,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.4866750847344987e-05, 'epoch': 0.86}


 86%|████████▌ | 13321/15547 [47:30<07:28,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.4800292417093109e-05, 'epoch': 0.86}


 86%|████████▌ | 13331/15547 [47:32<07:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.473383398684123e-05, 'epoch': 0.86}


 86%|████████▌ | 13341/15547 [47:34<07:23,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.4667375556589356e-05, 'epoch': 0.86}


 86%|████████▌ | 13351/15547 [47:36<07:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.4600917126337477e-05, 'epoch': 0.86}


 86%|████████▌ | 13361/15547 [47:38<07:18,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.4534458696085599e-05, 'epoch': 0.86}


 86%|████████▌ | 13371/15547 [47:40<07:17,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.4468000265833722e-05, 'epoch': 0.86}


 86%|████████▌ | 13381/15547 [47:42<07:14,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.4401541835581844e-05, 'epoch': 0.86}


 86%|████████▌ | 13391/15547 [47:44<07:11,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.4335083405329966e-05, 'epoch': 0.86}


 86%|████████▌ | 13401/15547 [47:46<07:42,  4.64it/s]

{'loss': 0.0, 'learning_rate': 1.4268624975078091e-05, 'epoch': 0.86}


 86%|████████▋ | 13411/15547 [47:48<07:07,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.4202166544826213e-05, 'epoch': 0.86}


 86%|████████▋ | 13421/15547 [47:50<07:04,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.4135708114574333e-05, 'epoch': 0.86}


 86%|████████▋ | 13431/15547 [47:52<07:04,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.4069249684322458e-05, 'epoch': 0.86}


 86%|████████▋ | 13441/15547 [47:54<07:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.400279125407058e-05, 'epoch': 0.86}


 87%|████████▋ | 13451/15547 [47:56<06:59,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3936332823818701e-05, 'epoch': 0.87}


 87%|████████▋ | 13461/15547 [47:58<06:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3869874393566827e-05, 'epoch': 0.87}


 87%|████████▋ | 13471/15547 [48:00<06:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3803415963314947e-05, 'epoch': 0.87}


 87%|████████▋ | 13481/15547 [48:02<06:54,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3736957533063068e-05, 'epoch': 0.87}


 87%|████████▋ | 13491/15547 [48:04<06:53,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.3670499102811193e-05, 'epoch': 0.87}


 87%|████████▋ | 13500/15547 [48:06<07:43,  4.41it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.3604040672559315e-05, 'epoch': 0.87}


                                                     
 87%|████████▋ | 13500/15547 [48:09<07:43,  4.41it/s]Saving model checkpoint to ./results/checkpoint-13500
Configuration saved in ./results/checkpoint-13500/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0108, 'eval_samples_per_second': 830.341, 'eval_steps_per_second': 13.285, 'epoch': 0.87}


Model weights saved in ./results/checkpoint-13500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 87%|████████▋ | 13511/15547 [48:12<07:58,  4.26it/s]

{'loss': 0.0, 'learning_rate': 1.3537582242307437e-05, 'epoch': 0.87}


 87%|████████▋ | 13521/15547 [48:14<06:46,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.347112381205556e-05, 'epoch': 0.87}


 87%|████████▋ | 13531/15547 [48:16<06:43,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3404665381803682e-05, 'epoch': 0.87}


 87%|████████▋ | 13541/15547 [48:18<06:42,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.3338206951551804e-05, 'epoch': 0.87}


 87%|████████▋ | 13551/15547 [48:20<06:38,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.3271748521299929e-05, 'epoch': 0.87}


 87%|████████▋ | 13561/15547 [48:22<06:36,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.320529009104805e-05, 'epoch': 0.87}


 87%|████████▋ | 13571/15547 [48:24<06:33,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.3138831660796172e-05, 'epoch': 0.87}


 87%|████████▋ | 13581/15547 [48:26<06:32,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.3072373230544296e-05, 'epoch': 0.87}


 87%|████████▋ | 13591/15547 [48:28<06:33,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.3005914800292418e-05, 'epoch': 0.87}


 87%|████████▋ | 13601/15547 [48:30<07:07,  4.55it/s]

{'loss': 0.0, 'learning_rate': 1.293945637004054e-05, 'epoch': 0.87}


 88%|████████▊ | 13611/15547 [48:32<06:30,  4.96it/s]

{'loss': 0.0, 'learning_rate': 1.2872997939788664e-05, 'epoch': 0.88}


 88%|████████▊ | 13621/15547 [48:34<06:26,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.2806539509536786e-05, 'epoch': 0.88}


 88%|████████▊ | 13631/15547 [48:36<06:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.2740081079284908e-05, 'epoch': 0.88}


 88%|████████▊ | 13641/15547 [48:38<06:22,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.2673622649033031e-05, 'epoch': 0.88}


 88%|████████▊ | 13651/15547 [48:40<06:20,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.2607164218781153e-05, 'epoch': 0.88}


 88%|████████▊ | 13661/15547 [48:42<06:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.2540705788529275e-05, 'epoch': 0.88}


 88%|████████▊ | 13671/15547 [48:44<06:15,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.2474247358277398e-05, 'epoch': 0.88}


 88%|████████▊ | 13681/15547 [48:46<06:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.2407788928025522e-05, 'epoch': 0.88}


 88%|████████▊ | 13691/15547 [48:48<06:06,  5.07it/s]

{'loss': 0.0, 'learning_rate': 1.2341330497773643e-05, 'epoch': 0.88}


 88%|████████▊ | 13701/15547 [48:50<06:45,  4.55it/s]

{'loss': 0.0, 'learning_rate': 1.2274872067521765e-05, 'epoch': 0.88}


 88%|████████▊ | 13711/15547 [48:52<06:07,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.2208413637269889e-05, 'epoch': 0.88}


 88%|████████▊ | 13721/15547 [48:54<06:05,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.214195520701801e-05, 'epoch': 0.88}


 88%|████████▊ | 13731/15547 [48:56<06:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.2075496776766134e-05, 'epoch': 0.88}


 88%|████████▊ | 13741/15547 [48:58<05:59,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.2009038346514257e-05, 'epoch': 0.88}


 88%|████████▊ | 13750/15547 [49:00<05:55,  5.05it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.1942579916262379e-05, 'epoch': 0.88}


                                                     
 88%|████████▊ | 13751/15547 [49:03<33:04,  1.10s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.0115, 'eval_samples_per_second': 830.152, 'eval_steps_per_second': 13.282, 'epoch': 0.88}


 89%|████████▊ | 13761/15547 [49:05<06:40,  4.46it/s]

{'loss': 0.0, 'learning_rate': 1.18761214860105e-05, 'epoch': 0.89}


 89%|████████▊ | 13771/15547 [49:07<05:53,  5.03it/s]

{'loss': 0.0, 'learning_rate': 1.1809663055758624e-05, 'epoch': 0.89}


 89%|████████▊ | 13781/15547 [49:09<05:57,  4.93it/s]

{'loss': 0.0, 'learning_rate': 1.1743204625506746e-05, 'epoch': 0.89}


 89%|████████▊ | 13791/15547 [49:11<05:52,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.167674619525487e-05, 'epoch': 0.89}


 89%|████████▉ | 13801/15547 [49:13<06:22,  4.57it/s]

{'loss': 0.0, 'learning_rate': 1.1610287765002993e-05, 'epoch': 0.89}


 89%|████████▉ | 13811/15547 [49:15<05:48,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.1543829334751113e-05, 'epoch': 0.89}


 89%|████████▉ | 13821/15547 [49:17<05:43,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.1477370904499236e-05, 'epoch': 0.89}


 89%|████████▉ | 13831/15547 [49:19<05:42,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.141091247424736e-05, 'epoch': 0.89}


 89%|████████▉ | 13841/15547 [49:21<05:41,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.1344454043995481e-05, 'epoch': 0.89}


 89%|████████▉ | 13851/15547 [49:23<05:38,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.1277995613743605e-05, 'epoch': 0.89}


 89%|████████▉ | 13861/15547 [49:25<05:35,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.1211537183491726e-05, 'epoch': 0.89}


 89%|████████▉ | 13871/15547 [49:27<05:34,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.1145078753239848e-05, 'epoch': 0.89}


 89%|████████▉ | 13881/15547 [49:29<05:31,  5.03it/s]

{'loss': 0.0, 'learning_rate': 1.1078620322987972e-05, 'epoch': 0.89}


 89%|████████▉ | 13891/15547 [49:31<05:29,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.1012161892736095e-05, 'epoch': 0.89}


 89%|████████▉ | 13901/15547 [49:33<06:00,  4.56it/s]

{'loss': 0.0, 'learning_rate': 1.0945703462484217e-05, 'epoch': 0.89}


 89%|████████▉ | 13911/15547 [49:35<05:27,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.087924503223234e-05, 'epoch': 0.89}


 90%|████████▉ | 13921/15547 [49:37<05:25,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.0812786601980462e-05, 'epoch': 0.9}


 90%|████████▉ | 13931/15547 [49:39<05:21,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.0746328171728584e-05, 'epoch': 0.9}


 90%|████████▉ | 13941/15547 [49:41<05:19,  5.03it/s]

{'loss': 0.0, 'learning_rate': 1.0679869741476707e-05, 'epoch': 0.9}


 90%|████████▉ | 13951/15547 [49:43<05:18,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.0613411311224829e-05, 'epoch': 0.9}


 90%|████████▉ | 13961/15547 [49:45<05:16,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.0546952880972952e-05, 'epoch': 0.9}


 90%|████████▉ | 13971/15547 [49:47<05:14,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.0480494450721076e-05, 'epoch': 0.9}


 90%|████████▉ | 13981/15547 [49:49<05:13,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.0414036020469196e-05, 'epoch': 0.9}


 90%|████████▉ | 13991/15547 [49:51<05:11,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.0347577590217319e-05, 'epoch': 0.9}


 90%|█████████ | 14000/15547 [49:53<05:59,  4.30it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.0281119159965442e-05, 'epoch': 0.9}


                                                     
 90%|█████████ | 14000/15547 [49:56<05:59,  4.30it/s]Saving model checkpoint to ./results/checkpoint-14000
Configuration saved in ./results/checkpoint-14000/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0322, 'eval_samples_per_second': 824.476, 'eval_steps_per_second': 13.192, 'epoch': 0.9}


Model weights saved in ./results/checkpoint-14000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 90%|█████████ | 14011/15547 [49:59<05:56,  4.31it/s]

{'loss': 0.0, 'learning_rate': 1.0214660729713564e-05, 'epoch': 0.9}


 90%|█████████ | 14021/15547 [50:01<05:04,  5.01it/s]

{'loss': 0.0, 'learning_rate': 1.0148202299461688e-05, 'epoch': 0.9}


 90%|█████████ | 14031/15547 [50:03<05:01,  5.03it/s]

{'loss': 0.0, 'learning_rate': 1.008174386920981e-05, 'epoch': 0.9}


 90%|█████████ | 14041/15547 [50:05<05:00,  5.02it/s]

{'loss': 0.0, 'learning_rate': 1.0015285438957931e-05, 'epoch': 0.9}


 90%|█████████ | 14051/15547 [50:06<04:58,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.948827008706055e-06, 'epoch': 0.9}


 90%|█████████ | 14061/15547 [50:08<04:56,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.882368578454178e-06, 'epoch': 0.9}


 91%|█████████ | 14071/15547 [50:10<04:54,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.8159101482023e-06, 'epoch': 0.9}


 91%|█████████ | 14081/15547 [50:12<04:52,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.749451717950423e-06, 'epoch': 0.91}


 91%|█████████ | 14091/15547 [50:14<04:50,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.682993287698545e-06, 'epoch': 0.91}


 91%|█████████ | 14101/15547 [50:17<05:15,  4.59it/s]

{'loss': 0.0, 'learning_rate': 9.616534857446667e-06, 'epoch': 0.91}


 91%|█████████ | 14111/15547 [50:19<04:48,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.55007642719479e-06, 'epoch': 0.91}


 91%|█████████ | 14121/15547 [50:21<04:46,  4.98it/s]

{'loss': 0.0, 'learning_rate': 9.483617996942913e-06, 'epoch': 0.91}


 91%|█████████ | 14131/15547 [50:22<04:43,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.417159566691035e-06, 'epoch': 0.91}


 91%|█████████ | 14141/15547 [50:24<04:40,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.350701136439159e-06, 'epoch': 0.91}


 91%|█████████ | 14151/15547 [50:26<04:38,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.28424270618728e-06, 'epoch': 0.91}


 91%|█████████ | 14161/15547 [50:28<04:36,  5.02it/s]

{'loss': 0.0, 'learning_rate': 9.217784275935402e-06, 'epoch': 0.91}


 91%|█████████ | 14171/15547 [50:30<04:34,  5.01it/s]

{'loss': 0.0, 'learning_rate': 9.151325845683526e-06, 'epoch': 0.91}


 91%|█████████ | 14181/15547 [50:32<04:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 9.084867415431649e-06, 'epoch': 0.91}


 91%|█████████▏| 14191/15547 [50:34<04:31,  4.99it/s]

{'loss': 0.0, 'learning_rate': 9.01840898517977e-06, 'epoch': 0.91}


 91%|█████████▏| 14201/15547 [50:37<04:53,  4.59it/s]

{'loss': 0.0, 'learning_rate': 8.951950554927892e-06, 'epoch': 0.91}


 91%|█████████▏| 14211/15547 [50:38<04:26,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.885492124676016e-06, 'epoch': 0.91}


 91%|█████████▏| 14221/15547 [50:40<04:23,  5.04it/s]

{'loss': 0.0, 'learning_rate': 8.819033694424138e-06, 'epoch': 0.91}


 92%|█████████▏| 14231/15547 [50:42<04:24,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.752575264172261e-06, 'epoch': 0.92}


 92%|█████████▏| 14241/15547 [50:44<04:23,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.686116833920384e-06, 'epoch': 0.92}


 92%|█████████▏| 14250/15547 [50:46<04:17,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 8.619658403668506e-06, 'epoch': 0.92}


                                                     
 92%|█████████▏| 14251/15547 [50:49<23:53,  1.11s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.013, 'eval_samples_per_second': 829.746, 'eval_steps_per_second': 13.276, 'epoch': 0.92}


 92%|█████████▏| 14261/15547 [50:51<04:50,  4.43it/s]

{'loss': 0.0, 'learning_rate': 8.553199973416628e-06, 'epoch': 0.92}


 92%|█████████▏| 14271/15547 [50:53<04:17,  4.96it/s]

{'loss': 0.0, 'learning_rate': 8.486741543164751e-06, 'epoch': 0.92}


 92%|█████████▏| 14281/15547 [50:55<04:14,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.420283112912873e-06, 'epoch': 0.92}


 92%|█████████▏| 14291/15547 [50:57<04:10,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.353824682660996e-06, 'epoch': 0.92}


 92%|█████████▏| 14301/15547 [51:00<04:31,  4.58it/s]

{'loss': 0.0, 'learning_rate': 8.28736625240912e-06, 'epoch': 0.92}


 92%|█████████▏| 14311/15547 [51:02<04:08,  4.98it/s]

{'loss': 0.0, 'learning_rate': 8.220907822157242e-06, 'epoch': 0.92}


 92%|█████████▏| 14321/15547 [51:04<04:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 8.154449391905363e-06, 'epoch': 0.92}


 92%|█████████▏| 14331/15547 [51:06<04:02,  5.01it/s]

{'loss': 0.0, 'learning_rate': 8.087990961653485e-06, 'epoch': 0.92}


 92%|█████████▏| 14341/15547 [51:08<04:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 8.021532531401609e-06, 'epoch': 0.92}


 92%|█████████▏| 14351/15547 [51:10<03:58,  5.01it/s]

{'loss': 0.0, 'learning_rate': 7.955074101149732e-06, 'epoch': 0.92}


 92%|█████████▏| 14361/15547 [51:11<03:58,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.888615670897854e-06, 'epoch': 0.92}


 92%|█████████▏| 14371/15547 [51:13<03:56,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.822157240645975e-06, 'epoch': 0.92}


 93%|█████████▎| 14381/15547 [51:15<03:53,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.755698810394099e-06, 'epoch': 0.92}


 93%|█████████▎| 14391/15547 [51:17<03:52,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.68924038014222e-06, 'epoch': 0.93}


 93%|█████████▎| 14401/15547 [51:20<04:09,  4.60it/s]

{'loss': 0.0, 'learning_rate': 7.622781949890344e-06, 'epoch': 0.93}


 93%|█████████▎| 14411/15547 [51:22<03:48,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.556323519638467e-06, 'epoch': 0.93}


 93%|█████████▎| 14421/15547 [51:24<03:46,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.489865089386588e-06, 'epoch': 0.93}


 93%|█████████▎| 14431/15547 [51:26<03:43,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.423406659134712e-06, 'epoch': 0.93}


 93%|█████████▎| 14441/15547 [51:28<03:41,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.356948228882834e-06, 'epoch': 0.93}


 93%|█████████▎| 14451/15547 [51:30<03:39,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.290489798630956e-06, 'epoch': 0.93}


 93%|█████████▎| 14461/15547 [51:31<03:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 7.2240313683790795e-06, 'epoch': 0.93}


 93%|█████████▎| 14471/15547 [51:33<03:36,  4.97it/s]

{'loss': 0.0, 'learning_rate': 7.157572938127202e-06, 'epoch': 0.93}


 93%|█████████▎| 14481/15547 [51:35<03:33,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.091114507875324e-06, 'epoch': 0.93}


 93%|█████████▎| 14491/15547 [51:37<03:31,  4.99it/s]

{'loss': 0.0, 'learning_rate': 7.024656077623447e-06, 'epoch': 0.93}


 93%|█████████▎| 14500/15547 [51:39<03:55,  4.44it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 6.95819764737157e-06, 'epoch': 0.93}


                                                     
 93%|█████████▎| 14500/15547 [51:42<03:55,  4.44it/s]Saving model checkpoint to ./results/checkpoint-14500
Configuration saved in ./results/checkpoint-14500/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0015, 'eval_samples_per_second': 832.92, 'eval_steps_per_second': 13.327, 'epoch': 0.93}


Model weights saved in ./results/checkpoint-14500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 93%|█████████▎| 14511/15547 [51:46<04:02,  4.27it/s]

{'loss': 0.0, 'learning_rate': 6.8917392171196916e-06, 'epoch': 0.93}


 93%|█████████▎| 14521/15547 [51:48<03:26,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.825280786867814e-06, 'epoch': 0.93}


 93%|█████████▎| 14531/15547 [51:50<03:22,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.7588223566159376e-06, 'epoch': 0.93}


 94%|█████████▎| 14541/15547 [51:52<03:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.692363926364059e-06, 'epoch': 0.94}


 94%|█████████▎| 14551/15547 [51:54<03:18,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.625905496112182e-06, 'epoch': 0.94}


 94%|█████████▎| 14561/15547 [51:56<03:16,  5.01it/s]

{'loss': 0.0, 'learning_rate': 6.559447065860305e-06, 'epoch': 0.94}


 94%|█████████▎| 14571/15547 [51:57<03:14,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.492988635608427e-06, 'epoch': 0.94}


 94%|█████████▍| 14581/15547 [51:59<03:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.42653020535655e-06, 'epoch': 0.94}


 94%|█████████▍| 14591/15547 [52:01<03:11,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.360071775104673e-06, 'epoch': 0.94}


 94%|█████████▍| 14601/15547 [52:04<03:24,  4.63it/s]

{'loss': 0.0, 'learning_rate': 6.293613344852795e-06, 'epoch': 0.94}


 94%|█████████▍| 14611/15547 [52:06<03:08,  4.97it/s]

{'loss': 0.0, 'learning_rate': 6.227154914600917e-06, 'epoch': 0.94}


 94%|█████████▍| 14621/15547 [52:08<03:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 6.16069648434904e-06, 'epoch': 0.94}


 94%|█████████▍| 14631/15547 [52:10<03:03,  4.98it/s]

{'loss': 0.0, 'learning_rate': 6.0942380540971625e-06, 'epoch': 0.94}


 94%|█████████▍| 14641/15547 [52:12<03:00,  5.02it/s]

{'loss': 0.0, 'learning_rate': 6.027779623845285e-06, 'epoch': 0.94}


 94%|█████████▍| 14651/15547 [52:13<02:58,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.961321193593408e-06, 'epoch': 0.94}


 94%|█████████▍| 14661/15547 [52:15<02:56,  5.01it/s]

{'loss': 0.0, 'learning_rate': 5.89486276334153e-06, 'epoch': 0.94}


 94%|█████████▍| 14671/15547 [52:17<02:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.828404333089653e-06, 'epoch': 0.94}


 94%|█████████▍| 14681/15547 [52:19<02:53,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.761945902837775e-06, 'epoch': 0.94}


 94%|█████████▍| 14691/15547 [52:21<02:52,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.695487472585897e-06, 'epoch': 0.94}


 95%|█████████▍| 14701/15547 [52:24<03:00,  4.68it/s]

{'loss': 0.0, 'learning_rate': 5.629029042334021e-06, 'epoch': 0.95}


 95%|█████████▍| 14711/15547 [52:25<02:48,  4.96it/s]

{'loss': 0.0, 'learning_rate': 5.562570612082143e-06, 'epoch': 0.95}


 95%|█████████▍| 14721/15547 [52:27<02:44,  5.02it/s]

{'loss': 0.0, 'learning_rate': 5.496112181830265e-06, 'epoch': 0.95}


 95%|█████████▍| 14731/15547 [52:29<02:43,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.429653751578388e-06, 'epoch': 0.95}


 95%|█████████▍| 14741/15547 [52:31<02:41,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.363195321326511e-06, 'epoch': 0.95}


 95%|█████████▍| 14750/15547 [52:33<02:38,  5.03it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 5.296736891074633e-06, 'epoch': 0.95}


                                                     
 95%|█████████▍| 14751/15547 [52:36<14:35,  1.10s/it]

{'eval_loss': 0.0, 'eval_runtime': 2.9916, 'eval_samples_per_second': 835.678, 'eval_steps_per_second': 13.371, 'epoch': 0.95}


 95%|█████████▍| 14761/15547 [52:38<02:57,  4.44it/s]

{'loss': 0.0, 'learning_rate': 5.230278460822756e-06, 'epoch': 0.95}


 95%|█████████▌| 14771/15547 [52:40<02:35,  4.98it/s]

{'loss': 0.0, 'learning_rate': 5.163820030570879e-06, 'epoch': 0.95}


 95%|█████████▌| 14781/15547 [52:42<02:33,  5.00it/s]

{'loss': 0.0, 'learning_rate': 5.097361600319e-06, 'epoch': 0.95}


 95%|█████████▌| 14791/15547 [52:44<02:29,  5.04it/s]

{'loss': 0.0, 'learning_rate': 5.030903170067124e-06, 'epoch': 0.95}


 95%|█████████▌| 14801/15547 [52:46<02:39,  4.68it/s]

{'loss': 0.0, 'learning_rate': 4.9644447398152455e-06, 'epoch': 0.95}


 95%|█████████▌| 14811/15547 [52:48<02:28,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.897986309563368e-06, 'epoch': 0.95}


 95%|█████████▌| 14821/15547 [52:50<02:25,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.831527879311491e-06, 'epoch': 0.95}


 95%|█████████▌| 14831/15547 [52:52<02:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.765069449059613e-06, 'epoch': 0.95}


 95%|█████████▌| 14841/15547 [52:54<02:21,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.698611018807736e-06, 'epoch': 0.95}


 96%|█████████▌| 14851/15547 [52:56<02:19,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.6321525885558584e-06, 'epoch': 0.96}


 96%|█████████▌| 14861/15547 [52:58<02:18,  4.96it/s]

{'loss': 0.0, 'learning_rate': 4.565694158303981e-06, 'epoch': 0.96}


 96%|█████████▌| 14871/15547 [53:00<02:15,  5.00it/s]

{'loss': 0.0, 'learning_rate': 4.499235728052104e-06, 'epoch': 0.96}


 96%|█████████▌| 14881/15547 [53:02<02:13,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.432777297800226e-06, 'epoch': 0.96}


 96%|█████████▌| 14891/15547 [53:04<02:11,  4.98it/s]

{'loss': 0.0, 'learning_rate': 4.366318867548349e-06, 'epoch': 0.96}


 96%|█████████▌| 14901/15547 [53:06<02:20,  4.60it/s]

{'loss': 0.0, 'learning_rate': 4.299860437296471e-06, 'epoch': 0.96}


 96%|█████████▌| 14911/15547 [53:08<02:08,  4.94it/s]

{'loss': 0.0, 'learning_rate': 4.233402007044594e-06, 'epoch': 0.96}


 96%|█████████▌| 14921/15547 [53:10<02:05,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.1669435767927165e-06, 'epoch': 0.96}


 96%|█████████▌| 14931/15547 [53:12<02:03,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.100485146540839e-06, 'epoch': 0.96}


 96%|█████████▌| 14941/15547 [53:14<02:01,  4.97it/s]

{'loss': 0.0, 'learning_rate': 4.034026716288962e-06, 'epoch': 0.96}


 96%|█████████▌| 14951/15547 [53:16<01:59,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.967568286037084e-06, 'epoch': 0.96}


 96%|█████████▌| 14961/15547 [53:18<01:57,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.901109855785207e-06, 'epoch': 0.96}


 96%|█████████▋| 14971/15547 [53:20<01:55,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.8346514255333286e-06, 'epoch': 0.96}


 96%|█████████▋| 14981/15547 [53:22<01:54,  4.96it/s]

{'loss': 0.0, 'learning_rate': 3.768192995281452e-06, 'epoch': 0.96}


 96%|█████████▋| 14991/15547 [53:24<01:51,  4.99it/s]

{'loss': 0.0, 'learning_rate': 3.701734565029574e-06, 'epoch': 0.96}


 96%|█████████▋| 15000/15547 [53:26<02:03,  4.44it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.6352761347776967e-06, 'epoch': 0.96}


                                                     
 96%|█████████▋| 15000/15547 [53:29<02:03,  4.44it/s]Saving model checkpoint to ./results/checkpoint-15000
Configuration saved in ./results/checkpoint-15000/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0116, 'eval_samples_per_second': 830.135, 'eval_steps_per_second': 13.282, 'epoch': 0.96}


Model weights saved in ./results/checkpoint-15000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
 97%|█████████▋| 15011/15547 [53:33<02:05,  4.26it/s]

{'loss': 0.0, 'learning_rate': 3.568817704525819e-06, 'epoch': 0.97}


 97%|█████████▋| 15021/15547 [53:35<01:45,  4.97it/s]

{'loss': 0.0, 'learning_rate': 3.502359274273942e-06, 'epoch': 0.97}


 97%|█████████▋| 15031/15547 [53:37<01:43,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.4359008440220645e-06, 'epoch': 0.97}


 97%|█████████▋| 15041/15547 [53:39<01:41,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.3694424137701866e-06, 'epoch': 0.97}


 97%|█████████▋| 15051/15547 [53:40<01:38,  5.06it/s]

{'loss': 0.0, 'learning_rate': 3.3029839835183096e-06, 'epoch': 0.97}


 97%|█████████▋| 15061/15547 [53:42<01:36,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.2365255532664318e-06, 'epoch': 0.97}


 97%|█████████▋| 15071/15547 [53:44<01:34,  5.03it/s]

{'loss': 0.0, 'learning_rate': 3.1700671230145544e-06, 'epoch': 0.97}


 97%|█████████▋| 15081/15547 [53:46<01:34,  4.92it/s]

{'loss': 0.0, 'learning_rate': 3.103608692762677e-06, 'epoch': 0.97}


 97%|█████████▋| 15091/15547 [53:48<01:31,  4.98it/s]

{'loss': 0.0, 'learning_rate': 3.0371502625107995e-06, 'epoch': 0.97}


 97%|█████████▋| 15101/15547 [53:51<01:36,  4.60it/s]

{'loss': 0.0, 'learning_rate': 2.970691832258922e-06, 'epoch': 0.97}


 97%|█████████▋| 15111/15547 [53:53<01:28,  4.95it/s]

{'loss': 0.0, 'learning_rate': 2.9042334020070447e-06, 'epoch': 0.97}


 97%|█████████▋| 15121/15547 [53:54<01:24,  5.03it/s]

{'loss': 0.0, 'learning_rate': 2.8377749717551673e-06, 'epoch': 0.97}


 97%|█████████▋| 15131/15547 [53:56<01:23,  5.01it/s]

{'loss': 0.0, 'learning_rate': 2.77131654150329e-06, 'epoch': 0.97}


 97%|█████████▋| 15141/15547 [53:58<01:20,  5.03it/s]

{'loss': 0.0, 'learning_rate': 2.7048581112514124e-06, 'epoch': 0.97}


 97%|█████████▋| 15151/15547 [54:00<01:19,  4.99it/s]

{'loss': 0.0, 'learning_rate': 2.638399680999535e-06, 'epoch': 0.97}


 98%|█████████▊| 15161/15547 [54:02<01:17,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.571941250747657e-06, 'epoch': 0.98}


 98%|█████████▊| 15171/15547 [54:04<01:14,  5.05it/s]

{'loss': 0.0, 'learning_rate': 2.50548282049578e-06, 'epoch': 0.98}


 98%|█████████▊| 15181/15547 [54:06<01:12,  5.03it/s]

{'loss': 0.0, 'learning_rate': 2.4390243902439027e-06, 'epoch': 0.98}


 98%|█████████▊| 15191/15547 [54:08<01:10,  5.04it/s]

{'loss': 0.0, 'learning_rate': 2.372565959992025e-06, 'epoch': 0.98}


 98%|█████████▊| 15201/15547 [54:10<01:15,  4.60it/s]

{'loss': 0.0, 'learning_rate': 2.3061075297401475e-06, 'epoch': 0.98}


 98%|█████████▊| 15211/15547 [54:12<01:07,  4.98it/s]

{'loss': 0.0, 'learning_rate': 2.2396490994882705e-06, 'epoch': 0.98}


 98%|█████████▊| 15221/15547 [54:14<01:05,  5.00it/s]

{'loss': 0.0, 'learning_rate': 2.1731906692363926e-06, 'epoch': 0.98}


 98%|█████████▊| 15231/15547 [54:16<01:02,  5.02it/s]

{'loss': 0.0, 'learning_rate': 2.1067322389845152e-06, 'epoch': 0.98}


 98%|█████████▊| 15241/15547 [54:18<01:00,  5.09it/s]

{'loss': 0.0, 'learning_rate': 2.040273808732638e-06, 'epoch': 0.98}


 98%|█████████▊| 15250/15547 [54:20<00:58,  5.04it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 1.9738153784807604e-06, 'epoch': 0.98}


                                                     
 98%|█████████▊| 15251/15547 [54:23<05:27,  1.10s/it]

{'eval_loss': 0.0, 'eval_runtime': 3.0095, 'eval_samples_per_second': 830.694, 'eval_steps_per_second': 13.291, 'epoch': 0.98}


 98%|█████████▊| 15261/15547 [54:25<01:04,  4.43it/s]

{'loss': 0.0, 'learning_rate': 1.907356948228883e-06, 'epoch': 0.98}


 98%|█████████▊| 15271/15547 [54:27<00:55,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.8408985179770053e-06, 'epoch': 0.98}


 98%|█████████▊| 15281/15547 [54:29<00:54,  4.92it/s]

{'loss': 0.0, 'learning_rate': 1.774440087725128e-06, 'epoch': 0.98}


 98%|█████████▊| 15291/15547 [54:31<00:51,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.7079816574732507e-06, 'epoch': 0.98}


 98%|█████████▊| 15301/15547 [54:34<00:54,  4.48it/s]

{'loss': 0.0, 'learning_rate': 1.641523227221373e-06, 'epoch': 0.98}


 98%|█████████▊| 15311/15547 [54:36<00:47,  4.95it/s]

{'loss': 0.0, 'learning_rate': 1.5750647969694957e-06, 'epoch': 0.98}


 99%|█████████▊| 15321/15547 [54:38<00:45,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.5086063667176182e-06, 'epoch': 0.99}


 99%|█████████▊| 15331/15547 [54:40<00:43,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.4421479364657408e-06, 'epoch': 0.99}


 99%|█████████▊| 15341/15547 [54:42<00:41,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.3756895062138634e-06, 'epoch': 0.99}


 99%|█████████▊| 15351/15547 [54:43<00:39,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.3092310759619858e-06, 'epoch': 0.99}


 99%|█████████▉| 15361/15547 [54:45<00:37,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.2427726457101083e-06, 'epoch': 0.99}


 99%|█████████▉| 15371/15547 [54:47<00:35,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.176314215458231e-06, 'epoch': 0.99}


 99%|█████████▉| 15381/15547 [54:49<00:33,  4.99it/s]

{'loss': 0.0, 'learning_rate': 1.1098557852063535e-06, 'epoch': 0.99}


 99%|█████████▉| 15391/15547 [54:51<00:31,  4.98it/s]

{'loss': 0.0, 'learning_rate': 1.043397354954476e-06, 'epoch': 0.99}


 99%|█████████▉| 15401/15547 [54:54<00:31,  4.57it/s]

{'loss': 0.0, 'learning_rate': 9.769389247025985e-07, 'epoch': 0.99}


 99%|█████████▉| 15411/15547 [54:56<00:27,  4.97it/s]

{'loss': 0.0, 'learning_rate': 9.104804944507211e-07, 'epoch': 0.99}


 99%|█████████▉| 15421/15547 [54:58<00:25,  4.97it/s]

{'loss': 0.0, 'learning_rate': 8.440220641988437e-07, 'epoch': 0.99}


 99%|█████████▉| 15431/15547 [55:00<00:23,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.775636339469662e-07, 'epoch': 0.99}


 99%|█████████▉| 15441/15547 [55:02<00:21,  5.00it/s]

{'loss': 0.0, 'learning_rate': 7.111052036950888e-07, 'epoch': 0.99}


 99%|█████████▉| 15451/15547 [55:04<00:19,  4.99it/s]

{'loss': 0.0, 'learning_rate': 6.446467734432112e-07, 'epoch': 0.99}


 99%|█████████▉| 15461/15547 [55:06<00:17,  4.99it/s]

{'loss': 0.0, 'learning_rate': 5.781883431913338e-07, 'epoch': 0.99}


100%|█████████▉| 15471/15547 [55:08<00:15,  4.97it/s]

{'loss': 0.0, 'learning_rate': 5.117299129394563e-07, 'epoch': 1.0}


100%|█████████▉| 15481/15547 [55:10<00:13,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.4527148268757893e-07, 'epoch': 1.0}


100%|█████████▉| 15491/15547 [55:11<00:11,  5.00it/s]

{'loss': 0.0, 'learning_rate': 3.7881305243570146e-07, 'epoch': 1.0}


100%|█████████▉| 15500/15547 [55:13<00:10,  4.38it/s]***** Running Evaluation *****
  Num examples = 2500
  Batch size = 64


{'loss': 0.0, 'learning_rate': 3.1235462218382404e-07, 'epoch': 1.0}


                                                     
100%|█████████▉| 15500/15547 [55:16<00:10,  4.38it/s]Saving model checkpoint to ./results/checkpoint-15500
Configuration saved in ./results/checkpoint-15500/config.json


{'eval_loss': 0.0, 'eval_runtime': 3.0101, 'eval_samples_per_second': 830.529, 'eval_steps_per_second': 13.288, 'epoch': 1.0}


Model weights saved in ./results/checkpoint-15500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
100%|█████████▉| 15511/15547 [55:20<00:08,  4.25it/s]

{'loss': 0.0, 'learning_rate': 2.4589619193194657e-07, 'epoch': 1.0}


100%|█████████▉| 15521/15547 [55:22<00:05,  4.97it/s]

{'loss': 0.0, 'learning_rate': 1.7943776168006912e-07, 'epoch': 1.0}


100%|█████████▉| 15531/15547 [55:24<00:03,  5.00it/s]

{'loss': 0.0, 'learning_rate': 1.1297933142819168e-07, 'epoch': 1.0}


100%|█████████▉| 15541/15547 [55:26<00:01,  4.99it/s]

{'loss': 0.0, 'learning_rate': 4.652090117631422e-08, 'epoch': 1.0}


100%|██████████| 15547/15547 [55:27<00:00,  5.06it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 15547/15547 [55:27<00:00,  4.67it/s]

{'train_runtime': 3328.8002, 'train_samples_per_second': 149.453, 'train_steps_per_second': 4.67, 'train_loss': 0.03539331228952624, 'epoch': 1.0}





TrainOutput(global_step=15547, training_loss=0.03539331228952624, metrics={'train_runtime': 3328.8002, 'train_samples_per_second': 149.453, 'train_steps_per_second': 4.67, 'train_loss': 0.03539331228952624, 'epoch': 1.0})