In [3]:
import torch
import numpy as np
import pandas as pd

from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers import AutoModel


In [8]:
# Kiểm tra GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [9]:
# Tải tokenizer và mô hình
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = AutoModel.from_pretrained("vinai/phobert-base").to(device)

In [10]:
# Đọc dữ liệu
train_df = pd.read_csv('train_data.csv')
val_df = pd.read_csv('val_data.csv')

train_texts = train_df['content'].tolist()
train_labels = np.array(train_df['label']).astype(int)
val_texts = val_df['content'].tolist()
val_labels = np.array(val_df['label']).astype(int)

In [11]:
# Tokenize dữ liệu với padding
def tokenize_and_pad(texts, tokenizer, max_length=256):
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length
    )

train_encodings = tokenize_and_pad(train_texts, tokenizer)
val_encodings = tokenize_and_pad(val_texts, tokenizer)

In [12]:
# Kiểm tra dữ liệu sau khi token hóa
print(train_encodings.keys())
print(val_encodings.keys())
print(train_encodings['input_ids'][:2])
print(val_encodings['input_ids'][:2])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[[0, 1656, 8, 1347, 8915, 336, 5963, 2546, 620, 396, 30, 1302, 9412, 56669, 11, 197, 133, 151, 3634, 848, 99, 396, 123, 292, 336, 20014, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 2925, 474, 2515, 23523, 34, 275, 262, 829, 133

In [13]:
# Tạo dataset
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device).squeeze()
        return item

    def __len__(self):
        return len(self.labels)

In [14]:
train_dataset = FakeNewsDataset(train_encodings, train_labels.tolist())
val_dataset = FakeNewsDataset(val_encodings, val_labels.tolist())

In [15]:
# Tải mô hình phân loại
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def train_model(learning_rate, weight_decay, num_train_epochs):
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=weight_decay,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=50,
        save_steps=50,
        save_total_limit=3,
        load_best_model_at_end=True,
        learning_rate=learning_rate,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    trainer.train()

    eval_result = trainer.evaluate()
    return eval_result['eval_loss']

In [17]:
# Tinh chỉnh siêu tham số với Grid Search
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],
    'weight_decay': [0.01, 0.015, 0.02],
    'num_train_epochs': [3, 4, 5],
}

best_params = None
best_score = float('inf')

for lr in param_grid['learning_rate']:
    for wd in param_grid['weight_decay']:
        for epochs in param_grid['num_train_epochs']:
            eval_loss = train_model(lr, wd, epochs)
            print(f"Learning rate: {lr}, Weight decay: {wd}, Epochs: {epochs}, Eval loss: {eval_loss}")
            if eval_loss < best_score:
                best_score = eval_loss
                best_params = {'learning_rate': lr, 'weight_decay': wd, 'num_train_epochs': epochs}

print(f"Best params: {best_params}, Best eval loss: {best_score}")



  0%|          | 0/423 [00:00<?, ?it/s]

{'loss': 0.7129, 'grad_norm': 3.921420097351074, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.6952, 'grad_norm': 3.9515187740325928, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.6927, 'grad_norm': 4.282040119171143, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.6708, 'grad_norm': 2.932995319366455, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.6709, 'grad_norm': 6.047645568847656, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6601576805114746, 'eval_runtime': 212.5339, 'eval_samples_per_second': 1.327, 'eval_steps_per_second': 0.169, 'epoch': 0.35}
{'loss': 0.663, 'grad_norm': 3.6535658836364746, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.64, 'grad_norm': 3.5825765132904053, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.6077, 'grad_norm': 4.021620750427246, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.6546, 'grad_norm': 6.027839183807373, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.5909, 'grad_norm': 3.8980185985565186, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5349841713905334, 'eval_runtime': 216.1392, 'eval_samples_per_second': 1.305, 'eval_steps_per_second': 0.167, 'epoch': 0.71}
{'loss': 0.5265, 'grad_norm': 6.771764278411865, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.5031, 'grad_norm': 5.916067123413086, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.5512, 'grad_norm': 10.993598937988281, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.4061, 'grad_norm': 6.737442493438721, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.4308, 'grad_norm': 4.456228733062744, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.4007168114185333, 'eval_runtime': 178.7137, 'eval_samples_per_second': 1.578, 'eval_steps_per_second': 0.201, 'epoch': 1.06}
{'loss': 0.3187, 'grad_norm': 8.337357521057129, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.4794, 'grad_norm': 37.19474411010742, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.4252, 'grad_norm': 8.245497703552246, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.3459, 'grad_norm': 9.643916130065918, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.487, 'grad_norm': 8.415472030639648, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.3932185471057892, 'eval_runtime': 235.6558, 'eval_samples_per_second': 1.197, 'eval_steps_per_second': 0.153, 'epoch': 1.42}
{'loss': 0.4598, 'grad_norm': 17.424936294555664, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.3926, 'grad_norm': 10.93563461303711, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.3701, 'grad_norm': 15.74295425415039, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.3287, 'grad_norm': 22.154041290283203, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.3245, 'grad_norm': 5.805659770965576, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.33798202872276306, 'eval_runtime': 207.9478, 'eval_samples_per_second': 1.356, 'eval_steps_per_second': 0.173, 'epoch': 1.77}
{'loss': 0.3033, 'grad_norm': 23.958932876586914, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.3737, 'grad_norm': 15.364568710327148, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.1727, 'grad_norm': 10.839015007019043, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.2892, 'grad_norm': 10.783150672912598, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.3218, 'grad_norm': 13.726067543029785, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.49028533697128296, 'eval_runtime': 221.9122, 'eval_samples_per_second': 1.271, 'eval_steps_per_second': 0.162, 'epoch': 2.13}
{'loss': 0.3548, 'grad_norm': 19.970218658447266, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.1933, 'grad_norm': 3.8245131969451904, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.3241, 'grad_norm': 0.2678028643131256, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.271, 'grad_norm': 35.42209243774414, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.474, 'grad_norm': 7.909788608551025, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.566775381565094, 'eval_runtime': 218.8765, 'eval_samples_per_second': 1.288, 'eval_steps_per_second': 0.164, 'epoch': 2.48}
{'loss': 0.4782, 'grad_norm': 35.464393615722656, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.3656, 'grad_norm': 27.4945068359375, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.2954, 'grad_norm': 3.410459280014038, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.3215, 'grad_norm': 3.148754835128784, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.3136, 'grad_norm': 0.3942353427410126, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.3220149874687195, 'eval_runtime': 178.5904, 'eval_samples_per_second': 1.579, 'eval_steps_per_second': 0.202, 'epoch': 2.84}
{'loss': 0.2158, 'grad_norm': 39.900550842285156, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.2012, 'grad_norm': 67.73053741455078, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'train_runtime': 9480.908, 'train_samples_per_second': 0.356, 'train_steps_per_second': 0.045, 'train_loss': 0.4320543502803108, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 0.3220149874687195




  0%|          | 0/564 [00:00<?, ?it/s]

{'loss': 0.1244, 'grad_norm': 0.7871965169906616, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0962, 'grad_norm': 0.36190322041511536, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.2778, 'grad_norm': 3.1635282039642334, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.2195, 'grad_norm': 8.813382148742676, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.1036, 'grad_norm': 2.2122669219970703, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.34234383702278137, 'eval_runtime': 226.5648, 'eval_samples_per_second': 1.245, 'eval_steps_per_second': 0.159, 'epoch': 0.35}
{'loss': 0.1253, 'grad_norm': 4.557926177978516, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.1928, 'grad_norm': 0.16578838229179382, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.1216, 'grad_norm': 34.65119171142578, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0634, 'grad_norm': 2.569261074066162, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.126, 'grad_norm': 3.3147103786468506, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.41569989919662476, 'eval_runtime': 154.6195, 'eval_samples_per_second': 1.824, 'eval_steps_per_second': 0.233, 'epoch': 0.71}
{'loss': 0.1219, 'grad_norm': 4.259315490722656, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0585, 'grad_norm': 1.044632911682129, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.1897, 'grad_norm': 2.3972041606903076, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0061, 'grad_norm': 0.0739734098315239, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.1263, 'grad_norm': 22.410890579223633, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.46822530031204224, 'eval_runtime': 151.1584, 'eval_samples_per_second': 1.866, 'eval_steps_per_second': 0.238, 'epoch': 1.06}
{'loss': 0.0484, 'grad_norm': 0.5538607835769653, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0206, 'grad_norm': 0.07662931829690933, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0912, 'grad_norm': 0.07220436632633209, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0026, 'grad_norm': 0.0369395837187767, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.297, 'grad_norm': 125.045166015625, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7278611063957214, 'eval_runtime': 149.3975, 'eval_samples_per_second': 1.888, 'eval_steps_per_second': 0.241, 'epoch': 1.42}
{'loss': 0.274, 'grad_norm': 9.493452072143555, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.2508, 'grad_norm': 12.791748046875, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0348, 'grad_norm': 1.8415225744247437, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.1561, 'grad_norm': 0.12792906165122986, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0039, 'grad_norm': 0.02365775965154171, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6949474811553955, 'eval_runtime': 149.7839, 'eval_samples_per_second': 1.883, 'eval_steps_per_second': 0.24, 'epoch': 1.77}
{'loss': 0.1797, 'grad_norm': 0.018529245629906654, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.5376, 'grad_norm': 6.0164666175842285, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.1132, 'grad_norm': 6.316965579986572, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.2824, 'grad_norm': 13.903456687927246, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.249, 'grad_norm': 18.485990524291992, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7048978805541992, 'eval_runtime': 150.4782, 'eval_samples_per_second': 1.874, 'eval_steps_per_second': 0.239, 'epoch': 2.13}
{'loss': 0.1113, 'grad_norm': 1.126023769378662, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0024, 'grad_norm': 0.029919132590293884, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.1625, 'grad_norm': 0.018340082839131355, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0578, 'grad_norm': 0.114692322909832, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0626, 'grad_norm': 0.020410938188433647, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6333380341529846, 'eval_runtime': 148.2458, 'eval_samples_per_second': 1.902, 'eval_steps_per_second': 0.243, 'epoch': 2.48}
{'loss': 0.0216, 'grad_norm': 0.014129583723843098, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.1633, 'grad_norm': 14.994200706481934, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0374, 'grad_norm': 0.2241075336933136, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0683, 'grad_norm': 21.158565521240234, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.1063, 'grad_norm': 0.1906881332397461, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7418174147605896, 'eval_runtime': 150.0504, 'eval_samples_per_second': 1.879, 'eval_steps_per_second': 0.24, 'epoch': 2.84}
{'loss': 0.122, 'grad_norm': 133.0859375, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.1146, 'grad_norm': 12.044413566589355, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.0657, 'grad_norm': 0.020367663353681564, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.0993, 'grad_norm': 0.05558827146887779, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.05, 'grad_norm': 0.023798735812306404, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7826324701309204, 'eval_runtime': 151.7083, 'eval_samples_per_second': 1.859, 'eval_steps_per_second': 0.237, 'epoch': 3.19}
{'loss': 0.0008, 'grad_norm': 0.010269076563417912, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.2514, 'grad_norm': 0.011517581529915333, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.523, 'grad_norm': 0.058802392333745956, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.5445, 'grad_norm': 42.84156036376953, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.1629, 'grad_norm': 0.17320697009563446, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5674419403076172, 'eval_runtime': 149.3138, 'eval_samples_per_second': 1.889, 'eval_steps_per_second': 0.241, 'epoch': 3.55}
{'loss': 0.0573, 'grad_norm': 3.953383445739746, 'learning_rate': 4.21875e-05, 'epoch': 3.62}
{'loss': 0.0448, 'grad_norm': 2.741203546524048, 'learning_rate': 3.4375e-05, 'epoch': 3.69}
{'loss': 0.0015, 'grad_norm': 0.018348470330238342, 'learning_rate': 2.6562500000000002e-05, 'epoch': 3.76}
{'loss': 0.1702, 'grad_norm': 0.011588722467422485, 'learning_rate': 1.8750000000000002e-05, 'epoch': 3.83}
{'loss': 0.1487, 'grad_norm': 0.03310811147093773, 'learning_rate': 1.09375e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7117894887924194, 'eval_runtime': 148.3765, 'eval_samples_per_second': 1.901, 'eval_steps_per_second': 0.243, 'epoch': 3.9}
{'loss': 0.0639, 'grad_norm': 0.017369432374835014, 'learning_rate': 3.125e-06, 'epoch': 3.97}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 9122.2893, 'train_samples_per_second': 0.493, 'train_steps_per_second': 0.062, 'train_loss': 0.13668254762144877, 'epoch': 4.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 0.5187838673591614




  0%|          | 0/705 [00:00<?, ?it/s]

{'loss': 0.0474, 'grad_norm': 0.0161674153059721, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0008, 'grad_norm': 0.0746709480881691, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.1255, 'grad_norm': 0.01370752602815628, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0964, 'grad_norm': 83.09750366210938, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0006, 'grad_norm': 0.022232064977288246, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5458167791366577, 'eval_runtime': 152.0045, 'eval_samples_per_second': 1.855, 'eval_steps_per_second': 0.237, 'epoch': 0.35}
{'loss': 0.001, 'grad_norm': 0.010746937245130539, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0005, 'grad_norm': 0.008216421119868755, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0515, 'grad_norm': 0.006638235412538052, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0003, 'grad_norm': 0.006056687328964472, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0016, 'grad_norm': 0.007553804200142622, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7010101675987244, 'eval_runtime': 150.7575, 'eval_samples_per_second': 1.871, 'eval_steps_per_second': 0.239, 'epoch': 0.71}
{'loss': 0.0003, 'grad_norm': 0.41761934757232666, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.1846, 'grad_norm': 0.004999557975679636, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.1168, 'grad_norm': 0.010690233670175076, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.1037, 'grad_norm': 0.004218723624944687, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0003, 'grad_norm': 0.006478494964540005, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6049352884292603, 'eval_runtime': 151.7524, 'eval_samples_per_second': 1.858, 'eval_steps_per_second': 0.237, 'epoch': 1.06}
{'loss': 0.0003, 'grad_norm': 0.005279654171317816, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0003, 'grad_norm': 0.012753727845847607, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0372, 'grad_norm': 0.003526925342157483, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0002, 'grad_norm': 0.0037761519197374582, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0106, 'grad_norm': 0.010881839320063591, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.8876785039901733, 'eval_runtime': 164.0338, 'eval_samples_per_second': 1.719, 'eval_steps_per_second': 0.219, 'epoch': 1.42}
{'loss': 0.0003, 'grad_norm': 0.0033530069049447775, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0002, 'grad_norm': 0.0018526121275499463, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0001, 'grad_norm': 0.0021617799066007137, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0001, 'grad_norm': 0.0019361431477591395, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0356, 'grad_norm': 0.0015504328766837716, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9821171164512634, 'eval_runtime': 150.7656, 'eval_samples_per_second': 1.87, 'eval_steps_per_second': 0.239, 'epoch': 1.77}
{'loss': 0.0001, 'grad_norm': 0.0048737600445747375, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.4284, 'grad_norm': 79.28575134277344, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.1282, 'grad_norm': 0.04251129925251007, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.4104, 'grad_norm': 0.01192459650337696, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0727, 'grad_norm': 0.0430603064596653, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6467341184616089, 'eval_runtime': 148.1477, 'eval_samples_per_second': 1.904, 'eval_steps_per_second': 0.243, 'epoch': 2.13}
{'loss': 0.0813, 'grad_norm': 0.011041228659451008, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0006, 'grad_norm': 0.007272783666849136, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0048, 'grad_norm': 0.004587641917169094, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0008, 'grad_norm': 0.11256895214319229, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0687, 'grad_norm': 0.004267828539013863, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.8334217667579651, 'eval_runtime': 148.9758, 'eval_samples_per_second': 1.893, 'eval_steps_per_second': 0.242, 'epoch': 2.48}
{'loss': 0.0013, 'grad_norm': 0.0030506104230880737, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0005, 'grad_norm': 0.0029004872776567936, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.1075, 'grad_norm': 0.009935302659869194, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.1066, 'grad_norm': 0.014192173257470131, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0008, 'grad_norm': 0.013441392220556736, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7159590721130371, 'eval_runtime': 148.8872, 'eval_samples_per_second': 1.894, 'eval_steps_per_second': 0.242, 'epoch': 2.84}
{'loss': 0.2951, 'grad_norm': 35.07199478149414, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.1794, 'grad_norm': 82.05105590820312, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.1094, 'grad_norm': 0.0677320659160614, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.2266, 'grad_norm': 13.56425666809082, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.3014, 'grad_norm': 0.2184535413980484, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.3750174641609192, 'eval_runtime': 148.9943, 'eval_samples_per_second': 1.893, 'eval_steps_per_second': 0.242, 'epoch': 3.19}
{'loss': 0.1139, 'grad_norm': 0.06886564940214157, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.1527, 'grad_norm': 0.03381667658686638, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.0789, 'grad_norm': 0.009286211803555489, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.1644, 'grad_norm': 0.08180661499500275, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.0805, 'grad_norm': 0.3956437408924103, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6054810285568237, 'eval_runtime': 151.7126, 'eval_samples_per_second': 1.859, 'eval_steps_per_second': 0.237, 'epoch': 3.55}
{'loss': 0.1117, 'grad_norm': 0.03521377965807915, 'learning_rate': 4.75609756097561e-05, 'epoch': 3.62}
{'loss': 0.1389, 'grad_norm': 338.580078125, 'learning_rate': 4.51219512195122e-05, 'epoch': 3.69}
{'loss': 0.5037, 'grad_norm': 51.00216293334961, 'learning_rate': 4.26829268292683e-05, 'epoch': 3.76}
{'loss': 0.0802, 'grad_norm': 0.09075962007045746, 'learning_rate': 4.0243902439024395e-05, 'epoch': 3.83}
{'loss': 0.2558, 'grad_norm': 0.03315642848610878, 'learning_rate': 3.780487804878049e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5865392088890076, 'eval_runtime': 148.2576, 'eval_samples_per_second': 1.902, 'eval_steps_per_second': 0.243, 'epoch': 3.9}
{'loss': 0.251, 'grad_norm': 0.0295635387301445, 'learning_rate': 3.5365853658536584e-05, 'epoch': 3.97}
{'loss': 0.0147, 'grad_norm': 0.015603003092110157, 'learning_rate': 3.292682926829269e-05, 'epoch': 4.04}
{'loss': 0.2732, 'grad_norm': 0.012952701188623905, 'learning_rate': 3.048780487804878e-05, 'epoch': 4.11}
{'loss': 0.0015, 'grad_norm': 0.025391114875674248, 'learning_rate': 2.8048780487804882e-05, 'epoch': 4.18}
{'loss': 0.1862, 'grad_norm': 144.3021697998047, 'learning_rate': 2.5609756097560977e-05, 'epoch': 4.26}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5887055993080139, 'eval_runtime': 149.881, 'eval_samples_per_second': 1.881, 'eval_steps_per_second': 0.24, 'epoch': 4.26}
{'loss': 0.1709, 'grad_norm': 0.023935332894325256, 'learning_rate': 2.3170731707317075e-05, 'epoch': 4.33}
{'loss': 0.0757, 'grad_norm': 3.2279438972473145, 'learning_rate': 2.073170731707317e-05, 'epoch': 4.4}
{'loss': 0.1502, 'grad_norm': 291.9654235839844, 'learning_rate': 1.8292682926829268e-05, 'epoch': 4.47}
{'loss': 0.068, 'grad_norm': 0.02630203776061535, 'learning_rate': 1.5853658536585366e-05, 'epoch': 4.54}
{'loss': 0.1528, 'grad_norm': 0.023942938074469566, 'learning_rate': 1.3414634146341466e-05, 'epoch': 4.61}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5837041735649109, 'eval_runtime': 150.9091, 'eval_samples_per_second': 1.869, 'eval_steps_per_second': 0.239, 'epoch': 4.61}
{'loss': 0.0039, 'grad_norm': 0.027502181008458138, 'learning_rate': 1.0975609756097562e-05, 'epoch': 4.68}
{'loss': 0.0014, 'grad_norm': 0.021716201677918434, 'learning_rate': 8.53658536585366e-06, 'epoch': 4.75}
{'loss': 0.0012, 'grad_norm': 0.016695033758878708, 'learning_rate': 6.0975609756097564e-06, 'epoch': 4.82}
{'loss': 0.0684, 'grad_norm': 0.018653536215424538, 'learning_rate': 3.6585365853658537e-06, 'epoch': 4.89}
{'loss': 0.0092, 'grad_norm': 0.020115206018090248, 'learning_rate': 1.2195121951219514e-06, 'epoch': 4.96}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5430057048797607, 'eval_runtime': 151.6304, 'eval_samples_per_second': 1.86, 'eval_steps_per_second': 0.237, 'epoch': 4.96}


Could not locate the best model at ./results\checkpoint-450\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 10648.9623, 'train_samples_per_second': 0.528, 'train_steps_per_second': 0.066, 'train_loss': 0.09149316107773334, 'epoch': 5.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 0.5412152409553528




  0%|          | 0/423 [00:00<?, ?it/s]

{'loss': 0.0011, 'grad_norm': 0.05469508469104767, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.001, 'grad_norm': 0.016263313591480255, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.0563, 'grad_norm': 0.015388895757496357, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0008, 'grad_norm': 0.03149605542421341, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0006, 'grad_norm': 0.014028198085725307, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5698654651641846, 'eval_runtime': 151.5585, 'eval_samples_per_second': 1.861, 'eval_steps_per_second': 0.238, 'epoch': 0.35}
{'loss': 0.0009, 'grad_norm': 0.00840272568166256, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0005, 'grad_norm': 0.007400491740554571, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0239, 'grad_norm': 0.011761612258851528, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0003, 'grad_norm': 0.006826445460319519, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0003, 'grad_norm': 0.00483096344396472, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6626893877983093, 'eval_runtime': 153.1719, 'eval_samples_per_second': 1.841, 'eval_steps_per_second': 0.235, 'epoch': 0.71}
{'loss': 0.0002, 'grad_norm': 0.006231086794286966, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0014, 'grad_norm': 0.0031268936581909657, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0002, 'grad_norm': 0.0030917867552489042, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0109, 'grad_norm': 0.002585755893960595, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0001, 'grad_norm': 0.0018737444188445807, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7979727387428284, 'eval_runtime': 152.4331, 'eval_samples_per_second': 1.85, 'eval_steps_per_second': 0.236, 'epoch': 1.06}
{'loss': 0.0001, 'grad_norm': 0.001835969160310924, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0001, 'grad_norm': 0.002127373591065407, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0001, 'grad_norm': 0.0012382031418383121, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0001, 'grad_norm': 0.0011930797481909394, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0001, 'grad_norm': 0.0012075637932866812, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.886301577091217, 'eval_runtime': 150.9806, 'eval_samples_per_second': 1.868, 'eval_steps_per_second': 0.238, 'epoch': 1.42}
{'loss': 0.0001, 'grad_norm': 0.0011856689816340804, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0001, 'grad_norm': 0.0008352631703019142, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 0.000864662230014801, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 0.0009132404229603708, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 0.0006353181670419872, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.930128276348114, 'eval_runtime': 152.2872, 'eval_samples_per_second': 1.852, 'eval_steps_per_second': 0.236, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 0.0007124104886315763, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 0.0005636770511046052, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 0.0005250591784715652, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 0.0006959957536309958, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 0.0005214551347307861, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9730871915817261, 'eval_runtime': 151.7461, 'eval_samples_per_second': 1.858, 'eval_steps_per_second': 0.237, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 0.0003883748431690037, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 0.0003508273512125015, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 0.000276851758826524, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 0.00035707224742509425, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 0.0003453380486462265, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.994270920753479, 'eval_runtime': 151.9569, 'eval_samples_per_second': 1.856, 'eval_steps_per_second': 0.237, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 0.00033514350070618093, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 0.0002911092306021601, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 0.00025015196297317743, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.2696, 'grad_norm': 0.0037442026659846306, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.2848, 'grad_norm': 0.27289772033691406, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9512500166893005, 'eval_runtime': 150.45, 'eval_samples_per_second': 1.874, 'eval_steps_per_second': 0.239, 'epoch': 2.84}
{'loss': 0.0936, 'grad_norm': 0.024913925677537918, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.3038, 'grad_norm': 801.4618530273438, 'learning_rate': 4.2e-05, 'epoch': 2.98}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 6305.9667, 'train_samples_per_second': 0.535, 'train_steps_per_second': 0.067, 'train_loss': 0.03559036533998867, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 1.5279346704483032




  0%|          | 0/564 [00:00<?, ?it/s]

{'loss': 1.2233, 'grad_norm': 77.37177276611328, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.9579, 'grad_norm': 20.866579055786133, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.7144, 'grad_norm': 292.65814208984375, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.433, 'grad_norm': 0.2007458508014679, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.2456, 'grad_norm': 4.326260089874268, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7257171869277954, 'eval_runtime': 149.6552, 'eval_samples_per_second': 1.884, 'eval_steps_per_second': 0.241, 'epoch': 0.35}
{'loss': 0.2758, 'grad_norm': 49.516170501708984, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0117, 'grad_norm': 0.14327864348888397, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.148, 'grad_norm': 0.12699903547763824, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.1298, 'grad_norm': 0.3457464277744293, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0061, 'grad_norm': 0.10090821236371994, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5831620097160339, 'eval_runtime': 150.3655, 'eval_samples_per_second': 1.875, 'eval_steps_per_second': 0.239, 'epoch': 0.71}
{'loss': 0.0049, 'grad_norm': 0.06623657792806625, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0039, 'grad_norm': 0.044374141842126846, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0031, 'grad_norm': 0.05689585208892822, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0026, 'grad_norm': 0.037665337324142456, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0773, 'grad_norm': 0.03592977300286293, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6837146282196045, 'eval_runtime': 151.2485, 'eval_samples_per_second': 1.864, 'eval_steps_per_second': 0.238, 'epoch': 1.06}
{'loss': 0.0022, 'grad_norm': 0.033627238124608994, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0018, 'grad_norm': 0.03422698378562927, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0736, 'grad_norm': 0.026438353583216667, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0014, 'grad_norm': 0.01937846466898918, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0989, 'grad_norm': 0.01829204522073269, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6773088574409485, 'eval_runtime': 151.8007, 'eval_samples_per_second': 1.858, 'eval_steps_per_second': 0.237, 'epoch': 1.42}
{'loss': 0.0011, 'grad_norm': 0.01802876777946949, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.001, 'grad_norm': 0.015783606097102165, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0773, 'grad_norm': 0.023394236341118813, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0008, 'grad_norm': 0.013602412305772305, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0008, 'grad_norm': 0.01334297377616167, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.6301312446594238, 'eval_runtime': 152.8697, 'eval_samples_per_second': 1.845, 'eval_steps_per_second': 0.235, 'epoch': 1.77}
{'loss': 0.0007, 'grad_norm': 0.011189889162778854, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0006, 'grad_norm': 0.01045490987598896, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0905, 'grad_norm': 0.011438731104135513, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.0007, 'grad_norm': 0.01193308550864458, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0899, 'grad_norm': 0.011508074589073658, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7782942652702332, 'eval_runtime': 151.8176, 'eval_samples_per_second': 1.857, 'eval_steps_per_second': 0.237, 'epoch': 2.13}
{'loss': 0.0006, 'grad_norm': 0.010428015142679214, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0005, 'grad_norm': 0.008841383270919323, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0005, 'grad_norm': 0.007198705337941647, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0005, 'grad_norm': 0.007797816302627325, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0004, 'grad_norm': 0.007963314652442932, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.86237633228302, 'eval_runtime': 153.6674, 'eval_samples_per_second': 1.835, 'eval_steps_per_second': 0.234, 'epoch': 2.48}
{'loss': 0.0004, 'grad_norm': 0.006226187106221914, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0003, 'grad_norm': 0.006664480082690716, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0003, 'grad_norm': 0.005435546860098839, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0003, 'grad_norm': 0.005467678420245647, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0003, 'grad_norm': 0.005019015166908503, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9193159341812134, 'eval_runtime': 153.1105, 'eval_samples_per_second': 1.842, 'eval_steps_per_second': 0.235, 'epoch': 2.84}
{'loss': 0.0002, 'grad_norm': 0.004572988022118807, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.0002, 'grad_norm': 0.004617329221218824, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.0748, 'grad_norm': 0.003742336295545101, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.0003, 'grad_norm': 0.00513175455853343, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.0002, 'grad_norm': 0.0037314877845346928, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9873829483985901, 'eval_runtime': 150.9966, 'eval_samples_per_second': 1.868, 'eval_steps_per_second': 0.238, 'epoch': 3.19}
{'loss': 0.2128, 'grad_norm': 0.0067608351819217205, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.1966, 'grad_norm': 0.012851890176534653, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.0704, 'grad_norm': 49.88240051269531, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.2387, 'grad_norm': 0.15237633883953094, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.0043, 'grad_norm': 0.05947384983301163, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5653202533721924, 'eval_runtime': 151.3982, 'eval_samples_per_second': 1.863, 'eval_steps_per_second': 0.238, 'epoch': 3.55}
{'loss': 0.002, 'grad_norm': 0.012501831166446209, 'learning_rate': 4.21875e-05, 'epoch': 3.62}
{'loss': 0.1447, 'grad_norm': 0.031404413282871246, 'learning_rate': 3.4375e-05, 'epoch': 3.69}
{'loss': 0.001, 'grad_norm': 0.025274550542235374, 'learning_rate': 2.6562500000000002e-05, 'epoch': 3.76}
{'loss': 0.0881, 'grad_norm': 0.009403419680893421, 'learning_rate': 1.8750000000000002e-05, 'epoch': 3.83}
{'loss': 0.0843, 'grad_norm': 0.022499194368720055, 'learning_rate': 1.09375e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.7611519694328308, 'eval_runtime': 150.9465, 'eval_samples_per_second': 1.868, 'eval_steps_per_second': 0.238, 'epoch': 3.9}
{'loss': 0.0009, 'grad_norm': 0.023524874821305275, 'learning_rate': 3.125e-06, 'epoch': 3.97}


Could not locate the best model at ./results\checkpoint-500\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 8510.9303, 'train_samples_per_second': 0.528, 'train_steps_per_second': 0.066, 'train_loss': 0.10288497070403743, 'epoch': 4.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 0.7722607851028442




  0%|          | 0/705 [00:00<?, ?it/s]

{'loss': 0.0008, 'grad_norm': 0.013659988529980183, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0007, 'grad_norm': 0.012882574461400509, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.0007, 'grad_norm': 0.010917635634541512, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0006, 'grad_norm': 0.0084590008482337, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0004, 'grad_norm': 0.03501005470752716, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.821146547794342, 'eval_runtime': 210.6421, 'eval_samples_per_second': 1.339, 'eval_steps_per_second': 0.171, 'epoch': 0.35}
{'loss': 0.1014, 'grad_norm': 0.006085636094212532, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0003, 'grad_norm': 0.005930475890636444, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0003, 'grad_norm': 0.004766091704368591, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0003, 'grad_norm': 0.4595580995082855, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0002, 'grad_norm': 0.004285939037799835, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9083559513092041, 'eval_runtime': 179.1899, 'eval_samples_per_second': 1.574, 'eval_steps_per_second': 0.201, 'epoch': 0.71}
{'loss': 0.0002, 'grad_norm': 0.00376594765111804, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0001, 'grad_norm': 0.0024553413968533278, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0001, 'grad_norm': 0.002526752417907119, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0001, 'grad_norm': 0.0022045206278562546, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0001, 'grad_norm': 0.001570521155372262, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.0148532390594482, 'eval_runtime': 188.2793, 'eval_samples_per_second': 1.498, 'eval_steps_per_second': 0.191, 'epoch': 1.06}
{'loss': 0.0459, 'grad_norm': 2225.39013671875, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0001, 'grad_norm': 0.0038862843066453934, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0001, 'grad_norm': 0.0010838382877409458, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0001, 'grad_norm': 0.0009504526387900114, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 0.0008963688160292804, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9303802847862244, 'eval_runtime': 152.0411, 'eval_samples_per_second': 1.855, 'eval_steps_per_second': 0.237, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 0.007292040623724461, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 0.0006833281368017197, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.1132, 'grad_norm': 0.0011719600297510624, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.1202, 'grad_norm': 0.001353563740849495, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0001, 'grad_norm': 0.0009513251134194434, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.8307430148124695, 'eval_runtime': 164.1461, 'eval_samples_per_second': 1.718, 'eval_steps_per_second': 0.219, 'epoch': 1.77}
{'loss': 0.1794, 'grad_norm': 0.0011984159937128425, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0001, 'grad_norm': 0.002018854022026062, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0003, 'grad_norm': 0.0008205868070945144, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.087, 'grad_norm': 0.001159960636869073, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0001, 'grad_norm': 0.001618128502741456, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.9035223722457886, 'eval_runtime': 146.2324, 'eval_samples_per_second': 1.928, 'eval_steps_per_second': 0.246, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 0.000987268052995205, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 0.0006445538019761443, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 0.00036703769001178443, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 0.00048065808368846774, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 0.00045703735668212175, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.07847261428833, 'eval_runtime': 146.2316, 'eval_samples_per_second': 1.928, 'eval_steps_per_second': 0.246, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 0.00040728613384999335, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 0.00035153754288330674, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 0.0003007574996445328, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 0.00023720513854641467, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 0.00027588591910898685, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.1395907402038574, 'eval_runtime': 183.356, 'eval_samples_per_second': 1.538, 'eval_steps_per_second': 0.196, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 0.0002224722848040983, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 0.000980834010988474, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 0.00019866767979692668, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 0.00021738534269388765, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 0.00020657581626437604, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.134047508239746, 'eval_runtime': 167.1517, 'eval_samples_per_second': 1.687, 'eval_steps_per_second': 0.215, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 0.0007393312989734113, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 0.0008436404750682414, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 0.00017968453175853938, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 0.0001613172935321927, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 0.00016873075219336897, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.1610267162322998, 'eval_runtime': 185.5358, 'eval_samples_per_second': 1.52, 'eval_steps_per_second': 0.194, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 0.00011811617878265679, 'learning_rate': 4.75609756097561e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 0.00032171656494028866, 'learning_rate': 4.51219512195122e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 0.00013197590305935591, 'learning_rate': 4.26829268292683e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 0.00010470593406353146, 'learning_rate': 4.0243902439024395e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 0.00012400723062455654, 'learning_rate': 3.780487804878049e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.278921365737915, 'eval_runtime': 207.2613, 'eval_samples_per_second': 1.361, 'eval_steps_per_second': 0.174, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 0.00015369201719295233, 'learning_rate': 3.5365853658536584e-05, 'epoch': 3.97}
{'loss': 0.0, 'grad_norm': 0.00010873877181438729, 'learning_rate': 3.292682926829269e-05, 'epoch': 4.04}
{'loss': 0.0, 'grad_norm': 9.530093666398898e-05, 'learning_rate': 3.048780487804878e-05, 'epoch': 4.11}
{'loss': 0.0, 'grad_norm': 0.0001725660404190421, 'learning_rate': 2.8048780487804882e-05, 'epoch': 4.18}
{'loss': 0.0, 'grad_norm': 0.00010885130905080587, 'learning_rate': 2.5609756097560977e-05, 'epoch': 4.26}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.2931467294692993, 'eval_runtime': 203.2344, 'eval_samples_per_second': 1.388, 'eval_steps_per_second': 0.177, 'epoch': 4.26}
{'loss': 0.0, 'grad_norm': 0.00010840123286470771, 'learning_rate': 2.3170731707317075e-05, 'epoch': 4.33}
{'loss': 0.0, 'grad_norm': 0.0001007726023090072, 'learning_rate': 2.073170731707317e-05, 'epoch': 4.4}
{'loss': 0.0, 'grad_norm': 0.00010636671504471451, 'learning_rate': 1.8292682926829268e-05, 'epoch': 4.47}
{'loss': 0.0, 'grad_norm': 0.0001176449513877742, 'learning_rate': 1.5853658536585366e-05, 'epoch': 4.54}
{'loss': 0.0, 'grad_norm': 0.00010159647354157642, 'learning_rate': 1.3414634146341466e-05, 'epoch': 4.61}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.3019911050796509, 'eval_runtime': 226.6969, 'eval_samples_per_second': 1.244, 'eval_steps_per_second': 0.159, 'epoch': 4.61}
{'loss': 0.0, 'grad_norm': 9.813171345740557e-05, 'learning_rate': 1.0975609756097562e-05, 'epoch': 4.68}
{'loss': 0.0, 'grad_norm': 0.00010246776946587488, 'learning_rate': 8.53658536585366e-06, 'epoch': 4.75}
{'loss': 0.0, 'grad_norm': 9.380836127093062e-05, 'learning_rate': 6.0975609756097564e-06, 'epoch': 4.82}
{'loss': 0.0, 'grad_norm': 8.239233284257352e-05, 'learning_rate': 3.6585365853658537e-06, 'epoch': 4.89}
{'loss': 0.0, 'grad_norm': 0.00013023370411247015, 'learning_rate': 1.2195121951219514e-06, 'epoch': 4.96}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.3067644834518433, 'eval_runtime': 285.5767, 'eval_samples_per_second': 0.987, 'eval_steps_per_second': 0.126, 'epoch': 4.96}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 14280.184, 'train_samples_per_second': 0.394, 'train_steps_per_second': 0.049, 'train_loss': 0.009267037244087806, 'epoch': 5.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.015, Epochs: 5, Eval loss: 1.3067927360534668




  0%|          | 0/423 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 0.00011773869482567534, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 9.314089402323589e-05, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 0.00010042737267212942, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 7.354994886554778e-05, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 6.24630629317835e-05, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.3020418882369995, 'eval_runtime': 246.4143, 'eval_samples_per_second': 1.144, 'eval_steps_per_second': 0.146, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 5.847906868439168e-05, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 5.149767457623966e-05, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 4.708864071290009e-05, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 4.1826628148555756e-05, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 3.9794678741600364e-05, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.4248085021972656, 'eval_runtime': 239.1891, 'eval_samples_per_second': 1.179, 'eval_steps_per_second': 0.151, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 3.2669871870893985e-05, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 2.107131695083808e-05, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 2.084991501760669e-05, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 2.0451287127798423e-05, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 1.612485539226327e-05, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.5195554494857788, 'eval_runtime': 156.3191, 'eval_samples_per_second': 1.804, 'eval_steps_per_second': 0.23, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 1.6523865269846283e-05, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 1.6704423615010455e-05, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 1.1135523891425692e-05, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 9.39840992941754e-06, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 8.852548489812762e-06, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.483810305595398, 'eval_runtime': 146.8105, 'eval_samples_per_second': 1.921, 'eval_steps_per_second': 0.245, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 9.828079782892019e-06, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 6.8766730692004785e-06, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 7.107956662366632e-06, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 7.298905075003859e-06, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 6.284692517510848e-06, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.5057919025421143, 'eval_runtime': 145.7159, 'eval_samples_per_second': 1.935, 'eval_steps_per_second': 0.247, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 5.888016858079936e-06, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 5.111512564326404e-06, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 4.472739874472609e-06, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 4.264070867066039e-06, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 4.747687398776179e-06, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.7086971998214722, 'eval_runtime': 205.9682, 'eval_samples_per_second': 1.369, 'eval_steps_per_second': 0.175, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 3.1870188195171067e-06, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 3.5219213714299258e-06, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 2.6726515898189973e-06, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 3.7485874599951785e-06, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 3.2676202863513026e-06, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.7341315746307373, 'eval_runtime': 204.5403, 'eval_samples_per_second': 1.379, 'eval_steps_per_second': 0.176, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 2.792299710563384e-06, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 2.6792990865942556e-06, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 2.6143770810449496e-06, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 2.2712583813699894e-06, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 2.373771621932974e-06, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.7940711975097656, 'eval_runtime': 165.4096, 'eval_samples_per_second': 1.705, 'eval_steps_per_second': 0.218, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 2.1775683762825793e-06, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 1.993064643102116e-06, 'learning_rate': 4.2e-05, 'epoch': 2.98}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 8354.243, 'train_samples_per_second': 0.404, 'train_steps_per_second': 0.051, 'train_loss': 8.975912396515033e-07, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.02, Epochs: 3, Eval loss: 1.8069546222686768




  0%|          | 0/564 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 2.3667776076763403e-06, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 1.775491227817838e-06, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 1.953109858732205e-06, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 1.7102074707509018e-06, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 1.3934721891928348e-06, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8179950714111328, 'eval_runtime': 166.2111, 'eval_samples_per_second': 1.697, 'eval_steps_per_second': 0.217, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 1.451237267247052e-06, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 1.011943027151574e-06, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 1.5232774330797838e-06, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 1.378668230245239e-06, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 1.4329762052511796e-06, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.7972526550292969, 'eval_runtime': 166.7647, 'eval_samples_per_second': 1.691, 'eval_steps_per_second': 0.216, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 1.3595514474218362e-06, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 5.181203164283943e-07, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 5.807381739941775e-07, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 5.440616064333881e-07, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 5.073756597084866e-07, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8899544477462769, 'eval_runtime': 156.1723, 'eval_samples_per_second': 1.806, 'eval_steps_per_second': 0.231, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 5.034903551859315e-07, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 7.918630444692099e-07, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 4.355766236585623e-07, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 3.3810033528425265e-07, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 3.3298604762421746e-07, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9154562950134277, 'eval_runtime': 156.9401, 'eval_samples_per_second': 1.797, 'eval_steps_per_second': 0.229, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 3.9154508613137295e-07, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 2.8458154588406614e-07, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 2.8268277674214914e-07, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 3.550977680788492e-07, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 3.065818532377307e-07, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9146085977554321, 'eval_runtime': 157.7826, 'eval_samples_per_second': 1.787, 'eval_steps_per_second': 0.228, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 2.6191074198322895e-07, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 2.5324334274046123e-07, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 2.0727266303310898e-07, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 2.1596528654299618e-07, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 2.3403093507567974e-07, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9245212078094482, 'eval_runtime': 155.7198, 'eval_samples_per_second': 1.811, 'eval_steps_per_second': 0.231, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 1.744677007309292e-07, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 1.653881440688565e-07, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 1.2959849016169755e-07, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 1.8078007713029365e-07, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 1.71595232245636e-07, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9345886707305908, 'eval_runtime': 153.9631, 'eval_samples_per_second': 1.832, 'eval_steps_per_second': 0.234, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 1.5543832887487952e-07, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 1.3770444695637707e-07, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 1.2566157181481685e-07, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 1.185308988738143e-07, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 1.2656862224957877e-07, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.955985188484192, 'eval_runtime': 165.7566, 'eval_samples_per_second': 1.701, 'eval_steps_per_second': 0.217, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 1.1097744589960712e-07, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 1.075133866379474e-07, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 9.633919262341806e-08, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 1.1294293500441199e-07, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 1.06405920519137e-07, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9241770505905151, 'eval_runtime': 259.6575, 'eval_samples_per_second': 1.086, 'eval_steps_per_second': 0.139, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 8.762808079154638e-08, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 7.772005261585946e-08, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 9.769999564923637e-08, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 8.736177647961085e-08, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 9.147224488970096e-08, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8627163171768188, 'eval_runtime': 228.6241, 'eval_samples_per_second': 1.233, 'eval_steps_per_second': 0.157, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 7.302404014808417e-08, 'learning_rate': 4.21875e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 7.586407235748993e-08, 'learning_rate': 3.4375e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 7.634706378212286e-08, 'learning_rate': 2.6562500000000002e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 5.821760495905437e-08, 'learning_rate': 1.8750000000000002e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 7.518559641539468e-08, 'learning_rate': 1.09375e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8484878540039062, 'eval_runtime': 199.7051, 'eval_samples_per_second': 1.412, 'eval_steps_per_second': 0.18, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 1.1445342806837289e-07, 'learning_rate': 3.125e-06, 'epoch': 3.97}


Could not locate the best model at ./results\checkpoint-100\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 10063.2907, 'train_samples_per_second': 0.447, 'train_steps_per_second': 0.056, 'train_loss': 9.722742633065003e-09, 'epoch': 4.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.02, Epochs: 4, Eval loss: 1.847774863243103




  0%|          | 0/705 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 8.618778224445123e-08, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 7.426209691629992e-08, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 8.552526509220115e-08, 'learning_rate': 3e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 6.701803556552477e-08, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 5.6507008849848717e-08, 'learning_rate': 5e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8477674722671509, 'eval_runtime': 210.4222, 'eval_samples_per_second': 1.34, 'eval_steps_per_second': 0.171, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 6.486641979108754e-08, 'learning_rate': 6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 6.311766043154421e-08, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 6.896190285488046e-08, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 7.32611837861441e-08, 'learning_rate': 9e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 7.773719090664599e-08, 'learning_rate': 1e-05, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8452469110488892, 'eval_runtime': 184.3691, 'eval_samples_per_second': 1.53, 'eval_steps_per_second': 0.195, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 7.688073822009756e-08, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 5.343616749087232e-08, 'learning_rate': 1.2e-05, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 6.275025299373738e-08, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 6.25044691560106e-08, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 6.021787157806102e-08, 'learning_rate': 1.5e-05, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8479783535003662, 'eval_runtime': 181.8381, 'eval_samples_per_second': 1.551, 'eval_steps_per_second': 0.198, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 6.345612035829618e-08, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 7.516449329614261e-08, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 5.880025710780501e-08, 'learning_rate': 1.8e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 4.6116142016217054e-08, 'learning_rate': 1.9e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 4.7173806194678036e-08, 'learning_rate': 2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.8505140542984009, 'eval_runtime': 194.7845, 'eval_samples_per_second': 1.448, 'eval_steps_per_second': 0.185, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 5.8848900863495146e-08, 'learning_rate': 2.1e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 4.2525478249899606e-08, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 4.364066441553405e-08, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 5.819409309992807e-08, 'learning_rate': 2.4e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 5.132449842903952e-08, 'learning_rate': 2.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.873225212097168, 'eval_runtime': 215.8787, 'eval_samples_per_second': 1.306, 'eval_steps_per_second': 0.167, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 4.5065725373660825e-08, 'learning_rate': 2.6000000000000002e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 4.522612684354499e-08, 'learning_rate': 2.7000000000000002e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 3.72162425321676e-08, 'learning_rate': 2.8000000000000003e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 4.073272208415801e-08, 'learning_rate': 2.9e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 4.50996324730113e-08, 'learning_rate': 3e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.894948124885559, 'eval_runtime': 168.8364, 'eval_samples_per_second': 1.67, 'eval_steps_per_second': 0.213, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 3.4109941537963095e-08, 'learning_rate': 3.1e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 3.317772723221424e-08, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 2.601725945794442e-08, 'learning_rate': 3.3e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 3.9048764222116006e-08, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 3.728446884565528e-08, 'learning_rate': 3.5e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.925278663635254, 'eval_runtime': 155.1215, 'eval_samples_per_second': 1.818, 'eval_steps_per_second': 0.232, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 3.459825137497319e-08, 'learning_rate': 3.6e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 3.102062251514326e-08, 'learning_rate': 3.7e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 2.8967708232130462e-08, 'learning_rate': 3.8e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 2.7600494334478753e-08, 'learning_rate': 3.9000000000000006e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 3.058356767837722e-08, 'learning_rate': 4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.941567301750183, 'eval_runtime': 205.4642, 'eval_samples_per_second': 1.373, 'eval_steps_per_second': 0.175, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 2.7015996550971977e-08, 'learning_rate': 4.1e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 2.6778394612847478e-08, 'learning_rate': 4.2e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 2.439017698918633e-08, 'learning_rate': 4.3e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 2.9458060879505865e-08, 'learning_rate': 4.4000000000000006e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 2.8314829592090973e-08, 'learning_rate': 4.5e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9589498043060303, 'eval_runtime': 201.4983, 'eval_samples_per_second': 1.4, 'eval_steps_per_second': 0.179, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 2.340891924745847e-08, 'learning_rate': 4.600000000000001e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 2.0805243750032787e-08, 'learning_rate': 4.7e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 2.7061730634159176e-08, 'learning_rate': 4.8e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 2.4566647383039708e-08, 'learning_rate': 4.9e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 2.6139886699638737e-08, 'learning_rate': 5e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9733024835586548, 'eval_runtime': 194.7738, 'eval_samples_per_second': 1.448, 'eval_steps_per_second': 0.185, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 2.0627146213314518e-08, 'learning_rate': 4.75609756097561e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 2.1989599474636634e-08, 'learning_rate': 4.51219512195122e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 2.216827077461403e-08, 'learning_rate': 4.26829268292683e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 1.6627087262577334e-08, 'learning_rate': 4.0243902439024395e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 2.1771343838850044e-08, 'learning_rate': 3.780487804878049e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.982142686843872, 'eval_runtime': 205.9195, 'eval_samples_per_second': 1.369, 'eval_steps_per_second': 0.175, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 3.274946180908955e-08, 'learning_rate': 3.5365853658536584e-05, 'epoch': 3.97}
{'loss': 0.0, 'grad_norm': 1.7935272822455772e-08, 'learning_rate': 3.292682926829269e-05, 'epoch': 4.04}
{'loss': 0.0, 'grad_norm': 1.5691716370724862e-08, 'learning_rate': 3.048780487804878e-05, 'epoch': 4.11}
{'loss': 0.0, 'grad_norm': 2.4963860312254837e-08, 'learning_rate': 2.8048780487804882e-05, 'epoch': 4.18}
{'loss': 0.0, 'grad_norm': 1.6895610244205272e-08, 'learning_rate': 2.5609756097560977e-05, 'epoch': 4.26}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.9853459596633911, 'eval_runtime': 193.6047, 'eval_samples_per_second': 1.457, 'eval_steps_per_second': 0.186, 'epoch': 4.26}
{'loss': 0.0, 'grad_norm': 1.9685312935280308e-08, 'learning_rate': 2.3170731707317075e-05, 'epoch': 4.33}
{'loss': 0.0, 'grad_norm': 1.7162278709292877e-08, 'learning_rate': 2.073170731707317e-05, 'epoch': 4.4}
{'loss': 0.0, 'grad_norm': 1.990465570145261e-08, 'learning_rate': 1.8292682926829268e-05, 'epoch': 4.47}
{'loss': 0.0, 'grad_norm': 2.5703791095565975e-08, 'learning_rate': 1.5853658536585366e-05, 'epoch': 4.54}
{'loss': 0.0, 'grad_norm': 1.942649774377969e-08, 'learning_rate': 1.3414634146341466e-05, 'epoch': 4.61}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.993577480316162, 'eval_runtime': 199.123, 'eval_samples_per_second': 1.416, 'eval_steps_per_second': 0.181, 'epoch': 4.61}
{'loss': 0.0, 'grad_norm': 1.788264469837486e-08, 'learning_rate': 1.0975609756097562e-05, 'epoch': 4.68}
{'loss': 0.0, 'grad_norm': 1.965370088896634e-08, 'learning_rate': 8.53658536585366e-06, 'epoch': 4.75}
{'loss': 0.0, 'grad_norm': 1.7151091213918335e-08, 'learning_rate': 6.0975609756097564e-06, 'epoch': 4.82}
{'loss': 0.0, 'grad_norm': 1.3625512274018092e-08, 'learning_rate': 3.6585365853658537e-06, 'epoch': 4.89}
{'loss': 0.0, 'grad_norm': 1.9188883371157317e-08, 'learning_rate': 1.2195121951219514e-06, 'epoch': 4.96}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0005037784576416, 'eval_runtime': 198.963, 'eval_samples_per_second': 1.417, 'eval_steps_per_second': 0.181, 'epoch': 4.96}


Could not locate the best model at ./results\checkpoint-100\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 14289.3659, 'train_samples_per_second': 0.393, 'train_steps_per_second': 0.049, 'train_loss': 0.0, 'epoch': 5.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 5e-05, Weight decay: 0.02, Epochs: 5, Eval loss: 2.000518560409546




  0%|          | 0/423 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 2.2198586080435234e-08, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 1.8987128314051915e-08, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 2.2166307900306492e-08, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 1.7147565145592125e-08, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 1.4359382127793197e-08, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.000833749771118, 'eval_runtime': 209.7361, 'eval_samples_per_second': 1.345, 'eval_steps_per_second': 0.172, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 1.670398930286865e-08, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 1.631361890019889e-08, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 1.8190723594102565e-08, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 1.9498317627153483e-08, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 2.1104128222759755e-08, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0017120838165283, 'eval_runtime': 182.5184, 'eval_samples_per_second': 1.545, 'eval_steps_per_second': 0.197, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 2.1153153895170362e-08, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 1.453539155704675e-08, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 1.7363655402391487e-08, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 1.7719052891607134e-08, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 1.7243474204065024e-08, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.00329852104187, 'eval_runtime': 184.0274, 'eval_samples_per_second': 1.532, 'eval_steps_per_second': 0.196, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 1.8786437294693314e-08, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 2.300655665976592e-08, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 1.7933070139974916e-08, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 1.4137553350224152e-08, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 1.4846928131362347e-08, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0052597522735596, 'eval_runtime': 187.7989, 'eval_samples_per_second': 1.502, 'eval_steps_per_second': 0.192, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 1.911032754264852e-08, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 1.3783953534129978e-08, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 1.4463441111445263e-08, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 2.0077788320804757e-08, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 1.7943088792549133e-08, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0076541900634766, 'eval_runtime': 185.5656, 'eval_samples_per_second': 1.52, 'eval_steps_per_second': 0.194, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 1.6103257394206594e-08, 'learning_rate': 1.56e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 1.654469805600911e-08, 'learning_rate': 1.62e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 1.3750588223615523e-08, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 1.5601020919575603e-08, 'learning_rate': 1.74e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 1.757956624715007e-08, 'learning_rate': 1.8e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.011016845703125, 'eval_runtime': 174.3902, 'eval_samples_per_second': 1.617, 'eval_steps_per_second': 0.206, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 1.3486087802050406e-08, 'learning_rate': 1.86e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 1.341161492973697e-08, 'learning_rate': 1.9200000000000003e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 1.058768894779405e-08, 'learning_rate': 1.98e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 1.6718921358460648e-08, 'learning_rate': 2.04e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 1.6142500669502624e-08, 'learning_rate': 2.1e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0142977237701416, 'eval_runtime': 150.259, 'eval_samples_per_second': 1.877, 'eval_steps_per_second': 0.24, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 1.531897986239983e-08, 'learning_rate': 2.16e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 1.3955728128678402e-08, 'learning_rate': 2.22e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 1.3316465263812916e-08, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 1.2874831867293324e-08, 'learning_rate': 2.3400000000000003e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 1.4695443084633553e-08, 'learning_rate': 2.4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0185062885284424, 'eval_runtime': 136.7045, 'eval_samples_per_second': 2.063, 'eval_steps_per_second': 0.263, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 1.3161444378795295e-08, 'learning_rate': 2.4599999999999998e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 1.3327309922317454e-08, 'learning_rate': 2.52e-05, 'epoch': 2.98}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 7742.5087, 'train_samples_per_second': 0.436, 'train_steps_per_second': 0.055, 'train_loss': 0.0, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 3e-05, Weight decay: 0.01, Epochs: 3, Eval loss: 2.0202107429504395




  0%|          | 0/564 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 1.7927172635268107e-08, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 1.531204496529881e-08, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 1.7910206651094995e-08, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 1.3818174160462604e-08, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 1.1551623657624077e-08, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0204403400421143, 'eval_runtime': 133.2335, 'eval_samples_per_second': 2.117, 'eval_steps_per_second': 0.27, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 1.3455169423082225e-08, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 1.3135776910644381e-08, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 1.4679147675167314e-08, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 1.5736496550289303e-08, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 1.7065026725049393e-08, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.02101469039917, 'eval_runtime': 132.9376, 'eval_samples_per_second': 2.121, 'eval_steps_per_second': 0.271, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 1.711586072872251e-08, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 1.1719317072333979e-08, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 1.4014989169197634e-08, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 1.4332699471708565e-08, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 1.3942988097426223e-08, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.022254467010498, 'eval_runtime': 140.5685, 'eval_samples_per_second': 2.006, 'eval_steps_per_second': 0.256, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 1.5244376427858697e-08, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 1.8736818319098347e-08, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 1.4561321037831476e-08, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 1.146116002104236e-08, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 1.2064136356570998e-08, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0236055850982666, 'eval_runtime': 137.1508, 'eval_samples_per_second': 2.056, 'eval_steps_per_second': 0.262, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 1.558044537830483e-08, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 1.1201108485181521e-08, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 1.1767838259402197e-08, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 1.6411025427487402e-08, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 1.466020460583195e-08, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0252978801727295, 'eval_runtime': 146.2085, 'eval_samples_per_second': 1.929, 'eval_steps_per_second': 0.246, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 1.3174595636655795e-08, 'learning_rate': 1.56e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 1.3556670452885555e-08, 'learning_rate': 1.62e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 1.125463189310949e-08, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 1.2821034900412087e-08, 'learning_rate': 1.74e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 1.445181219139613e-08, 'learning_rate': 1.8e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0280256271362305, 'eval_runtime': 149.4391, 'eval_samples_per_second': 1.887, 'eval_steps_per_second': 0.241, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 1.108441516350922e-08, 'learning_rate': 1.86e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 1.104126301498809e-08, 'learning_rate': 1.9200000000000003e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 8.700427933661103e-09, 'learning_rate': 1.98e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 1.3848387325765543e-08, 'learning_rate': 2.04e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 1.335652566325507e-08, 'learning_rate': 2.1e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.030514717102051, 'eval_runtime': 146.4599, 'eval_samples_per_second': 1.925, 'eval_steps_per_second': 0.246, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 1.2695667628292995e-08, 'learning_rate': 2.16e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 1.1568003444040187e-08, 'learning_rate': 2.22e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 1.1055263371417823e-08, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 1.068803179293809e-08, 'learning_rate': 2.3400000000000003e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 1.2246541558624813e-08, 'learning_rate': 2.4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.033946990966797, 'eval_runtime': 146.8706, 'eval_samples_per_second': 1.92, 'eval_steps_per_second': 0.245, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 1.0968249419818221e-08, 'learning_rate': 2.4599999999999998e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 1.1128721943975961e-08, 'learning_rate': 2.52e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 1.0356098201214081e-08, 'learning_rate': 2.58e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 1.286512940623652e-08, 'learning_rate': 2.64e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 1.2652033198889967e-08, 'learning_rate': 2.7000000000000002e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0372490882873535, 'eval_runtime': 145.6808, 'eval_samples_per_second': 1.936, 'eval_steps_per_second': 0.247, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 1.0598442123921359e-08, 'learning_rate': 2.7600000000000003e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 9.54026724286905e-09, 'learning_rate': 2.8199999999999998e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 1.2795027259926428e-08, 'learning_rate': 2.88e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 1.1838834801380926e-08, 'learning_rate': 2.94e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 1.2851104180811035e-08, 'learning_rate': 3e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.039954900741577, 'eval_runtime': 149.9503, 'eval_samples_per_second': 1.881, 'eval_steps_per_second': 0.24, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 1.0198585087550782e-08, 'learning_rate': 2.5312500000000002e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 1.1192149429462006e-08, 'learning_rate': 2.0625e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 1.1496772422958657e-08, 'learning_rate': 1.59375e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 8.714348354033064e-09, 'learning_rate': 1.125e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 1.1724155868364505e-08, 'learning_rate': 6.5625e-06, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0417139530181885, 'eval_runtime': 165.0227, 'eval_samples_per_second': 1.709, 'eval_steps_per_second': 0.218, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 1.7924485007370095e-08, 'learning_rate': 1.875e-06, 'epoch': 3.97}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 32710.105, 'train_samples_per_second': 0.137, 'train_steps_per_second': 0.017, 'train_loss': 0.0, 'epoch': 4.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 3e-05, Weight decay: 0.01, Epochs: 4, Eval loss: 2.0415918827056885




  0%|          | 0/705 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 1.371493585367034e-08, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 1.1694127444172864e-08, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 1.3711238366909129e-08, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 1.0542879458341758e-08, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 8.794073913520606e-09, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.041750192642212, 'eval_runtime': 157.2822, 'eval_samples_per_second': 1.793, 'eval_steps_per_second': 0.229, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 1.025930718157042e-08, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 1.0010105633284638e-08, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 1.1215718132007169e-08, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 1.2024495177342942e-08, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 1.3069474391613767e-08, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0420925617218018, 'eval_runtime': 145.4589, 'eval_samples_per_second': 1.939, 'eval_steps_per_second': 0.247, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 1.3117264607842571e-08, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 8.939877282898578e-09, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 1.0704495068125652e-08, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 1.0976120456973604e-08, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 1.0670100358822765e-08, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.043036937713623, 'eval_runtime': 148.2548, 'eval_samples_per_second': 1.902, 'eval_steps_per_second': 0.243, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 1.1715844294712952e-08, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 1.446416231232206e-08, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 1.119604853272449e-08, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 8.792331307461154e-09, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 9.279482959811958e-09, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.043898582458496, 'eval_runtime': 144.603, 'eval_samples_per_second': 1.95, 'eval_steps_per_second': 0.249, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 1.2036755592248483e-08, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 8.612962787424294e-09, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 9.060383554526652e-09, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 1.2705688945402471e-08, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 1.1341407812892612e-08, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0450119972229004, 'eval_runtime': 143.7449, 'eval_samples_per_second': 1.962, 'eval_steps_per_second': 0.25, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 1.0206757217190443e-08, 'learning_rate': 1.56e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 1.0519510595941028e-08, 'learning_rate': 1.62e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 8.718953559139209e-09, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 9.981219406540731e-09, 'learning_rate': 1.74e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 1.1251518827748441e-08, 'learning_rate': 1.8e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.04712176322937, 'eval_runtime': 139.7642, 'eval_samples_per_second': 2.018, 'eval_steps_per_second': 0.258, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 8.625182346122529e-09, 'learning_rate': 1.86e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 8.607425883155884e-09, 'learning_rate': 1.9200000000000003e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 6.7643792789340296e-09, 'learning_rate': 1.98e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 1.0874456890519468e-08, 'learning_rate': 2.04e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 1.0469560329795513e-08, 'learning_rate': 2.1e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0488035678863525, 'eval_runtime': 143.5375, 'eval_samples_per_second': 1.965, 'eval_steps_per_second': 0.251, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 9.969000736020917e-09, 'learning_rate': 2.16e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 9.082627983048042e-09, 'learning_rate': 2.22e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 8.694596154157352e-09, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 8.402351703296063e-09, 'learning_rate': 2.3400000000000003e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 9.672700862495276e-09, 'learning_rate': 2.4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.051415205001831, 'eval_runtime': 191.3203, 'eval_samples_per_second': 1.474, 'eval_steps_per_second': 0.188, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 8.660111738834075e-09, 'learning_rate': 2.4599999999999998e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 8.80662209823413e-09, 'learning_rate': 2.52e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 8.20957168912173e-09, 'learning_rate': 2.58e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 1.023448792381032e-08, 'learning_rate': 2.64e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 1.0087496171706789e-08, 'learning_rate': 2.7000000000000002e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0540010929107666, 'eval_runtime': 142.2949, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 0.253, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 8.44604475247479e-09, 'learning_rate': 2.7600000000000003e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 7.598706552869317e-09, 'learning_rate': 2.8199999999999998e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 1.0239657122212975e-08, 'learning_rate': 2.88e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 9.489840913090575e-09, 'learning_rate': 2.94e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 1.0321661747525468e-08, 'learning_rate': 3e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0558412075042725, 'eval_runtime': 142.153, 'eval_samples_per_second': 1.984, 'eval_steps_per_second': 0.253, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 8.16840639572547e-09, 'learning_rate': 2.8536585365853658e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 8.998283895778059e-09, 'learning_rate': 2.707317073170732e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 9.237972165010433e-09, 'learning_rate': 2.5609756097560977e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 6.973096322582251e-09, 'learning_rate': 2.4146341463414638e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 9.391291300175908e-09, 'learning_rate': 2.2682926829268295e-05, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.057222843170166, 'eval_runtime': 144.1717, 'eval_samples_per_second': 1.956, 'eval_steps_per_second': 0.25, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 1.4306507978290028e-08, 'learning_rate': 2.121951219512195e-05, 'epoch': 3.97}
{'loss': 0.0, 'grad_norm': 7.88744269897279e-09, 'learning_rate': 1.975609756097561e-05, 'epoch': 4.04}
{'loss': 0.0, 'grad_norm': 6.925667594970264e-09, 'learning_rate': 1.8292682926829268e-05, 'epoch': 4.11}
{'loss': 0.0, 'grad_norm': 1.1304345903795365e-08, 'learning_rate': 1.682926829268293e-05, 'epoch': 4.18}
{'loss': 0.0, 'grad_norm': 7.628035980644654e-09, 'learning_rate': 1.5365853658536586e-05, 'epoch': 4.26}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0573930740356445, 'eval_runtime': 143.8877, 'eval_samples_per_second': 1.96, 'eval_steps_per_second': 0.25, 'epoch': 4.26}
{'loss': 0.0, 'grad_norm': 8.9729841334929e-09, 'learning_rate': 1.3902439024390245e-05, 'epoch': 4.33}
{'loss': 0.0, 'grad_norm': 7.86736187308179e-09, 'learning_rate': 1.2439024390243903e-05, 'epoch': 4.4}
{'loss': 0.0, 'grad_norm': 9.283468216381152e-09, 'learning_rate': 1.097560975609756e-05, 'epoch': 4.47}
{'loss': 0.0, 'grad_norm': 1.2062492338316133e-08, 'learning_rate': 9.51219512195122e-06, 'epoch': 4.54}
{'loss': 0.0, 'grad_norm': 9.12128683694391e-09, 'learning_rate': 8.048780487804879e-06, 'epoch': 4.61}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.060394048690796, 'eval_runtime': 144.1727, 'eval_samples_per_second': 1.956, 'eval_steps_per_second': 0.25, 'epoch': 4.61}
{'loss': 0.0, 'grad_norm': 8.348172819694355e-09, 'learning_rate': 6.585365853658537e-06, 'epoch': 4.68}
{'loss': 0.0, 'grad_norm': 9.278877222129722e-09, 'learning_rate': 5.121951219512195e-06, 'epoch': 4.75}
{'loss': 0.0, 'grad_norm': 8.067428503011342e-09, 'learning_rate': 3.6585365853658537e-06, 'epoch': 4.82}
{'loss': 0.0, 'grad_norm': 6.408168218285937e-09, 'learning_rate': 2.195121951219512e-06, 'epoch': 4.89}
{'loss': 0.0, 'grad_norm': 9.09202402255005e-09, 'learning_rate': 7.317073170731708e-07, 'epoch': 4.96}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0642549991607666, 'eval_runtime': 142.4174, 'eval_samples_per_second': 1.98, 'eval_steps_per_second': 0.253, 'epoch': 4.96}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 10135.2147, 'train_samples_per_second': 0.555, 'train_steps_per_second': 0.07, 'train_loss': 0.0, 'epoch': 5.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 3e-05, Weight decay: 0.01, Epochs: 5, Eval loss: 2.0642573833465576




  0%|          | 0/423 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 1.060208365544213e-08, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 9.024501146370767e-09, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 1.0605356592918724e-08, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 8.127850392725122e-09, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 6.765257243301903e-09, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0643529891967773, 'eval_runtime': 142.4678, 'eval_samples_per_second': 1.979, 'eval_steps_per_second': 0.253, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 7.90364396152654e-09, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 7.706885796210372e-09, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 8.656440897425455e-09, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 9.280361368269041e-09, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 1.0108237802342046e-08, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.064572811126709, 'eval_runtime': 143.6396, 'eval_samples_per_second': 1.963, 'eval_steps_per_second': 0.251, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 1.015084816202716e-08, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 6.8864247637634435e-09, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 8.254662731133067e-09, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 8.484419389276354e-09, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 8.241284987775543e-09, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0652694702148438, 'eval_runtime': 142.2911, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 0.253, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 9.085056262847502e-09, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 1.1263006527428843e-08, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 8.683176844215268e-09, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 6.8027943278536895e-09, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 7.197507478906573e-09, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.065821409225464, 'eval_runtime': 151.4328, 'eval_samples_per_second': 1.862, 'eval_steps_per_second': 0.238, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 9.367307818308745e-09, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 6.675584085513719e-09, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 7.029696824645271e-09, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 9.909678411190725e-09, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 8.837019116469946e-09, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0665409564971924, 'eval_runtime': 197.5264, 'eval_samples_per_second': 1.428, 'eval_steps_per_second': 0.182, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 7.962371206815533e-09, 'learning_rate': 1.56e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 8.216848534914334e-09, 'learning_rate': 1.62e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 6.798120288920018e-09, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 7.817875236071359e-09, 'learning_rate': 1.74e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 8.810960849814364e-09, 'learning_rate': 1.8e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.068129539489746, 'eval_runtime': 162.2213, 'eval_samples_per_second': 1.738, 'eval_steps_per_second': 0.222, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 6.7491736643887634e-09, 'learning_rate': 1.86e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 6.746006864233323e-09, 'learning_rate': 1.9200000000000003e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 5.286206139487604e-09, 'learning_rate': 1.98e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 8.580060217866503e-09, 'learning_rate': 2.04e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 8.243521421036348e-09, 'learning_rate': 2.1e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0692150592803955, 'eval_runtime': 154.2122, 'eval_samples_per_second': 1.829, 'eval_steps_per_second': 0.233, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 7.860706752182978e-09, 'learning_rate': 2.16e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 7.15898229586287e-09, 'learning_rate': 2.22e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 6.8626202498478506e-09, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 6.627424831151529e-09, 'learning_rate': 2.3400000000000003e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 7.662925405327314e-09, 'learning_rate': 2.4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0711257457733154, 'eval_runtime': 148.1412, 'eval_samples_per_second': 1.904, 'eval_steps_per_second': 0.243, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 6.856709422464746e-09, 'learning_rate': 2.4599999999999998e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 6.9858034912329e-09, 'learning_rate': 2.52e-05, 'epoch': 2.98}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 6693.5163, 'train_samples_per_second': 0.504, 'train_steps_per_second': 0.063, 'train_loss': 0.0, 'epoch': 3.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 3e-05, Weight decay: 0.015, Epochs: 3, Eval loss: 2.0717782974243164




  0%|          | 0/564 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 9.524132593696777e-09, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 8.101240567270906e-09, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 9.52960377276213e-09, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 7.29309501679154e-09, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 6.065028035351361e-09, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0718557834625244, 'eval_runtime': 142.7398, 'eval_samples_per_second': 1.976, 'eval_steps_per_second': 0.252, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 7.0895351811373075e-09, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 6.911204497583867e-09, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 7.77054065537186e-09, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 8.330261813682682e-09, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 9.081307261737948e-09, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.072028160095215, 'eval_runtime': 142.6403, 'eval_samples_per_second': 1.977, 'eval_steps_per_second': 0.252, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 9.121446709059455e-09, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 6.176008149338941e-09, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 7.406043778246385e-09, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 7.620979403100137e-09, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 7.398513801604167e-09, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0726478099823, 'eval_runtime': 143.2358, 'eval_samples_per_second': 1.969, 'eval_steps_per_second': 0.251, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 8.169178222772189e-09, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 1.0145139839323747e-08, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 7.807649637925351e-09, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 6.110357553268386e-09, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 6.4713625569368105e-09, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.073093891143799, 'eval_runtime': 142.4209, 'eval_samples_per_second': 1.98, 'eval_steps_per_second': 0.253, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 8.435721454702616e-09, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 5.9994689216580355e-09, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 6.3200928934747935e-09, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 8.928075168057603e-09, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 7.958003145347448e-09, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0736823081970215, 'eval_runtime': 142.2901, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 0.253, 'epoch': 1.77}
{'loss': 0.0, 'grad_norm': 7.1734085338448494e-09, 'learning_rate': 1.56e-05, 'epoch': 1.84}
{'loss': 0.0, 'grad_norm': 7.406410595933721e-09, 'learning_rate': 1.62e-05, 'epoch': 1.91}
{'loss': 0.0, 'grad_norm': 6.122271578590244e-09, 'learning_rate': 1.6800000000000002e-05, 'epoch': 1.99}
{'loss': 0.0, 'grad_norm': 7.053851724947435e-09, 'learning_rate': 1.74e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 7.948480096331423e-09, 'learning_rate': 1.8e-05, 'epoch': 2.13}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0751099586486816, 'eval_runtime': 142.2486, 'eval_samples_per_second': 1.982, 'eval_steps_per_second': 0.253, 'epoch': 2.13}
{'loss': 0.0, 'grad_norm': 6.0860965156450675e-09, 'learning_rate': 1.86e-05, 'epoch': 2.2}
{'loss': 0.0, 'grad_norm': 6.086837700536307e-09, 'learning_rate': 1.9200000000000003e-05, 'epoch': 2.27}
{'loss': 0.0, 'grad_norm': 4.763468730573095e-09, 'learning_rate': 1.98e-05, 'epoch': 2.34}
{'loss': 0.0, 'grad_norm': 7.762465337179947e-09, 'learning_rate': 2.04e-05, 'epoch': 2.41}
{'loss': 0.0, 'grad_norm': 7.45063566398585e-09, 'learning_rate': 2.1e-05, 'epoch': 2.48}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.075984001159668, 'eval_runtime': 142.9371, 'eval_samples_per_second': 1.973, 'eval_steps_per_second': 0.252, 'epoch': 2.48}
{'loss': 0.0, 'grad_norm': 7.1083938735228e-09, 'learning_rate': 2.16e-05, 'epoch': 2.55}
{'loss': 0.0, 'grad_norm': 6.472216984576562e-09, 'learning_rate': 2.22e-05, 'epoch': 2.62}
{'loss': 0.0, 'grad_norm': 6.207311997741272e-09, 'learning_rate': 2.2800000000000002e-05, 'epoch': 2.7}
{'loss': 0.0, 'grad_norm': 5.99225735697928e-09, 'learning_rate': 2.3400000000000003e-05, 'epoch': 2.77}
{'loss': 0.0, 'grad_norm': 6.940750196804402e-09, 'learning_rate': 2.4e-05, 'epoch': 2.84}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0776612758636475, 'eval_runtime': 145.5618, 'eval_samples_per_second': 1.937, 'eval_steps_per_second': 0.247, 'epoch': 2.84}
{'loss': 0.0, 'grad_norm': 6.208462188794783e-09, 'learning_rate': 2.4599999999999998e-05, 'epoch': 2.91}
{'loss': 0.0, 'grad_norm': 6.330200807980191e-09, 'learning_rate': 2.52e-05, 'epoch': 2.98}
{'loss': 0.0, 'grad_norm': 5.9125895290890185e-09, 'learning_rate': 2.58e-05, 'epoch': 3.05}
{'loss': 0.0, 'grad_norm': 7.405168034324561e-09, 'learning_rate': 2.64e-05, 'epoch': 3.12}
{'loss': 0.0, 'grad_norm': 7.318221584284856e-09, 'learning_rate': 2.7000000000000002e-05, 'epoch': 3.19}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0794241428375244, 'eval_runtime': 142.3581, 'eval_samples_per_second': 1.981, 'eval_steps_per_second': 0.253, 'epoch': 3.19}
{'loss': 0.0, 'grad_norm': 6.119849071950512e-09, 'learning_rate': 2.7600000000000003e-05, 'epoch': 3.26}
{'loss': 0.0, 'grad_norm': 5.499698918498552e-09, 'learning_rate': 2.8199999999999998e-05, 'epoch': 3.33}
{'loss': 0.0, 'grad_norm': 7.457884088069022e-09, 'learning_rate': 2.88e-05, 'epoch': 3.4}
{'loss': 0.0, 'grad_norm': 6.923507545053553e-09, 'learning_rate': 2.94e-05, 'epoch': 3.48}
{'loss': 0.0, 'grad_norm': 7.547805047636302e-09, 'learning_rate': 3e-05, 'epoch': 3.55}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.080517292022705, 'eval_runtime': 142.5402, 'eval_samples_per_second': 1.978, 'eval_steps_per_second': 0.253, 'epoch': 3.55}
{'loss': 0.0, 'grad_norm': 5.950528070286509e-09, 'learning_rate': 2.5312500000000002e-05, 'epoch': 3.62}
{'loss': 0.0, 'grad_norm': 6.604458757664133e-09, 'learning_rate': 2.0625e-05, 'epoch': 3.69}
{'loss': 0.0, 'grad_norm': 6.802931995508743e-09, 'learning_rate': 1.59375e-05, 'epoch': 3.76}
{'loss': 0.0, 'grad_norm': 5.135140757062118e-09, 'learning_rate': 1.125e-05, 'epoch': 3.83}
{'loss': 0.0, 'grad_norm': 6.976319966156552e-09, 'learning_rate': 6.5625e-06, 'epoch': 3.9}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0811731815338135, 'eval_runtime': 142.1377, 'eval_samples_per_second': 1.984, 'eval_steps_per_second': 0.253, 'epoch': 3.9}
{'loss': 0.0, 'grad_norm': 1.0671140415752234e-08, 'learning_rate': 1.875e-06, 'epoch': 3.97}


Could not locate the best model at ./results\checkpoint-50\pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


{'train_runtime': 7845.8873, 'train_samples_per_second': 0.573, 'train_steps_per_second': 0.072, 'train_loss': 0.0, 'epoch': 4.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Learning rate: 3e-05, Weight decay: 0.015, Epochs: 4, Eval loss: 2.081048011779785




  0%|          | 0/705 [00:00<?, ?it/s]

{'loss': 0.0, 'grad_norm': 8.200583323514365e-09, 'learning_rate': 6.000000000000001e-07, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 6.968742471968881e-09, 'learning_rate': 1.2000000000000002e-06, 'epoch': 0.14}
{'loss': 0.0, 'grad_norm': 8.208494328698634e-09, 'learning_rate': 1.8e-06, 'epoch': 0.21}
{'loss': 0.0, 'grad_norm': 6.269716301687822e-09, 'learning_rate': 2.4000000000000003e-06, 'epoch': 0.28}
{'loss': 0.0, 'grad_norm': 5.2073509948513674e-09, 'learning_rate': 3e-06, 'epoch': 0.35}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0811076164245605, 'eval_runtime': 200.2302, 'eval_samples_per_second': 1.408, 'eval_steps_per_second': 0.18, 'epoch': 0.35}
{'loss': 0.0, 'grad_norm': 6.091704030097844e-09, 'learning_rate': 3.6e-06, 'epoch': 0.43}
{'loss': 0.0, 'grad_norm': 5.936152014385243e-09, 'learning_rate': 4.2000000000000004e-06, 'epoch': 0.5}
{'loss': 0.0, 'grad_norm': 6.68349020571668e-09, 'learning_rate': 4.800000000000001e-06, 'epoch': 0.57}
{'loss': 0.0, 'grad_norm': 7.164423276861953e-09, 'learning_rate': 5.4e-06, 'epoch': 0.64}
{'loss': 0.0, 'grad_norm': 7.819690672761226e-09, 'learning_rate': 6e-06, 'epoch': 0.71}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0812203884124756, 'eval_runtime': 171.679, 'eval_samples_per_second': 1.643, 'eval_steps_per_second': 0.21, 'epoch': 0.71}
{'loss': 0.0, 'grad_norm': 7.856297834507586e-09, 'learning_rate': 6.6e-06, 'epoch': 0.78}
{'loss': 0.0, 'grad_norm': 5.3047655157456575e-09, 'learning_rate': 7.2e-06, 'epoch': 0.85}
{'loss': 0.0, 'grad_norm': 6.364712312745269e-09, 'learning_rate': 7.8e-06, 'epoch': 0.92}
{'loss': 0.0, 'grad_norm': 6.5612835165040906e-09, 'learning_rate': 8.400000000000001e-06, 'epoch': 0.99}
{'loss': 0.0, 'grad_norm': 6.363238380657776e-09, 'learning_rate': 9e-06, 'epoch': 1.06}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.0817503929138184, 'eval_runtime': 176.1963, 'eval_samples_per_second': 1.6, 'eval_steps_per_second': 0.204, 'epoch': 1.06}
{'loss': 0.0, 'grad_norm': 7.0417520703358605e-09, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.13}
{'loss': 0.0, 'grad_norm': 8.766122050474223e-09, 'learning_rate': 1.02e-05, 'epoch': 1.21}
{'loss': 0.0, 'grad_norm': 6.729522716852898e-09, 'learning_rate': 1.08e-05, 'epoch': 1.28}
{'loss': 0.0, 'grad_norm': 5.258500745952688e-09, 'learning_rate': 1.1400000000000001e-05, 'epoch': 1.35}
{'loss': 0.0, 'grad_norm': 5.576621830982731e-09, 'learning_rate': 1.2e-05, 'epoch': 1.42}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 2.082066774368286, 'eval_runtime': 175.065, 'eval_samples_per_second': 1.611, 'eval_steps_per_second': 0.206, 'epoch': 1.42}
{'loss': 0.0, 'grad_norm': 7.286288461472168e-09, 'learning_rate': 1.26e-05, 'epoch': 1.49}
{'loss': 0.0, 'grad_norm': 5.166517880184074e-09, 'learning_rate': 1.32e-05, 'epoch': 1.56}
{'loss': 0.0, 'grad_norm': 5.44517764211605e-09, 'learning_rate': 1.3800000000000002e-05, 'epoch': 1.63}
{'loss': 0.0, 'grad_norm': 7.714430871885725e-09, 'learning_rate': 1.44e-05, 'epoch': 1.7}
{'loss': 0.0, 'grad_norm': 6.871472280067792e-09, 'learning_rate': 1.5e-05, 'epoch': 1.77}


  0%|          | 0/36 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Huấn luyện với tham số tốt nhất
training_args_with_early_stop = TrainingArguments(
    output_dir='./results_with_early_stop',
    num_train_epochs=best_params['num_train_epochs'],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=best_params['weight_decay'],
    logging_dir='./logs_with_early_stop',
    logging_steps=10,
    eval_strategy="steps",  # Đổi từ evaluation_strategy thành eval_strategy
    eval_steps=50,
    save_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,
    learning_rate=best_params['learning_rate'],
)

In [None]:
trainer_with_early_stop = Trainer(
    model=model,
    args=training_args_with_early_stop,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
trainer_with_early_stop.train()

In [None]:
# Dự đoán nhãn cho tập kiểm tra
predictions2 = trainer_with_early_stop.predict(val_dataset)

# Lấy nhãn dự đoán từ logits
pred_labels2 = np.argmax(predictions2.predictions, axis=1)

In [None]:
print(pred_labels2[:])  # In ra dự đoán

In [None]:
# Tính các chỉ số
accuracy2 = accuracy_score(val_labels, pred_labels2)
precision2 = precision_score(val_labels, pred_labels2, pos_label=0)
recall2 = recall_score(val_labels, pred_labels2, pos_label=0)
f12 = f1_score(val_labels, pred_labels2, pos_label=0)
auc2 = roc_auc_score(val_labels, predictions2.predictions[:, 1])

print(f"Accuracy: {accuracy2:.6f}")
print(f"Precision: {precision2:.6f}")
print(f"Recall: {recall2:.6f}")
print(f"F1 Score: {f12:.6f}")
print(f'AUC: {auc2:.6f}')