In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import sys
sys.path.append('../dataset')
import get_funsd
funsd = get_funsd.CustomFunsdDataset('../dataset/')
funsd.split_generators()
funsd

CustomFunsdDataset:
DatasetDict({
    train: Dataset({features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 149}),
    test: Dataset({features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags','line_ids','linkings','image','image_name'], num_rows: 50})
})

In [5]:
label_map = {
    0: 'O',
    1: 'B-HEADER',
    2: 'I-HEADER',
    3: 'B-QUESTION',
    4: 'I-QUESTION',
    5: 'B-ANSWER', 
    6: 'I-ANSWER'}

In [6]:
from datasets import Dataset, DatasetDict

funsd_train_dataset = Dataset.from_dict({
    "id": [entry["id"] for entry in funsd["train"]],
    "tokens": [entry["tokens"] for entry in funsd["train"]],
    "ner_boxes": [entry["ner_boxes"] for entry in funsd["train"]],
    "bboxes": [entry["bboxes"] for entry in funsd["train"]],
    "ner_tags": [entry["ner_tags"] for entry in funsd["train"]],
    "line_ids": [entry["line_ids"] for entry in funsd["train"]],
    "linkings": [entry["linkings"] for entry in funsd["train"]],
    "image": [entry["image"] for entry in funsd["train"]],
    "image_name": [entry["image_name"] for entry in funsd["train"]],
})

funsd_test_dataset = Dataset.from_dict({
    "id": [entry["id"] for entry in funsd["test"]],
    "tokens": [entry["tokens"] for entry in funsd["test"]],
    "ner_boxes": [entry["ner_boxes"] for entry in funsd["test"]],
    "bboxes": [entry["bboxes"] for entry in funsd["test"]],
    "ner_tags": [entry["ner_tags"] for entry in funsd["test"]],
    "line_ids": [entry["line_ids"] for entry in funsd["test"]],
    "linkings": [entry["linkings"] for entry in funsd["test"]],
    "image": [entry["image"] for entry in funsd["test"]],
    "image_name": [entry["image_name"] for entry in funsd["test"]],
})

# Optionally, you can create a DatasetDict if you have train/test splits
dataset = DatasetDict({
    "train": funsd_train_dataset,
    'test': funsd_test_dataset
})


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags', 'line_ids', 'linkings', 'image', 'image_name'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_boxes', 'bboxes', 'ner_tags', 'line_ids', 'linkings', 'image', 'image_name'],
        num_rows: 50
    })
})

In [8]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [9]:
import sys
sys.path.append('../..')
from mytools import unnormalize_box, normalize_box

In [10]:

import numpy as np
def make_dataset(documents):
    images = documents['image']
    
    rgb_images = list(map(lambda img: img.convert('RGB'), images))
  
    words = documents['tokens']
    
    # boxes = documents['bboxes'] ## token boxes normalized below
    
    boxes = documents['ner_boxes']
    
    word_labels = documents['ner_tags']
    
    normalized_boxes = []

    # Normalize the boxes by image width and height
    for i in range(len(boxes)):
        normalized_boxes.append([])
        for j in range(len(boxes[i])):
            normalized_boxes[-1].append(normalize_box(boxes[i][j], images[i].width, images[i].height))

    encoding = processor(rgb_images, words, boxes=normalized_boxes, word_labels=word_labels,
                         truncation=True, padding="max_length", return_offsets_mapping=True)
    
    encoding.pop('offset_mapping')

    return encoding

In [11]:
cols = funsd_train_dataset.column_names

In [12]:
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D

features = Features({
    'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'labels': Sequence(feature=Value(dtype='int64')),
})
train_dataset = dataset["train"].map(
    make_dataset,
    batched=True,
    remove_columns = cols,
    features=features)

test_dataset = dataset["test"].map(
    make_dataset,
    batched=True,
    remove_columns = cols,
    features=features)


Map: 100%|██████████| 149/149 [00:01<00:00, 75.41 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 79.44 examples/s]


In [13]:
train_dataset.set_format("torch")
test_dataset.set_format("torch")

In [14]:
from evaluate import load 
metric = load("seqeval")

In [15]:
label_list = ['O', 'B-HEADER', 'I-HEADER', 'B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'I-ANSWER']


In [16]:
import numpy as np

return_entity_level_metrics = False

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    if return_entity_level_metrics:
        # Unpack nested dictionaries
        final_results = {}
        for key, value in results.items():
            if isinstance(value, dict):
                for n, v in value.items():
                    final_results[f"{key}_{n}"] = v
            else:
                final_results[key] = value
        return final_results
    else:
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

In [17]:
label_map_reversed = {v: k for k, v in label_map.items()} 

In [18]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.data.data_collator import default_data_collator
from transformers import LayoutLMv3ForTokenClassification
from transformers.trainer_callback import TrainerCallback


In [1]:
import pickle, json
with open("optuna_study.pkl", "rb") as f:
    study = pickle.load(f)

with open("tuning.json", "r") as f:
    hyperparameter_dict = json.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
study.best_params

{'dropout': 0.3560592494053517,
 'learning_rate': 3.127610016621582e-05,
 'weight_decay': 0.000817139913231531}

In [4]:
import numpy as np
def find_best_epochs_metrics(hyperparameter_dict):
    best_epochs_data = {}
    
    for trial_num, trial_data in hyperparameter_dict.items():
        metrics_history = trial_data['metrics_history']
        eval_metrics = metrics_history['eval']
        epochs = metrics_history['epoch']
        
        f1_scores = [metrics['f1'] for metrics in eval_metrics]
        best_f1_idx = np.argmax(f1_scores)
        best_f1 = f1_scores[best_f1_idx]
        best_epoch = epochs[best_f1_idx]
        
        best_epochs_data[trial_num] = {
            'best_epoch': best_epoch,
            'best_f1': best_f1,
            'learning_rate': trial_data['hyperparameters']['learning_rate'],
            'best_metrics': eval_metrics[best_f1_idx]
        }
    
    return best_epochs_data
best_epochs_data = find_best_epochs_metrics(hyperparameter_dict)

In [5]:
num_epochs = best_epochs_data['18']['best_epoch']

In [6]:
dropout = study.best_params["dropout"]
learning_rate = study.best_params["learning_rate"]
weight_decay = study.best_params["weight_decay"]
batch_size = 2

In [7]:
print('num epochs:', num_epochs)
print('dropout:', dropout)
print('learning rate:', learning_rate)
print('weight decay:', weight_decay)
print('batch size:', batch_size)

num epochs: 11.0
dropout: 0.3560592494053517
learning rate: 3.127610016621582e-05
weight decay: 0.000817139913231531
batch size: 2


In [25]:
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import default_data_collator
from transformers.trainer_callback import TrainerCallback
import os
import json
import numpy as np


class MetricsTrackingCallback(TrainerCallback):
    def __init__(self):
        self.metrics_history = {
            'train': [],
            'eval': [],
            'epoch': []
        }
        self.current_train_metrics = None
        self.last_logged_epoch = None

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Called after evaluation"""
        if metrics and state.epoch != self.last_logged_epoch:
            # Store evaluation metrics
            eval_metrics = {
                'precision': metrics.get('eval_precision', None),
                'recall': metrics.get('eval_recall', None),
                'f1': metrics.get('eval_f1', None),
                'accuracy': metrics.get('eval_accuracy', None),
                'loss': metrics.get('eval_loss', None)
            }
            
            # Store training metrics that were saved during training
            train_metrics = {}
            if self.current_train_metrics is not None:
                train_metrics = self.current_train_metrics
                self.current_train_metrics = None  # Reset for next epoch
            
            # Append metrics for this epoch
            self.metrics_history['train'].append(train_metrics)
            self.metrics_history['eval'].append(eval_metrics)
            self.metrics_history['epoch'].append(state.epoch)
            self.last_logged_epoch = state.epoch
            
            # Save to file
            save_dir = args.output_dir
            os.makedirs(save_dir, exist_ok=True)
            with open(os.path.join(save_dir, 'metrics_history.json'), 'w') as f:
                json.dump(self.metrics_history, f, indent=2)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Called on each training log"""
        if logs:
            # Check if these are training metrics (not eval metrics)
            if all(not k.startswith('eval_') for k in logs.keys()):
                # Store the most recent training metrics
                self.current_train_metrics = {
                    'loss': logs.get('loss', None),
                    'learning_rate': logs.get('learning_rate', None)
                }


In [26]:
train_dict = {}
for i in range(3):
    
    model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base",
                                                             id2label=label_map,
                                                             label2id=label_map_reversed).to(device)
    
    model.config.hidden_dropout_prob = dropout
    model.config.attention_probs_dropout_prob = dropout

    metrics_callback = MetricsTrackingCallback()


    trial_output_dir = f'./results_ner_boxes/{i}'
    training_args = TrainingArguments(
        output_dir=trial_output_dir,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,  
        weight_decay=weight_decay,
        logging_steps=10,
        evaluation_strategy="epoch",
        metric_for_best_model="eval_f1",
        load_best_model_at_end=True,
        save_strategy="epoch",
        save_total_limit=1,
        
    ) 

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=processor,
        data_collator=default_data_collator,
        compute_metrics=compute_metrics,
        callbacks=[metrics_callback]
    )

    train_result = trainer.train()

    eval_results = trainer.evaluate()

    train_dict[i] = {
    "final_eval_results": eval_results,
    "metrics_history": metrics_callback.metrics_history,
}


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|▏         | 11/825 [00:02<02:35,  5.23it/s]

{'loss': 1.7473, 'grad_norm': 6.216126441955566, 'learning_rate': 3.089699592177684e-05, 'epoch': 0.13}


  3%|▎         | 21/825 [00:04<02:28,  5.41it/s]

{'loss': 1.4874, 'grad_norm': 10.920066833496094, 'learning_rate': 3.051789167733786e-05, 'epoch': 0.27}


  4%|▍         | 31/825 [00:06<02:28,  5.35it/s]

{'loss': 1.374, 'grad_norm': 15.542312622070312, 'learning_rate': 3.013878743289888e-05, 'epoch': 0.4}


  5%|▍         | 41/825 [00:07<02:26,  5.36it/s]

{'loss': 1.1226, 'grad_norm': 10.771389961242676, 'learning_rate': 2.9759683188459904e-05, 'epoch': 0.53}


  6%|▌         | 51/825 [00:09<02:24,  5.34it/s]

{'loss': 0.9091, 'grad_norm': 30.38555908203125, 'learning_rate': 2.9380578944020924e-05, 'epoch': 0.67}


  7%|▋         | 61/825 [00:11<02:25,  5.24it/s]

{'loss': 1.0483, 'grad_norm': 8.336050987243652, 'learning_rate': 2.9001474699581942e-05, 'epoch': 0.8}


  9%|▊         | 71/825 [00:13<02:23,  5.24it/s]

{'loss': 0.8206, 'grad_norm': 18.473365783691406, 'learning_rate': 2.8622370455142963e-05, 'epoch': 0.93}


                                                
  9%|▉         | 75/825 [00:16<02:07,  5.89it/s]

{'eval_loss': 0.749459445476532, 'eval_precision': 0.6380648105887723, 'eval_recall': 0.6944858420268256, 'eval_f1': 0.665080875356803, 'eval_accuracy': 0.7263758469036016, 'eval_runtime': 1.8072, 'eval_samples_per_second': 27.667, 'eval_steps_per_second': 13.833, 'epoch': 1.0}


 10%|▉         | 81/825 [00:20<05:39,  2.19it/s]

{'loss': 0.8789, 'grad_norm': 9.630560874938965, 'learning_rate': 2.8243266210703984e-05, 'epoch': 1.07}


 11%|█         | 91/825 [00:22<02:24,  5.07it/s]

{'loss': 0.6935, 'grad_norm': 26.671409606933594, 'learning_rate': 2.7864161966265e-05, 'epoch': 1.2}


 12%|█▏        | 101/825 [00:24<02:07,  5.67it/s]

{'loss': 0.6931, 'grad_norm': 14.322047233581543, 'learning_rate': 2.7485057721826022e-05, 'epoch': 1.33}


 13%|█▎        | 111/825 [00:26<02:14,  5.29it/s]

{'loss': 0.8027, 'grad_norm': 22.948617935180664, 'learning_rate': 2.7105953477387046e-05, 'epoch': 1.47}


 15%|█▍        | 121/825 [00:27<01:57,  6.01it/s]

{'loss': 0.6139, 'grad_norm': 9.544118881225586, 'learning_rate': 2.6726849232948064e-05, 'epoch': 1.6}


 16%|█▌        | 131/825 [00:29<02:05,  5.52it/s]

{'loss': 0.6162, 'grad_norm': 8.964045524597168, 'learning_rate': 2.6347744988509085e-05, 'epoch': 1.73}


 17%|█▋        | 141/825 [00:31<02:02,  5.58it/s]

{'loss': 0.9347, 'grad_norm': 42.81330108642578, 'learning_rate': 2.5968640744070106e-05, 'epoch': 1.87}


 18%|█▊        | 150/825 [00:33<01:45,  6.43it/s]

{'loss': 0.8239, 'grad_norm': 21.061927795410156, 'learning_rate': 2.5589536499631127e-05, 'epoch': 2.0}


                                                 
 18%|█▊        | 150/825 [00:34<01:45,  6.43it/s]

{'eval_loss': 0.6429162621498108, 'eval_precision': 0.6914612676056338, 'eval_recall': 0.7804272230501739, 'eval_f1': 0.7332555425904317, 'eval_accuracy': 0.7714251753239035, 'eval_runtime': 1.6988, 'eval_samples_per_second': 29.433, 'eval_steps_per_second': 14.717, 'epoch': 2.0}


 20%|█▉        | 161/825 [00:43<02:53,  3.83it/s]

{'loss': 0.5366, 'grad_norm': 11.200016975402832, 'learning_rate': 2.5210432255192144e-05, 'epoch': 2.13}


 21%|██        | 171/825 [00:45<02:00,  5.43it/s]

{'loss': 0.6502, 'grad_norm': 15.204209327697754, 'learning_rate': 2.4831328010753165e-05, 'epoch': 2.27}


 22%|██▏       | 181/825 [00:47<01:55,  5.58it/s]

{'loss': 0.5028, 'grad_norm': 17.780210494995117, 'learning_rate': 2.445222376631419e-05, 'epoch': 2.4}


 23%|██▎       | 191/825 [00:48<01:54,  5.56it/s]

{'loss': 0.4635, 'grad_norm': 23.239036560058594, 'learning_rate': 2.4073119521875207e-05, 'epoch': 2.53}


 24%|██▍       | 201/825 [00:50<01:54,  5.43it/s]

{'loss': 0.6525, 'grad_norm': 76.13591003417969, 'learning_rate': 2.3694015277436228e-05, 'epoch': 2.67}


 26%|██▌       | 211/825 [00:52<01:43,  5.96it/s]

{'loss': 0.5731, 'grad_norm': 9.825047492980957, 'learning_rate': 2.331491103299725e-05, 'epoch': 2.8}


 27%|██▋       | 221/825 [00:54<01:51,  5.40it/s]

{'loss': 0.4317, 'grad_norm': 6.534836292266846, 'learning_rate': 2.2935806788558266e-05, 'epoch': 2.93}


                                                 
 27%|██▋       | 225/825 [00:56<01:45,  5.69it/s]

{'eval_loss': 0.56426602602005, 'eval_precision': 0.7421697684975034, 'eval_recall': 0.812220566318927, 'eval_f1': 0.7756166982922201, 'eval_accuracy': 0.8010222275050517, 'eval_runtime': 1.3862, 'eval_samples_per_second': 36.07, 'eval_steps_per_second': 18.035, 'epoch': 3.0}


 28%|██▊       | 231/825 [01:01<04:33,  2.17it/s]

{'loss': 0.4292, 'grad_norm': 9.167104721069336, 'learning_rate': 2.2556702544119287e-05, 'epoch': 3.07}


 29%|██▉       | 241/825 [01:03<01:46,  5.49it/s]

{'loss': 0.3374, 'grad_norm': 9.167813301086426, 'learning_rate': 2.2177598299680308e-05, 'epoch': 3.2}


 30%|███       | 251/825 [01:05<01:45,  5.42it/s]

{'loss': 0.3059, 'grad_norm': 10.43835735321045, 'learning_rate': 2.179849405524133e-05, 'epoch': 3.33}


 32%|███▏      | 261/825 [01:07<01:47,  5.26it/s]

{'loss': 0.4785, 'grad_norm': 10.505439758300781, 'learning_rate': 2.141938981080235e-05, 'epoch': 3.47}


 33%|███▎      | 271/825 [01:08<01:46,  5.20it/s]

{'loss': 0.4197, 'grad_norm': 11.782747268676758, 'learning_rate': 2.104028556636337e-05, 'epoch': 3.6}


 34%|███▍      | 281/825 [01:10<01:44,  5.22it/s]

{'loss': 0.3668, 'grad_norm': 11.46175765991211, 'learning_rate': 2.066118132192439e-05, 'epoch': 3.73}


 35%|███▌      | 291/825 [01:12<01:41,  5.24it/s]

{'loss': 0.4166, 'grad_norm': 4.392373085021973, 'learning_rate': 2.028207707748541e-05, 'epoch': 3.87}


 36%|███▋      | 300/825 [01:14<01:30,  5.81it/s]

{'loss': 0.4475, 'grad_norm': 54.217952728271484, 'learning_rate': 1.990297283304643e-05, 'epoch': 4.0}


                                                 
 36%|███▋      | 300/825 [01:15<01:30,  5.81it/s]

{'eval_loss': 0.5508626699447632, 'eval_precision': 0.7562528422010004, 'eval_recall': 0.8261301539990065, 'eval_f1': 0.7896486229819563, 'eval_accuracy': 0.8233685962201355, 'eval_runtime': 1.6135, 'eval_samples_per_second': 30.989, 'eval_steps_per_second': 15.495, 'epoch': 4.0}


 38%|███▊      | 311/825 [01:21<01:56,  4.41it/s]

{'loss': 0.2816, 'grad_norm': 4.579873561859131, 'learning_rate': 1.952386858860745e-05, 'epoch': 4.13}


 39%|███▉      | 321/825 [01:23<01:12,  6.96it/s]

{'loss': 0.2696, 'grad_norm': 5.914554119110107, 'learning_rate': 1.914476434416847e-05, 'epoch': 4.27}


 40%|████      | 331/825 [01:24<01:08,  7.16it/s]

{'loss': 0.3129, 'grad_norm': 5.3768696784973145, 'learning_rate': 1.8765660099729493e-05, 'epoch': 4.4}


 41%|████▏     | 341/825 [01:25<01:07,  7.13it/s]

{'loss': 0.3021, 'grad_norm': 6.484789848327637, 'learning_rate': 1.8386555855290514e-05, 'epoch': 4.53}


 43%|████▎     | 351/825 [01:27<01:06,  7.12it/s]

{'loss': 0.3316, 'grad_norm': 11.166986465454102, 'learning_rate': 1.8007451610851534e-05, 'epoch': 4.67}


 44%|████▍     | 361/825 [01:28<01:04,  7.14it/s]

{'loss': 0.3296, 'grad_norm': 30.065248489379883, 'learning_rate': 1.7628347366412552e-05, 'epoch': 4.8}


 45%|████▍     | 371/825 [01:30<01:04,  7.05it/s]

{'loss': 0.1708, 'grad_norm': 5.180712699890137, 'learning_rate': 1.7249243121973573e-05, 'epoch': 4.93}


                                                 
 45%|████▌     | 375/825 [01:31<01:03,  7.14it/s]

{'eval_loss': 0.5980115532875061, 'eval_precision': 0.759493670886076, 'eval_recall': 0.834575260804769, 'eval_f1': 0.7952662721893492, 'eval_accuracy': 0.8113633662189469, 'eval_runtime': 1.3601, 'eval_samples_per_second': 36.762, 'eval_steps_per_second': 18.381, 'epoch': 5.0}


 46%|████▌     | 381/825 [01:37<03:07,  2.37it/s]

{'loss': 0.2801, 'grad_norm': 7.717175006866455, 'learning_rate': 1.6870138877534594e-05, 'epoch': 5.07}


 47%|████▋     | 391/825 [01:38<01:09,  6.21it/s]

{'loss': 0.1851, 'grad_norm': 4.509560585021973, 'learning_rate': 1.649103463309561e-05, 'epoch': 5.2}


 49%|████▊     | 401/825 [01:40<01:00,  7.05it/s]

{'loss': 0.2264, 'grad_norm': 2.745527505874634, 'learning_rate': 1.6111930388656636e-05, 'epoch': 5.33}


 50%|████▉     | 411/825 [01:41<00:57,  7.16it/s]

{'loss': 0.2741, 'grad_norm': 4.099153518676758, 'learning_rate': 1.5732826144217656e-05, 'epoch': 5.47}


 51%|█████     | 421/825 [01:43<00:57,  7.05it/s]

{'loss': 0.2986, 'grad_norm': 10.206860542297363, 'learning_rate': 1.5353721899778674e-05, 'epoch': 5.6}


 52%|█████▏    | 431/825 [01:44<00:55,  7.07it/s]

{'loss': 0.2553, 'grad_norm': 7.017035961151123, 'learning_rate': 1.4974617655339695e-05, 'epoch': 5.73}


 53%|█████▎    | 441/825 [01:45<00:54,  7.02it/s]

{'loss': 0.1592, 'grad_norm': 3.1677913665771484, 'learning_rate': 1.4595513410900716e-05, 'epoch': 5.87}


 55%|█████▍    | 450/825 [01:47<00:52,  7.18it/s]

{'loss': 0.1369, 'grad_norm': 8.538447380065918, 'learning_rate': 1.4216409166461737e-05, 'epoch': 6.0}


                                                 
 55%|█████▍    | 450/825 [01:48<00:52,  7.18it/s]

{'eval_loss': 0.6380410194396973, 'eval_precision': 0.7803171641791045, 'eval_recall': 0.8310978638847492, 'eval_f1': 0.8049073851335098, 'eval_accuracy': 0.8266967787947225, 'eval_runtime': 1.3553, 'eval_samples_per_second': 36.892, 'eval_steps_per_second': 18.446, 'epoch': 6.0}


 56%|█████▌    | 461/825 [01:55<01:19,  4.58it/s]

{'loss': 0.2246, 'grad_norm': 3.343646764755249, 'learning_rate': 1.3837304922022758e-05, 'epoch': 6.13}


 57%|█████▋    | 471/825 [01:56<00:51,  6.94it/s]

{'loss': 0.1501, 'grad_norm': 7.738948822021484, 'learning_rate': 1.3458200677583777e-05, 'epoch': 6.27}


 58%|█████▊    | 481/825 [01:58<00:48,  7.12it/s]

{'loss': 0.1386, 'grad_norm': 3.635836362838745, 'learning_rate': 1.3079096433144796e-05, 'epoch': 6.4}


 60%|█████▉    | 491/825 [01:59<00:46,  7.15it/s]

{'loss': 0.1978, 'grad_norm': 2.716435670852661, 'learning_rate': 1.2699992188705819e-05, 'epoch': 6.53}


 61%|██████    | 501/825 [02:01<00:45,  7.16it/s]

{'loss': 0.1396, 'grad_norm': 2.8245880603790283, 'learning_rate': 1.2320887944266838e-05, 'epoch': 6.67}


 62%|██████▏   | 511/825 [02:02<00:44,  7.13it/s]

{'loss': 0.1478, 'grad_norm': 2.7459089756011963, 'learning_rate': 1.1941783699827859e-05, 'epoch': 6.8}


 63%|██████▎   | 521/825 [02:04<00:42,  7.16it/s]

{'loss': 0.1991, 'grad_norm': 7.859139919281006, 'learning_rate': 1.1562679455388878e-05, 'epoch': 6.93}


                                                 
 64%|██████▎   | 525/825 [02:05<00:42,  7.12it/s]

{'eval_loss': 0.6579369306564331, 'eval_precision': 0.800952380952381, 'eval_recall': 0.8355688027819176, 'eval_f1': 0.8178944809141746, 'eval_accuracy': 0.8297872340425532, 'eval_runtime': 1.3572, 'eval_samples_per_second': 36.842, 'eval_steps_per_second': 18.421, 'epoch': 7.0}


 64%|██████▍   | 531/825 [02:11<02:03,  2.38it/s]

{'loss': 0.1225, 'grad_norm': 8.127373695373535, 'learning_rate': 1.11835752109499e-05, 'epoch': 7.07}


 66%|██████▌   | 541/825 [02:12<00:44,  6.34it/s]

{'loss': 0.0793, 'grad_norm': 2.131065607070923, 'learning_rate': 1.080447096651092e-05, 'epoch': 7.2}


 67%|██████▋   | 551/825 [02:14<00:38,  7.12it/s]

{'loss': 0.2093, 'grad_norm': 8.547724723815918, 'learning_rate': 1.0425366722071939e-05, 'epoch': 7.33}


 68%|██████▊   | 561/825 [02:15<00:37,  7.10it/s]

{'loss': 0.1202, 'grad_norm': 19.979114532470703, 'learning_rate': 1.0046262477632961e-05, 'epoch': 7.47}


 69%|██████▉   | 571/825 [02:17<00:35,  7.12it/s]

{'loss': 0.1557, 'grad_norm': 5.917729377746582, 'learning_rate': 9.66715823319398e-06, 'epoch': 7.6}


 70%|███████   | 581/825 [02:18<00:34,  7.14it/s]

{'loss': 0.108, 'grad_norm': 2.513162612915039, 'learning_rate': 9.288053988755002e-06, 'epoch': 7.73}


 72%|███████▏  | 591/825 [02:19<00:33,  7.03it/s]

{'loss': 0.1383, 'grad_norm': 5.023011207580566, 'learning_rate': 8.90894974431602e-06, 'epoch': 7.87}


 73%|███████▎  | 600/825 [02:21<00:31,  7.16it/s]

{'loss': 0.1112, 'grad_norm': 4.7456769943237305, 'learning_rate': 8.529845499877042e-06, 'epoch': 8.0}


                                                 
 73%|███████▎  | 600/825 [02:22<00:31,  7.16it/s]

{'eval_loss': 0.7088896036148071, 'eval_precision': 0.818889970788705, 'eval_recall': 0.8355688027819176, 'eval_f1': 0.8271453159577082, 'eval_accuracy': 0.8308570070129562, 'eval_runtime': 1.3578, 'eval_samples_per_second': 36.825, 'eval_steps_per_second': 18.412, 'epoch': 8.0}


 74%|███████▍  | 611/825 [02:28<00:49,  4.29it/s]

{'loss': 0.0663, 'grad_norm': 4.619118690490723, 'learning_rate': 8.150741255438063e-06, 'epoch': 8.13}


 75%|███████▌  | 621/825 [02:30<00:29,  6.87it/s]

{'loss': 0.1507, 'grad_norm': 3.326876163482666, 'learning_rate': 7.771637010999082e-06, 'epoch': 8.27}


 76%|███████▋  | 631/825 [02:31<00:27,  7.11it/s]

{'loss': 0.1003, 'grad_norm': 6.20957088470459, 'learning_rate': 7.392532766560103e-06, 'epoch': 8.4}


 78%|███████▊  | 641/825 [02:33<00:26,  7.07it/s]

{'loss': 0.0948, 'grad_norm': 4.514616966247559, 'learning_rate': 7.0134285221211236e-06, 'epoch': 8.53}


 79%|███████▉  | 651/825 [02:34<00:24,  7.09it/s]

{'loss': 0.0666, 'grad_norm': 2.7767739295959473, 'learning_rate': 6.634324277682144e-06, 'epoch': 8.67}


 80%|████████  | 661/825 [02:36<00:23,  7.07it/s]

{'loss': 0.1163, 'grad_norm': 16.719585418701172, 'learning_rate': 6.2552200332431645e-06, 'epoch': 8.8}


 81%|████████▏ | 671/825 [02:37<00:21,  7.10it/s]

{'loss': 0.0842, 'grad_norm': 10.414656639099121, 'learning_rate': 5.876115788804184e-06, 'epoch': 8.93}


                                                 
 82%|████████▏ | 675/825 [02:39<00:21,  7.08it/s]

{'eval_loss': 0.7542171478271484, 'eval_precision': 0.804642166344294, 'eval_recall': 0.8266269249875807, 'eval_f1': 0.8154864003920608, 'eval_accuracy': 0.8169499583977178, 'eval_runtime': 1.3702, 'eval_samples_per_second': 36.492, 'eval_steps_per_second': 18.246, 'epoch': 9.0}


 83%|████████▎ | 681/825 [02:44<01:00,  2.37it/s]

{'loss': 0.0581, 'grad_norm': 4.421509742736816, 'learning_rate': 5.497011544365205e-06, 'epoch': 9.07}


 84%|████████▍ | 691/825 [02:46<00:21,  6.31it/s]

{'loss': 0.0737, 'grad_norm': 8.67169189453125, 'learning_rate': 5.117907299926225e-06, 'epoch': 9.2}


 85%|████████▍ | 701/825 [02:47<00:17,  7.01it/s]

{'loss': 0.0751, 'grad_norm': 4.616166591644287, 'learning_rate': 4.7388030554872456e-06, 'epoch': 9.33}


 86%|████████▌ | 711/825 [02:49<00:15,  7.14it/s]

{'loss': 0.0536, 'grad_norm': 1.2980848550796509, 'learning_rate': 4.3596988110482665e-06, 'epoch': 9.47}


 87%|████████▋ | 721/825 [02:50<00:14,  7.10it/s]

{'loss': 0.1016, 'grad_norm': 2.8045432567596436, 'learning_rate': 3.980594566609286e-06, 'epoch': 9.6}


 89%|████████▊ | 731/825 [02:51<00:13,  7.08it/s]

{'loss': 0.0308, 'grad_norm': 1.6199864149093628, 'learning_rate': 3.6014903221703066e-06, 'epoch': 9.73}


 90%|████████▉ | 741/825 [02:53<00:11,  7.06it/s]

{'loss': 0.0666, 'grad_norm': 0.27619460225105286, 'learning_rate': 3.222386077731327e-06, 'epoch': 9.87}


 91%|█████████ | 750/825 [02:54<00:10,  7.10it/s]

{'loss': 0.1367, 'grad_norm': 1.1674712896347046, 'learning_rate': 2.8432818332923475e-06, 'epoch': 10.0}


                                                 
 91%|█████████ | 750/825 [02:56<00:10,  7.10it/s]

{'eval_loss': 0.7657732963562012, 'eval_precision': 0.8219844357976653, 'eval_recall': 0.8395429706905116, 'eval_f1': 0.8306709265175718, 'eval_accuracy': 0.8357304172114585, 'eval_runtime': 1.4121, 'eval_samples_per_second': 35.408, 'eval_steps_per_second': 17.704, 'epoch': 10.0}


 92%|█████████▏| 761/825 [03:00<00:11,  5.72it/s]

{'loss': 0.0424, 'grad_norm': 1.7362689971923828, 'learning_rate': 2.4641775888533676e-06, 'epoch': 10.13}


 93%|█████████▎| 771/825 [03:01<00:07,  7.07it/s]

{'loss': 0.093, 'grad_norm': 33.917354583740234, 'learning_rate': 2.085073344414388e-06, 'epoch': 10.27}


 95%|█████████▍| 781/825 [03:02<00:06,  7.02it/s]

{'loss': 0.0519, 'grad_norm': 3.1751019954681396, 'learning_rate': 1.7059690999754083e-06, 'epoch': 10.4}


 96%|█████████▌| 791/825 [03:04<00:04,  7.04it/s]

{'loss': 0.0723, 'grad_norm': 5.1900315284729, 'learning_rate': 1.3268648555364288e-06, 'epoch': 10.53}


 97%|█████████▋| 801/825 [03:05<00:03,  7.01it/s]

{'loss': 0.0401, 'grad_norm': 2.068547487258911, 'learning_rate': 9.477606110974491e-07, 'epoch': 10.67}


 98%|█████████▊| 811/825 [03:07<00:01,  7.08it/s]

{'loss': 0.0777, 'grad_norm': 11.544830322265625, 'learning_rate': 5.686563666584694e-07, 'epoch': 10.8}


100%|█████████▉| 821/825 [03:08<00:00,  7.04it/s]

{'loss': 0.0988, 'grad_norm': 7.013841152191162, 'learning_rate': 1.8955212221948982e-07, 'epoch': 10.93}


                                                 
100%|██████████| 825/825 [03:15<00:00,  7.08it/s]

{'eval_loss': 0.7702839374542236, 'eval_precision': 0.8171677982541222, 'eval_recall': 0.8370591157476404, 'eval_f1': 0.8269938650306747, 'eval_accuracy': 0.8338285985974088, 'eval_runtime': 1.6912, 'eval_samples_per_second': 29.565, 'eval_steps_per_second': 14.783, 'epoch': 11.0}


100%|██████████| 825/825 [03:18<00:00,  7.08it/s]

{'train_runtime': 198.1535, 'train_samples_per_second': 8.271, 'train_steps_per_second': 4.163, 'train_loss': 0.35651122595324664, 'epoch': 11.0}


100%|██████████| 825/825 [03:18<00:00,  4.16it/s]
100%|██████████| 25/25 [00:01<00:00, 18.94it/s]
Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|▏         | 11/825 [00:01<01:56,  6.98it/s]

{'loss': 1.7956, 'grad_norm': 6.340710163116455, 'learning_rate': 3.089699592177684e-05, 'epoch': 0.13}


  3%|▎         | 21/825 [00:02<01:53,  7.07it/s]

{'loss': 1.5226, 'grad_norm': 10.833175659179688, 'learning_rate': 3.051789167733786e-05, 'epoch': 0.27}


  4%|▍         | 31/825 [00:04<01:51,  7.12it/s]

{'loss': 1.3117, 'grad_norm': 13.469727516174316, 'learning_rate': 3.013878743289888e-05, 'epoch': 0.4}


  5%|▍         | 41/825 [00:05<01:50,  7.09it/s]

{'loss': 1.1306, 'grad_norm': 14.142316818237305, 'learning_rate': 2.9759683188459904e-05, 'epoch': 0.53}


  6%|▌         | 51/825 [00:07<01:48,  7.13it/s]

{'loss': 0.9139, 'grad_norm': 19.90774154663086, 'learning_rate': 2.9380578944020924e-05, 'epoch': 0.67}


  7%|▋         | 61/825 [00:08<01:49,  6.98it/s]

{'loss': 1.0223, 'grad_norm': 8.752481460571289, 'learning_rate': 2.9001474699581942e-05, 'epoch': 0.8}


  9%|▊         | 71/825 [00:10<01:46,  7.10it/s]

{'loss': 0.8556, 'grad_norm': 15.54957103729248, 'learning_rate': 2.8622370455142963e-05, 'epoch': 0.93}


  9%|▉         | 74/825 [00:10<01:45,  7.10it/s]
  9%|▉         | 75/825 [00:11<01:45,  7.10it/s]

{'eval_loss': 0.6818825602531433, 'eval_precision': 0.6762656758012077, 'eval_recall': 0.7232985593641331, 'eval_f1': 0.6989918386941911, 'eval_accuracy': 0.7660763104718887, 'eval_runtime': 1.3812, 'eval_samples_per_second': 36.2, 'eval_steps_per_second': 18.1, 'epoch': 1.0}


 10%|▉         | 81/825 [00:15<04:01,  3.08it/s]

{'loss': 0.8206, 'grad_norm': 7.767667770385742, 'learning_rate': 2.8243266210703984e-05, 'epoch': 1.07}


 11%|█         | 91/825 [00:16<01:48,  6.78it/s]

{'loss': 0.6882, 'grad_norm': 21.864675521850586, 'learning_rate': 2.7864161966265e-05, 'epoch': 1.2}


 12%|█▏        | 101/825 [00:18<01:41,  7.10it/s]

{'loss': 0.6764, 'grad_norm': 18.261329650878906, 'learning_rate': 2.7485057721826022e-05, 'epoch': 1.33}


 13%|█▎        | 111/825 [00:19<01:41,  7.05it/s]

{'loss': 0.8199, 'grad_norm': 47.8232536315918, 'learning_rate': 2.7105953477387046e-05, 'epoch': 1.47}


 15%|█▍        | 121/825 [00:20<01:39,  7.07it/s]

{'loss': 0.6048, 'grad_norm': 7.229106426239014, 'learning_rate': 2.6726849232948064e-05, 'epoch': 1.6}


 16%|█▌        | 131/825 [00:22<01:37,  7.09it/s]

{'loss': 0.6122, 'grad_norm': 8.96817684173584, 'learning_rate': 2.6347744988509085e-05, 'epoch': 1.73}


 17%|█▋        | 141/825 [00:23<01:37,  7.05it/s]

{'loss': 0.9099, 'grad_norm': 23.001527786254883, 'learning_rate': 2.5968640744070106e-05, 'epoch': 1.87}


 18%|█▊        | 150/825 [00:24<01:35,  7.08it/s]

{'loss': 0.7601, 'grad_norm': 31.88058853149414, 'learning_rate': 2.5589536499631127e-05, 'epoch': 2.0}



 18%|█▊        | 150/825 [00:26<01:35,  7.08it/s]

{'eval_loss': 0.663290798664093, 'eval_precision': 0.6758739749676306, 'eval_recall': 0.7779433681073026, 'eval_f1': 0.723325635103926, 'eval_accuracy': 0.7598953999762272, 'eval_runtime': 1.3674, 'eval_samples_per_second': 36.566, 'eval_steps_per_second': 18.283, 'epoch': 2.0}


 20%|█▉        | 161/825 [00:33<02:36,  4.25it/s]

{'loss': 0.5929, 'grad_norm': 7.7160820960998535, 'learning_rate': 2.5210432255192144e-05, 'epoch': 2.13}


 21%|██        | 171/825 [00:35<01:34,  6.95it/s]

{'loss': 0.5971, 'grad_norm': 17.127140045166016, 'learning_rate': 2.4831328010753165e-05, 'epoch': 2.27}


 22%|██▏       | 181/825 [00:36<01:31,  7.05it/s]

{'loss': 0.5365, 'grad_norm': 16.018226623535156, 'learning_rate': 2.445222376631419e-05, 'epoch': 2.4}


 23%|██▎       | 191/825 [00:37<01:30,  7.03it/s]

{'loss': 0.531, 'grad_norm': 64.59288024902344, 'learning_rate': 2.4073119521875207e-05, 'epoch': 2.53}


 24%|██▍       | 201/825 [00:39<01:27,  7.10it/s]

{'loss': 0.593, 'grad_norm': 33.638431549072266, 'learning_rate': 2.3694015277436228e-05, 'epoch': 2.67}


 26%|██▌       | 211/825 [00:40<01:26,  7.07it/s]

{'loss': 0.5247, 'grad_norm': 10.299402236938477, 'learning_rate': 2.331491103299725e-05, 'epoch': 2.8}


 27%|██▋       | 221/825 [00:42<01:25,  7.06it/s]

{'loss': 0.4636, 'grad_norm': 7.707093238830566, 'learning_rate': 2.2935806788558266e-05, 'epoch': 2.93}


 27%|██▋       | 224/825 [00:42<01:25,  7.05it/s]
 27%|██▋       | 225/825 [00:44<01:25,  7.05it/s]

{'eval_loss': 0.5569959878921509, 'eval_precision': 0.7144736842105263, 'eval_recall': 0.8092399403874814, 'eval_f1': 0.7589098532494758, 'eval_accuracy': 0.8043504100796387, 'eval_runtime': 1.3752, 'eval_samples_per_second': 36.359, 'eval_steps_per_second': 18.18, 'epoch': 3.0}


 28%|██▊       | 231/825 [00:48<04:00,  2.47it/s]

{'loss': 0.4867, 'grad_norm': 13.769940376281738, 'learning_rate': 2.2556702544119287e-05, 'epoch': 3.07}


 29%|██▉       | 241/825 [00:50<01:32,  6.32it/s]

{'loss': 0.3434, 'grad_norm': 7.278059005737305, 'learning_rate': 2.2177598299680308e-05, 'epoch': 3.2}


 30%|███       | 251/825 [00:52<01:21,  7.05it/s]

{'loss': 0.3094, 'grad_norm': 7.791276931762695, 'learning_rate': 2.179849405524133e-05, 'epoch': 3.33}


 32%|███▏      | 261/825 [00:53<01:19,  7.14it/s]

{'loss': 0.4526, 'grad_norm': 19.18781852722168, 'learning_rate': 2.141938981080235e-05, 'epoch': 3.47}


 33%|███▎      | 271/825 [00:54<01:18,  7.06it/s]

{'loss': 0.4364, 'grad_norm': 16.029272079467773, 'learning_rate': 2.104028556636337e-05, 'epoch': 3.6}


 34%|███▍      | 281/825 [00:56<01:17,  7.06it/s]

{'loss': 0.3676, 'grad_norm': 6.017010688781738, 'learning_rate': 2.066118132192439e-05, 'epoch': 3.73}


 35%|███▌      | 291/825 [00:57<01:14,  7.12it/s]

{'loss': 0.391, 'grad_norm': 17.62451171875, 'learning_rate': 2.028207707748541e-05, 'epoch': 3.87}


 36%|███▋      | 300/825 [00:58<01:15,  7.00it/s]

{'loss': 0.5513, 'grad_norm': 60.81452178955078, 'learning_rate': 1.990297283304643e-05, 'epoch': 4.0}



 36%|███▋      | 300/825 [01:00<01:15,  7.00it/s]

{'eval_loss': 0.5987088680267334, 'eval_precision': 0.7605764760576476, 'eval_recall': 0.8127173373075013, 'eval_f1': 0.7857829010566763, 'eval_accuracy': 0.8173065493878522, 'eval_runtime': 1.374, 'eval_samples_per_second': 36.391, 'eval_steps_per_second': 18.196, 'epoch': 4.0}


 38%|███▊      | 311/825 [01:06<02:00,  4.28it/s]

{'loss': 0.3177, 'grad_norm': 4.639071941375732, 'learning_rate': 1.952386858860745e-05, 'epoch': 4.13}


 39%|███▉      | 321/825 [01:08<01:12,  6.95it/s]

{'loss': 0.2724, 'grad_norm': 11.283742904663086, 'learning_rate': 1.914476434416847e-05, 'epoch': 4.27}


 40%|████      | 331/825 [01:09<01:09,  7.12it/s]

{'loss': 0.3224, 'grad_norm': 8.983238220214844, 'learning_rate': 1.8765660099729493e-05, 'epoch': 4.4}


 41%|████▏     | 341/825 [01:11<01:08,  7.10it/s]

{'loss': 0.2857, 'grad_norm': 6.594318389892578, 'learning_rate': 1.8386555855290514e-05, 'epoch': 4.53}


 43%|████▎     | 351/825 [01:12<01:07,  7.04it/s]

{'loss': 0.3462, 'grad_norm': 18.252330780029297, 'learning_rate': 1.8007451610851534e-05, 'epoch': 4.67}


 44%|████▍     | 361/825 [01:13<01:05,  7.03it/s]

{'loss': 0.3071, 'grad_norm': 15.481406211853027, 'learning_rate': 1.7628347366412552e-05, 'epoch': 4.8}


 45%|████▍     | 371/825 [01:15<01:04,  7.06it/s]

{'loss': 0.1774, 'grad_norm': 5.420370578765869, 'learning_rate': 1.7249243121973573e-05, 'epoch': 4.93}


 45%|████▌     | 374/825 [01:15<01:03,  7.06it/s]
 45%|████▌     | 375/825 [01:17<01:03,  7.06it/s]

{'eval_loss': 0.6187559366226196, 'eval_precision': 0.7677419354838709, 'eval_recall': 0.8276204669647292, 'eval_f1': 0.7965574946210853, 'eval_accuracy': 0.825032687507429, 'eval_runtime': 1.3888, 'eval_samples_per_second': 36.002, 'eval_steps_per_second': 18.001, 'epoch': 5.0}


 46%|████▌     | 381/825 [01:22<03:02,  2.43it/s]

{'loss': 0.2619, 'grad_norm': 6.489009857177734, 'learning_rate': 1.6870138877534594e-05, 'epoch': 5.07}


 47%|████▋     | 391/825 [01:23<01:10,  6.20it/s]

{'loss': 0.225, 'grad_norm': 8.528050422668457, 'learning_rate': 1.649103463309561e-05, 'epoch': 5.2}


 49%|████▊     | 401/825 [01:25<01:00,  7.05it/s]

{'loss': 0.2419, 'grad_norm': 3.073837995529175, 'learning_rate': 1.6111930388656636e-05, 'epoch': 5.33}


 50%|████▉     | 411/825 [01:26<00:58,  7.05it/s]

{'loss': 0.2938, 'grad_norm': 7.777092933654785, 'learning_rate': 1.5732826144217656e-05, 'epoch': 5.47}


 51%|█████     | 421/825 [01:28<00:57,  7.01it/s]

{'loss': 0.2494, 'grad_norm': 6.944087505340576, 'learning_rate': 1.5353721899778674e-05, 'epoch': 5.6}


 52%|█████▏    | 431/825 [01:29<00:55,  7.10it/s]

{'loss': 0.3845, 'grad_norm': 6.792850494384766, 'learning_rate': 1.4974617655339695e-05, 'epoch': 5.73}


 53%|█████▎    | 441/825 [01:31<00:54,  7.05it/s]

{'loss': 0.1815, 'grad_norm': 9.840289115905762, 'learning_rate': 1.4595513410900716e-05, 'epoch': 5.87}


 55%|█████▍    | 450/825 [01:32<00:52,  7.14it/s]

{'loss': 0.1455, 'grad_norm': 26.663469314575195, 'learning_rate': 1.4216409166461737e-05, 'epoch': 6.0}



 55%|█████▍    | 450/825 [01:33<00:52,  7.14it/s]

{'eval_loss': 0.5957151055335999, 'eval_precision': 0.792003807710614, 'eval_recall': 0.8266269249875807, 'eval_f1': 0.8089450656295576, 'eval_accuracy': 0.826102460477832, 'eval_runtime': 1.3822, 'eval_samples_per_second': 36.175, 'eval_steps_per_second': 18.087, 'epoch': 6.0}


 56%|█████▌    | 461/825 [01:40<01:25,  4.28it/s]

{'loss': 0.2157, 'grad_norm': 4.9340715408325195, 'learning_rate': 1.3837304922022758e-05, 'epoch': 6.13}


 57%|█████▋    | 471/825 [01:41<00:52,  6.77it/s]

{'loss': 0.17, 'grad_norm': 3.3305470943450928, 'learning_rate': 1.3458200677583777e-05, 'epoch': 6.27}


 58%|█████▊    | 481/825 [01:43<00:48,  7.04it/s]

{'loss': 0.1188, 'grad_norm': 15.34871768951416, 'learning_rate': 1.3079096433144796e-05, 'epoch': 6.4}


 60%|█████▉    | 491/825 [01:44<00:47,  6.99it/s]

{'loss': 0.1962, 'grad_norm': 3.574669361114502, 'learning_rate': 1.2699992188705819e-05, 'epoch': 6.53}


 61%|██████    | 501/825 [01:46<00:45,  7.08it/s]

{'loss': 0.1464, 'grad_norm': 5.894938945770264, 'learning_rate': 1.2320887944266838e-05, 'epoch': 6.67}


 62%|██████▏   | 511/825 [01:47<00:44,  7.07it/s]

{'loss': 0.1852, 'grad_norm': 4.1127495765686035, 'learning_rate': 1.1941783699827859e-05, 'epoch': 6.8}


 63%|██████▎   | 521/825 [01:48<00:43,  7.05it/s]

{'loss': 0.2143, 'grad_norm': 5.5514631271362305, 'learning_rate': 1.1562679455388878e-05, 'epoch': 6.93}


 64%|██████▎   | 524/825 [01:49<00:42,  7.06it/s]
 64%|██████▎   | 525/825 [01:50<00:42,  7.06it/s]

{'eval_loss': 0.6002989411354065, 'eval_precision': 0.7962518020182604, 'eval_recall': 0.8231495280675608, 'eval_f1': 0.809477283829995, 'eval_accuracy': 0.8334720076072745, 'eval_runtime': 1.3812, 'eval_samples_per_second': 36.201, 'eval_steps_per_second': 18.1, 'epoch': 7.0}


 64%|██████▍   | 531/825 [01:55<02:00,  2.45it/s]

{'loss': 0.1197, 'grad_norm': 6.6215009689331055, 'learning_rate': 1.11835752109499e-05, 'epoch': 7.07}


 66%|██████▌   | 541/825 [01:57<00:45,  6.22it/s]

{'loss': 0.0924, 'grad_norm': 1.1800543069839478, 'learning_rate': 1.080447096651092e-05, 'epoch': 7.2}


 67%|██████▋   | 551/825 [01:58<00:39,  6.99it/s]

{'loss': 0.1738, 'grad_norm': 12.399444580078125, 'learning_rate': 1.0425366722071939e-05, 'epoch': 7.33}


 68%|██████▊   | 561/825 [02:00<00:37,  7.00it/s]

{'loss': 0.1004, 'grad_norm': 2.6239938735961914, 'learning_rate': 1.0046262477632961e-05, 'epoch': 7.47}


 69%|██████▉   | 571/825 [02:01<00:35,  7.11it/s]

{'loss': 0.1597, 'grad_norm': 4.320742607116699, 'learning_rate': 9.66715823319398e-06, 'epoch': 7.6}


 70%|███████   | 581/825 [02:03<00:34,  7.10it/s]

{'loss': 0.1283, 'grad_norm': 3.1916210651397705, 'learning_rate': 9.288053988755002e-06, 'epoch': 7.73}


 72%|███████▏  | 591/825 [02:04<00:33,  6.99it/s]

{'loss': 0.1496, 'grad_norm': 7.3555707931518555, 'learning_rate': 8.90894974431602e-06, 'epoch': 7.87}


 73%|███████▎  | 600/825 [02:05<00:32,  7.01it/s]

{'loss': 0.147, 'grad_norm': 2.2331552505493164, 'learning_rate': 8.529845499877042e-06, 'epoch': 8.0}



 73%|███████▎  | 600/825 [02:07<00:32,  7.01it/s]

{'eval_loss': 0.6824684143066406, 'eval_precision': 0.7971360381861575, 'eval_recall': 0.8296075509190264, 'eval_f1': 0.813047711781889, 'eval_accuracy': 0.8291929157256627, 'eval_runtime': 1.3865, 'eval_samples_per_second': 36.063, 'eval_steps_per_second': 18.032, 'epoch': 8.0}


 74%|███████▍  | 611/825 [02:13<00:48,  4.41it/s]

{'loss': 0.0692, 'grad_norm': 1.782378077507019, 'learning_rate': 8.150741255438063e-06, 'epoch': 8.13}


 75%|███████▌  | 621/825 [02:14<00:32,  6.34it/s]

{'loss': 0.1416, 'grad_norm': 2.5008299350738525, 'learning_rate': 7.771637010999082e-06, 'epoch': 8.27}


 76%|███████▋  | 631/825 [02:16<00:29,  6.68it/s]

{'loss': 0.1084, 'grad_norm': 1.1041879653930664, 'learning_rate': 7.392532766560103e-06, 'epoch': 8.4}


 78%|███████▊  | 641/825 [02:17<00:28,  6.40it/s]

{'loss': 0.1163, 'grad_norm': 16.211931228637695, 'learning_rate': 7.0134285221211236e-06, 'epoch': 8.53}


 79%|███████▉  | 651/825 [02:19<00:26,  6.56it/s]

{'loss': 0.0831, 'grad_norm': 19.7936954498291, 'learning_rate': 6.634324277682144e-06, 'epoch': 8.67}


 80%|████████  | 661/825 [02:20<00:23,  6.95it/s]

{'loss': 0.1094, 'grad_norm': 13.801299095153809, 'learning_rate': 6.2552200332431645e-06, 'epoch': 8.8}


 81%|████████▏ | 671/825 [02:22<00:25,  5.99it/s]

{'loss': 0.0881, 'grad_norm': 4.140436172485352, 'learning_rate': 5.876115788804184e-06, 'epoch': 8.93}


 82%|████████▏ | 674/825 [02:22<00:24,  6.10it/s]
 82%|████████▏ | 675/825 [02:24<00:24,  6.10it/s]

{'eval_loss': 0.7031681537628174, 'eval_precision': 0.8036750483558994, 'eval_recall': 0.8256333830104322, 'eval_f1': 0.8145062484685127, 'eval_accuracy': 0.8181385950314989, 'eval_runtime': 1.4345, 'eval_samples_per_second': 34.855, 'eval_steps_per_second': 17.427, 'epoch': 9.0}


 83%|████████▎ | 681/825 [02:29<01:04,  2.23it/s]

{'loss': 0.0623, 'grad_norm': 5.0649495124816895, 'learning_rate': 5.497011544365205e-06, 'epoch': 9.07}


 84%|████████▍ | 691/825 [02:31<00:26,  4.99it/s]

{'loss': 0.0781, 'grad_norm': 4.386407852172852, 'learning_rate': 5.117907299926225e-06, 'epoch': 9.2}


 85%|████████▍ | 701/825 [02:33<00:24,  5.15it/s]

{'loss': 0.0812, 'grad_norm': 4.421472549438477, 'learning_rate': 4.7388030554872456e-06, 'epoch': 9.33}


 86%|████████▌ | 711/825 [02:35<00:21,  5.24it/s]

{'loss': 0.0672, 'grad_norm': 6.519797325134277, 'learning_rate': 4.3596988110482665e-06, 'epoch': 9.47}


 87%|████████▋ | 721/825 [02:37<00:19,  5.21it/s]

{'loss': 0.09, 'grad_norm': 1.9671381711959839, 'learning_rate': 3.980594566609286e-06, 'epoch': 9.6}


 89%|████████▊ | 731/825 [02:39<00:17,  5.23it/s]

{'loss': 0.0477, 'grad_norm': 2.6072967052459717, 'learning_rate': 3.6014903221703066e-06, 'epoch': 9.73}


 90%|████████▉ | 741/825 [02:41<00:16,  5.20it/s]

{'loss': 0.0501, 'grad_norm': 0.8739238977432251, 'learning_rate': 3.222386077731327e-06, 'epoch': 9.87}


 91%|█████████ | 750/825 [02:42<00:12,  5.92it/s]

{'loss': 0.1161, 'grad_norm': 3.9137332439422607, 'learning_rate': 2.8432818332923475e-06, 'epoch': 10.0}



 91%|█████████ | 750/825 [02:44<00:12,  5.92it/s]

{'eval_loss': 0.7403074502944946, 'eval_precision': 0.7990521327014218, 'eval_recall': 0.8375558867362146, 'eval_f1': 0.8178510793111812, 'eval_accuracy': 0.8170688220610959, 'eval_runtime': 1.7942, 'eval_samples_per_second': 27.868, 'eval_steps_per_second': 13.934, 'epoch': 10.0}


 92%|█████████▏| 761/825 [02:51<00:15,  4.10it/s]

{'loss': 0.0465, 'grad_norm': 3.1159701347351074, 'learning_rate': 2.4641775888533676e-06, 'epoch': 10.13}


 93%|█████████▎| 771/825 [02:53<00:10,  5.21it/s]

{'loss': 0.0554, 'grad_norm': 17.372501373291016, 'learning_rate': 2.085073344414388e-06, 'epoch': 10.27}


 95%|█████████▍| 781/825 [02:55<00:08,  5.21it/s]

{'loss': 0.0506, 'grad_norm': 1.1418272256851196, 'learning_rate': 1.7059690999754083e-06, 'epoch': 10.4}


 96%|█████████▌| 791/825 [02:56<00:06,  5.17it/s]

{'loss': 0.0684, 'grad_norm': 7.749805450439453, 'learning_rate': 1.3268648555364288e-06, 'epoch': 10.53}


 97%|█████████▋| 801/825 [02:58<00:04,  5.15it/s]

{'loss': 0.0537, 'grad_norm': 3.5037436485290527, 'learning_rate': 9.477606110974491e-07, 'epoch': 10.67}


 98%|█████████▊| 811/825 [03:00<00:02,  5.28it/s]

{'loss': 0.0632, 'grad_norm': 5.054827690124512, 'learning_rate': 5.686563666584694e-07, 'epoch': 10.8}


100%|█████████▉| 821/825 [03:02<00:00,  5.15it/s]

{'loss': 0.1126, 'grad_norm': 4.87497615814209, 'learning_rate': 1.8955212221948982e-07, 'epoch': 10.93}



100%|██████████| 825/825 [03:09<00:00,  5.90it/s]

{'eval_loss': 0.7498010396957397, 'eval_precision': 0.8072289156626506, 'eval_recall': 0.8320914058618977, 'eval_f1': 0.8194716242661448, 'eval_accuracy': 0.8253892784975633, 'eval_runtime': 1.7924, 'eval_samples_per_second': 27.895, 'eval_steps_per_second': 13.948, 'epoch': 11.0}


100%|██████████| 825/825 [03:13<00:00,  4.27it/s]


{'train_runtime': 193.2153, 'train_samples_per_second': 8.483, 'train_steps_per_second': 4.27, 'train_loss': 0.36046468420462174, 'epoch': 11.0}


100%|██████████| 25/25 [00:01<00:00, 14.28it/s]
Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|▏         | 11/825 [00:02<02:37,  5.16it/s]

{'loss': 1.7956, 'grad_norm': 6.340710163116455, 'learning_rate': 3.089699592177684e-05, 'epoch': 0.13}


  3%|▎         | 21/825 [00:04<02:29,  5.39it/s]

{'loss': 1.5226, 'grad_norm': 10.833175659179688, 'learning_rate': 3.051789167733786e-05, 'epoch': 0.27}


  4%|▍         | 31/825 [00:05<02:10,  6.09it/s]

{'loss': 1.3117, 'grad_norm': 13.469727516174316, 'learning_rate': 3.013878743289888e-05, 'epoch': 0.4}


  5%|▍         | 41/825 [00:07<01:53,  6.88it/s]

{'loss': 1.1306, 'grad_norm': 14.142316818237305, 'learning_rate': 2.9759683188459904e-05, 'epoch': 0.53}


  6%|▌         | 51/825 [00:08<01:52,  6.90it/s]

{'loss': 0.9139, 'grad_norm': 19.90774154663086, 'learning_rate': 2.9380578944020924e-05, 'epoch': 0.67}


  7%|▋         | 61/825 [00:10<01:48,  7.05it/s]

{'loss': 1.0223, 'grad_norm': 8.752481460571289, 'learning_rate': 2.9001474699581942e-05, 'epoch': 0.8}


  9%|▊         | 71/825 [00:11<01:59,  6.30it/s]

{'loss': 0.8556, 'grad_norm': 15.54957103729248, 'learning_rate': 2.8622370455142963e-05, 'epoch': 0.93}


  9%|▉         | 74/825 [00:12<01:52,  6.70it/s]
  9%|▉         | 75/825 [00:13<01:51,  6.70it/s]

{'eval_loss': 0.6818825602531433, 'eval_precision': 0.6762656758012077, 'eval_recall': 0.7232985593641331, 'eval_f1': 0.6989918386941911, 'eval_accuracy': 0.7660763104718887, 'eval_runtime': 1.6989, 'eval_samples_per_second': 29.43, 'eval_steps_per_second': 14.715, 'epoch': 1.0}


 10%|▉         | 81/825 [00:19<05:59,  2.07it/s]

{'loss': 0.8206, 'grad_norm': 7.767667770385742, 'learning_rate': 2.8243266210703984e-05, 'epoch': 1.07}


 11%|█         | 91/825 [00:21<02:18,  5.29it/s]

{'loss': 0.6882, 'grad_norm': 21.864675521850586, 'learning_rate': 2.7864161966265e-05, 'epoch': 1.2}


 12%|█▏        | 101/825 [00:22<01:44,  6.94it/s]

{'loss': 0.6764, 'grad_norm': 18.261329650878906, 'learning_rate': 2.7485057721826022e-05, 'epoch': 1.33}


 13%|█▎        | 111/825 [00:24<01:44,  6.82it/s]

{'loss': 0.8199, 'grad_norm': 47.8232536315918, 'learning_rate': 2.7105953477387046e-05, 'epoch': 1.47}


 15%|█▍        | 121/825 [00:25<01:40,  7.02it/s]

{'loss': 0.6048, 'grad_norm': 7.229106426239014, 'learning_rate': 2.6726849232948064e-05, 'epoch': 1.6}


 16%|█▌        | 131/825 [00:27<01:37,  7.11it/s]

{'loss': 0.6122, 'grad_norm': 8.96817684173584, 'learning_rate': 2.6347744988509085e-05, 'epoch': 1.73}


 17%|█▋        | 141/825 [00:28<01:38,  6.93it/s]

{'loss': 0.9099, 'grad_norm': 23.001527786254883, 'learning_rate': 2.5968640744070106e-05, 'epoch': 1.87}


 18%|█▊        | 150/825 [00:29<01:35,  7.08it/s]

{'loss': 0.7601, 'grad_norm': 31.88058853149414, 'learning_rate': 2.5589536499631127e-05, 'epoch': 2.0}



 18%|█▊        | 150/825 [00:31<01:35,  7.08it/s]

{'eval_loss': 0.663290798664093, 'eval_precision': 0.6758739749676306, 'eval_recall': 0.7779433681073026, 'eval_f1': 0.723325635103926, 'eval_accuracy': 0.7598953999762272, 'eval_runtime': 1.3859, 'eval_samples_per_second': 36.078, 'eval_steps_per_second': 18.039, 'epoch': 2.0}


 20%|█▉        | 161/825 [00:37<02:36,  4.26it/s]

{'loss': 0.5929, 'grad_norm': 7.7160820960998535, 'learning_rate': 2.5210432255192144e-05, 'epoch': 2.13}


 21%|██        | 171/825 [00:38<02:06,  5.17it/s]

{'loss': 0.5971, 'grad_norm': 17.127140045166016, 'learning_rate': 2.4831328010753165e-05, 'epoch': 2.27}


 22%|██▏       | 181/825 [00:40<02:03,  5.20it/s]

{'loss': 0.5365, 'grad_norm': 16.018226623535156, 'learning_rate': 2.445222376631419e-05, 'epoch': 2.4}


 23%|██▎       | 191/825 [00:42<02:01,  5.22it/s]

{'loss': 0.531, 'grad_norm': 64.59288024902344, 'learning_rate': 2.4073119521875207e-05, 'epoch': 2.53}


 24%|██▍       | 201/825 [00:44<01:59,  5.23it/s]

{'loss': 0.593, 'grad_norm': 33.638431549072266, 'learning_rate': 2.3694015277436228e-05, 'epoch': 2.67}


 26%|██▌       | 211/825 [00:46<01:59,  5.15it/s]

{'loss': 0.5247, 'grad_norm': 10.299402236938477, 'learning_rate': 2.331491103299725e-05, 'epoch': 2.8}


 27%|██▋       | 221/825 [00:48<01:56,  5.19it/s]

{'loss': 0.4636, 'grad_norm': 7.707093238830566, 'learning_rate': 2.2935806788558266e-05, 'epoch': 2.93}


 27%|██▋       | 225/825 [00:49<01:41,  5.89it/s]
 27%|██▋       | 225/825 [00:51<01:41,  5.89it/s]

{'eval_loss': 0.5569959878921509, 'eval_precision': 0.7144736842105263, 'eval_recall': 0.8092399403874814, 'eval_f1': 0.7589098532494758, 'eval_accuracy': 0.8043504100796387, 'eval_runtime': 1.8181, 'eval_samples_per_second': 27.502, 'eval_steps_per_second': 13.751, 'epoch': 3.0}


 28%|██▊       | 231/825 [00:56<04:56,  2.00it/s]

{'loss': 0.4867, 'grad_norm': 13.769940376281738, 'learning_rate': 2.2556702544119287e-05, 'epoch': 3.07}


 29%|██▉       | 241/825 [00:58<01:49,  5.35it/s]

{'loss': 0.3434, 'grad_norm': 7.278059005737305, 'learning_rate': 2.2177598299680308e-05, 'epoch': 3.2}


 30%|███       | 251/825 [01:00<01:39,  5.78it/s]

{'loss': 0.3094, 'grad_norm': 7.791276931762695, 'learning_rate': 2.179849405524133e-05, 'epoch': 3.33}


 32%|███▏      | 261/825 [01:02<01:49,  5.16it/s]

{'loss': 0.4526, 'grad_norm': 19.18781852722168, 'learning_rate': 2.141938981080235e-05, 'epoch': 3.47}


 33%|███▎      | 271/825 [01:03<01:47,  5.16it/s]

{'loss': 0.4364, 'grad_norm': 16.029272079467773, 'learning_rate': 2.104028556636337e-05, 'epoch': 3.6}


 34%|███▍      | 281/825 [01:05<01:46,  5.10it/s]

{'loss': 0.3676, 'grad_norm': 6.017010688781738, 'learning_rate': 2.066118132192439e-05, 'epoch': 3.73}


 35%|███▌      | 291/825 [01:07<01:35,  5.57it/s]

{'loss': 0.391, 'grad_norm': 17.62451171875, 'learning_rate': 2.028207707748541e-05, 'epoch': 3.87}


 36%|███▋      | 300/825 [01:09<01:28,  5.90it/s]

{'loss': 0.5513, 'grad_norm': 60.81452178955078, 'learning_rate': 1.990297283304643e-05, 'epoch': 4.0}



 36%|███▋      | 300/825 [01:11<01:28,  5.90it/s]

{'eval_loss': 0.5987088680267334, 'eval_precision': 0.7605764760576476, 'eval_recall': 0.8127173373075013, 'eval_f1': 0.7857829010566763, 'eval_accuracy': 0.8173065493878522, 'eval_runtime': 1.8074, 'eval_samples_per_second': 27.664, 'eval_steps_per_second': 13.832, 'epoch': 4.0}


 38%|███▊      | 311/825 [01:17<02:06,  4.06it/s]

{'loss': 0.3177, 'grad_norm': 4.639071941375732, 'learning_rate': 1.952386858860745e-05, 'epoch': 4.13}


 39%|███▉      | 321/825 [01:19<01:36,  5.20it/s]

{'loss': 0.2724, 'grad_norm': 11.283742904663086, 'learning_rate': 1.914476434416847e-05, 'epoch': 4.27}


 40%|████      | 331/825 [01:21<01:35,  5.16it/s]

{'loss': 0.3224, 'grad_norm': 8.983238220214844, 'learning_rate': 1.8765660099729493e-05, 'epoch': 4.4}


 41%|████▏     | 341/825 [01:23<01:33,  5.18it/s]

{'loss': 0.2857, 'grad_norm': 6.594318389892578, 'learning_rate': 1.8386555855290514e-05, 'epoch': 4.53}


 43%|████▎     | 351/825 [01:25<01:32,  5.14it/s]

{'loss': 0.3462, 'grad_norm': 18.252330780029297, 'learning_rate': 1.8007451610851534e-05, 'epoch': 4.67}


 44%|████▍     | 361/825 [01:27<01:31,  5.07it/s]

{'loss': 0.3071, 'grad_norm': 15.481406211853027, 'learning_rate': 1.7628347366412552e-05, 'epoch': 4.8}


 45%|████▍     | 371/825 [01:29<01:29,  5.07it/s]

{'loss': 0.1774, 'grad_norm': 5.420370578765869, 'learning_rate': 1.7249243121973573e-05, 'epoch': 4.93}


 45%|████▌     | 375/825 [01:30<01:17,  5.81it/s]
 45%|████▌     | 375/825 [01:31<01:17,  5.81it/s]

{'eval_loss': 0.6187559366226196, 'eval_precision': 0.7677419354838709, 'eval_recall': 0.8276204669647292, 'eval_f1': 0.7965574946210853, 'eval_accuracy': 0.825032687507429, 'eval_runtime': 1.8293, 'eval_samples_per_second': 27.332, 'eval_steps_per_second': 13.666, 'epoch': 5.0}


 46%|████▌     | 381/825 [01:37<03:44,  1.98it/s]

{'loss': 0.2619, 'grad_norm': 6.489009857177734, 'learning_rate': 1.6870138877534594e-05, 'epoch': 5.07}


 47%|████▋     | 391/825 [01:39<01:27,  4.98it/s]

{'loss': 0.225, 'grad_norm': 8.528050422668457, 'learning_rate': 1.649103463309561e-05, 'epoch': 5.2}


 49%|████▊     | 401/825 [01:41<01:22,  5.16it/s]

{'loss': 0.2419, 'grad_norm': 3.073837995529175, 'learning_rate': 1.6111930388656636e-05, 'epoch': 5.33}


 50%|████▉     | 411/825 [01:43<01:22,  5.03it/s]

{'loss': 0.2938, 'grad_norm': 7.777092933654785, 'learning_rate': 1.5732826144217656e-05, 'epoch': 5.47}


 51%|█████     | 421/825 [01:45<01:18,  5.15it/s]

{'loss': 0.2494, 'grad_norm': 6.944087505340576, 'learning_rate': 1.5353721899778674e-05, 'epoch': 5.6}


 52%|█████▏    | 431/825 [01:47<01:16,  5.12it/s]

{'loss': 0.3845, 'grad_norm': 6.792850494384766, 'learning_rate': 1.4974617655339695e-05, 'epoch': 5.73}


 53%|█████▎    | 441/825 [01:49<01:14,  5.14it/s]

{'loss': 0.1815, 'grad_norm': 9.840289115905762, 'learning_rate': 1.4595513410900716e-05, 'epoch': 5.87}


 55%|█████▍    | 450/825 [01:50<01:03,  5.93it/s]

{'loss': 0.1455, 'grad_norm': 26.663469314575195, 'learning_rate': 1.4216409166461737e-05, 'epoch': 6.0}



 55%|█████▍    | 450/825 [01:52<01:03,  5.93it/s]

{'eval_loss': 0.5957151055335999, 'eval_precision': 0.792003807710614, 'eval_recall': 0.8266269249875807, 'eval_f1': 0.8089450656295576, 'eval_accuracy': 0.826102460477832, 'eval_runtime': 1.7935, 'eval_samples_per_second': 27.878, 'eval_steps_per_second': 13.939, 'epoch': 6.0}


 56%|█████▌    | 461/825 [01:58<01:25,  4.25it/s]

{'loss': 0.2157, 'grad_norm': 4.9340715408325195, 'learning_rate': 1.3837304922022758e-05, 'epoch': 6.13}


 57%|█████▋    | 471/825 [02:00<01:10,  5.04it/s]

{'loss': 0.17, 'grad_norm': 3.3305470943450928, 'learning_rate': 1.3458200677583777e-05, 'epoch': 6.27}


 58%|█████▊    | 481/825 [02:02<01:06,  5.13it/s]

{'loss': 0.1188, 'grad_norm': 15.34871768951416, 'learning_rate': 1.3079096433144796e-05, 'epoch': 6.4}


 60%|█████▉    | 491/825 [02:04<01:04,  5.21it/s]

{'loss': 0.1962, 'grad_norm': 3.574669361114502, 'learning_rate': 1.2699992188705819e-05, 'epoch': 6.53}


 61%|██████    | 501/825 [02:06<01:02,  5.22it/s]

{'loss': 0.1464, 'grad_norm': 5.894938945770264, 'learning_rate': 1.2320887944266838e-05, 'epoch': 6.67}


 62%|██████▏   | 511/825 [02:08<00:56,  5.53it/s]

{'loss': 0.1852, 'grad_norm': 4.1127495765686035, 'learning_rate': 1.1941783699827859e-05, 'epoch': 6.8}


 63%|██████▎   | 521/825 [02:09<00:55,  5.45it/s]

{'loss': 0.2143, 'grad_norm': 5.5514631271362305, 'learning_rate': 1.1562679455388878e-05, 'epoch': 6.93}


 64%|██████▎   | 525/825 [02:10<00:49,  6.02it/s]
 64%|██████▎   | 525/825 [02:12<00:49,  6.02it/s]

{'eval_loss': 0.6002989411354065, 'eval_precision': 0.7962518020182604, 'eval_recall': 0.8231495280675608, 'eval_f1': 0.809477283829995, 'eval_accuracy': 0.8334720076072745, 'eval_runtime': 1.8099, 'eval_samples_per_second': 27.626, 'eval_steps_per_second': 13.813, 'epoch': 7.0}


 64%|██████▍   | 531/825 [02:17<02:27,  1.99it/s]

{'loss': 0.1197, 'grad_norm': 6.6215009689331055, 'learning_rate': 1.11835752109499e-05, 'epoch': 7.07}


 66%|██████▌   | 541/825 [02:19<00:57,  4.95it/s]

{'loss': 0.0924, 'grad_norm': 1.1800543069839478, 'learning_rate': 1.080447096651092e-05, 'epoch': 7.2}


 67%|██████▋   | 551/825 [02:21<00:53,  5.14it/s]

{'loss': 0.1738, 'grad_norm': 12.399444580078125, 'learning_rate': 1.0425366722071939e-05, 'epoch': 7.33}


 68%|██████▊   | 561/825 [02:23<00:50,  5.22it/s]

{'loss': 0.1004, 'grad_norm': 2.6239938735961914, 'learning_rate': 1.0046262477632961e-05, 'epoch': 7.47}


 69%|██████▉   | 571/825 [02:25<00:48,  5.20it/s]

{'loss': 0.1597, 'grad_norm': 4.320742607116699, 'learning_rate': 9.66715823319398e-06, 'epoch': 7.6}


 70%|███████   | 581/825 [02:27<00:46,  5.23it/s]

{'loss': 0.1283, 'grad_norm': 3.1916210651397705, 'learning_rate': 9.288053988755002e-06, 'epoch': 7.73}


 72%|███████▏  | 591/825 [02:29<00:46,  5.07it/s]

{'loss': 0.1496, 'grad_norm': 7.3555707931518555, 'learning_rate': 8.90894974431602e-06, 'epoch': 7.87}


 73%|███████▎  | 600/825 [02:31<00:37,  5.94it/s]

{'loss': 0.147, 'grad_norm': 2.2331552505493164, 'learning_rate': 8.529845499877042e-06, 'epoch': 8.0}



 73%|███████▎  | 600/825 [02:33<00:37,  5.94it/s]

{'eval_loss': 0.6824684143066406, 'eval_precision': 0.7971360381861575, 'eval_recall': 0.8296075509190264, 'eval_f1': 0.813047711781889, 'eval_accuracy': 0.8291929157256627, 'eval_runtime': 1.7942, 'eval_samples_per_second': 27.868, 'eval_steps_per_second': 13.934, 'epoch': 8.0}


 74%|███████▍  | 611/825 [02:39<00:52,  4.08it/s]

{'loss': 0.0692, 'grad_norm': 1.782378077507019, 'learning_rate': 8.150741255438063e-06, 'epoch': 8.13}


 75%|███████▌  | 621/825 [02:41<00:39,  5.12it/s]

{'loss': 0.1416, 'grad_norm': 2.5008299350738525, 'learning_rate': 7.771637010999082e-06, 'epoch': 8.27}


 76%|███████▋  | 631/825 [02:43<00:37,  5.20it/s]

{'loss': 0.1084, 'grad_norm': 1.1041879653930664, 'learning_rate': 7.392532766560103e-06, 'epoch': 8.4}


 78%|███████▊  | 641/825 [02:45<00:35,  5.24it/s]

{'loss': 0.1163, 'grad_norm': 16.211931228637695, 'learning_rate': 7.0134285221211236e-06, 'epoch': 8.53}


 79%|███████▉  | 651/825 [02:47<00:34,  5.11it/s]

{'loss': 0.0831, 'grad_norm': 19.7936954498291, 'learning_rate': 6.634324277682144e-06, 'epoch': 8.67}


 80%|████████  | 661/825 [02:48<00:31,  5.20it/s]

{'loss': 0.1094, 'grad_norm': 13.801299095153809, 'learning_rate': 6.2552200332431645e-06, 'epoch': 8.8}


 81%|████████▏ | 671/825 [02:50<00:29,  5.26it/s]

{'loss': 0.0881, 'grad_norm': 4.140436172485352, 'learning_rate': 5.876115788804184e-06, 'epoch': 8.93}


 82%|████████▏ | 675/825 [02:51<00:25,  5.89it/s]
 82%|████████▏ | 675/825 [02:53<00:25,  5.89it/s]

{'eval_loss': 0.7031681537628174, 'eval_precision': 0.8036750483558994, 'eval_recall': 0.8256333830104322, 'eval_f1': 0.8145062484685127, 'eval_accuracy': 0.8181385950314989, 'eval_runtime': 1.8042, 'eval_samples_per_second': 27.713, 'eval_steps_per_second': 13.857, 'epoch': 9.0}


 83%|████████▎ | 681/825 [02:58<01:11,  2.00it/s]

{'loss': 0.0623, 'grad_norm': 5.0649495124816895, 'learning_rate': 5.497011544365205e-06, 'epoch': 9.07}


 84%|████████▍ | 691/825 [03:00<00:26,  4.98it/s]

{'loss': 0.0781, 'grad_norm': 4.386407852172852, 'learning_rate': 5.117907299926225e-06, 'epoch': 9.2}


 85%|████████▍ | 701/825 [03:02<00:23,  5.22it/s]

{'loss': 0.0812, 'grad_norm': 4.421472549438477, 'learning_rate': 4.7388030554872456e-06, 'epoch': 9.33}


 86%|████████▌ | 711/825 [03:04<00:22,  5.15it/s]

{'loss': 0.0672, 'grad_norm': 6.519797325134277, 'learning_rate': 4.3596988110482665e-06, 'epoch': 9.47}


 87%|████████▋ | 721/825 [03:06<00:19,  5.24it/s]

{'loss': 0.09, 'grad_norm': 1.9671381711959839, 'learning_rate': 3.980594566609286e-06, 'epoch': 9.6}


 89%|████████▊ | 731/825 [03:08<00:18,  5.21it/s]

{'loss': 0.0477, 'grad_norm': 2.6072967052459717, 'learning_rate': 3.6014903221703066e-06, 'epoch': 9.73}


 90%|████████▉ | 741/825 [03:10<00:16,  5.23it/s]

{'loss': 0.0501, 'grad_norm': 0.8739238977432251, 'learning_rate': 3.222386077731327e-06, 'epoch': 9.87}


 91%|█████████ | 750/825 [03:12<00:12,  5.90it/s]

{'loss': 0.1161, 'grad_norm': 3.9137332439422607, 'learning_rate': 2.8432818332923475e-06, 'epoch': 10.0}



 91%|█████████ | 750/825 [03:13<00:12,  5.90it/s]

{'eval_loss': 0.7403074502944946, 'eval_precision': 0.7990521327014218, 'eval_recall': 0.8375558867362146, 'eval_f1': 0.8178510793111812, 'eval_accuracy': 0.8170688220610959, 'eval_runtime': 1.7967, 'eval_samples_per_second': 27.828, 'eval_steps_per_second': 13.914, 'epoch': 10.0}


 92%|█████████▏| 761/825 [03:20<00:15,  4.12it/s]

{'loss': 0.0465, 'grad_norm': 3.1159701347351074, 'learning_rate': 2.4641775888533676e-06, 'epoch': 10.13}


 93%|█████████▎| 771/825 [03:21<00:09,  5.71it/s]

{'loss': 0.0554, 'grad_norm': 17.372501373291016, 'learning_rate': 2.085073344414388e-06, 'epoch': 10.27}


 95%|█████████▍| 781/825 [03:23<00:08,  5.26it/s]

{'loss': 0.0506, 'grad_norm': 1.1418272256851196, 'learning_rate': 1.7059690999754083e-06, 'epoch': 10.4}


 96%|█████████▌| 791/825 [03:25<00:06,  5.07it/s]

{'loss': 0.0684, 'grad_norm': 7.749805450439453, 'learning_rate': 1.3268648555364288e-06, 'epoch': 10.53}


 97%|█████████▋| 801/825 [03:27<00:04,  5.23it/s]

{'loss': 0.0537, 'grad_norm': 3.5037436485290527, 'learning_rate': 9.477606110974491e-07, 'epoch': 10.67}


 98%|█████████▊| 811/825 [03:29<00:02,  5.20it/s]

{'loss': 0.0632, 'grad_norm': 5.054827690124512, 'learning_rate': 5.686563666584694e-07, 'epoch': 10.8}


100%|█████████▉| 821/825 [03:31<00:00,  5.10it/s]

{'loss': 0.1126, 'grad_norm': 4.87497615814209, 'learning_rate': 1.8955212221948982e-07, 'epoch': 10.93}



100%|██████████| 825/825 [03:38<00:00,  5.91it/s]

{'eval_loss': 0.7498010396957397, 'eval_precision': 0.8072289156626506, 'eval_recall': 0.8320914058618977, 'eval_f1': 0.8194716242661448, 'eval_accuracy': 0.8253892784975633, 'eval_runtime': 1.7889, 'eval_samples_per_second': 27.949, 'eval_steps_per_second': 13.975, 'epoch': 11.0}


100%|██████████| 825/825 [03:42<00:00,  3.71it/s]


{'train_runtime': 222.1989, 'train_samples_per_second': 7.376, 'train_steps_per_second': 3.713, 'train_loss': 0.36046468420462174, 'epoch': 11.0}


100%|██████████| 25/25 [00:01<00:00, 14.30it/s]


In [28]:
with open("3ser_lm3_ner_boxes_hptune_models_results.json", "w") as json_file:
    json.dump(train_dict, json_file)