## Aspect Term Extraction (ATE): Training and Fine-Tuning Large Language Models on German Hospital Reviews Using the Special OB Tagging Scheme (Labels `O` and `B-ASPECT`)


In [1]:
import torch
import os

import spacy
import ast  # To safely evaluate strings as Python objects

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import evaluate

from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# We need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from functions.ate_model_train_OB import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment check: report whether PyTorch can see a CUDA device, which CUDA
# version this PyTorch build targets, and the name of device 0 (if any).
cuda_ok = torch.cuda.is_available()
print("Is CUDA available:", cuda_ok)
print("CUDA version:", torch.version.cuda)
if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
else:
    # Avoid querying a device name when no CUDA device is available.
    gpu_name = "No GPU"
print("GPU device name:", gpu_name)

Is CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA A30


In [3]:
# Read both review CSVs into DataFrames in one pass:
#   data     <- patient_review_labels_absa.csv
#   data_ano <- patient_review_labels_absa_ano.csv
# NOTE(review): "_ano" presumably marks an anonymised variant of the same
# data set - confirm against the data documentation.
data, data_ano = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/hospitalABSA/patient_review_labels_absa.csv",
        "./data/hospitalABSA/patient_review_labels_absa_ano.csv",
    )
)

In [4]:
# Hugging Face checkpoint identifiers to fine-tune, one per line.
# The order matters: the training loops iterate over this list in sequence.
models = [
    "google-bert/bert-base-german-cased",
    "dbmdz/bert-base-german-cased",
    "dbmdz/bert-base-german-uncased",
    "FacebookAI/xlm-roberta-base",
    "TUM/GottBERT_base_best",
    "TUM/GottBERT_filtered_base_best",
    "TUM/GottBERT_base_last",
    "distilbert/distilbert-base-german-cased",
    "GerMedBERT/medbert-512",
    "deepset/gbert-base",
]

### 1. Train standard ATE models for 5, 6, 7, 8, 10, and 12 epochs

In [5]:
# Sweep over every checkpoint in `models`: announce it, then hand it to
# ate_model together with the review DataFrame. rn1/rn2 are the two random
# seeds (both 42) and the run uses 5 epochs, as echoed in the recorded output.
# Hardware of the recorded run - GPU: NVIDIA GeForce RTX 2080 Ti
for model in models:
    print(f'training and results for {model}:')
    ate_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=5,
    )
    print()  # blank line between consecutive model reports

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4140.65 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3653.54 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.132847,0.822581,0.781609,0.801572
2,0.192500,0.156996,0.83871,0.796935,0.817289
3,0.082300,0.21795,0.829268,0.781609,0.804734
4,0.024000,0.244573,0.821012,0.808429,0.814672
5,0.012200,0.271916,0.822394,0.816092,0.819231


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3443.90 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.80      0.84       323

   micro avg       0.88      0.80      0.84       323
   macro avg       0.88      0.80      0.84       323
weighted avg       0.88      0.80      0.84       323

Precision Score: 0.8809523809523809
Recall Score: 0.8018575851393189
F1 Score: 0.8395461912479741
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4517.03 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3705.51 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.141073,0.832618,0.76378,0.796715
2,0.203200,0.158556,0.849785,0.779528,0.813142
3,0.095900,0.204931,0.849593,0.822835,0.836
4,0.037700,0.25093,0.833333,0.807087,0.82
5,0.020800,0.290485,0.836735,0.807087,0.821643


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3702.33 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.74      0.81       315

   micro avg       0.91      0.74      0.81       315
   macro avg       0.91      0.74      0.81       315
weighted avg       0.91      0.74      0.81       315

Precision Score: 0.9098039215686274
Recall Score: 0.7365079365079366
F1 Score: 0.8140350877192983
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4403.13 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4164.74 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142309,0.832618,0.769841,0.8
2,0.192200,0.165703,0.855204,0.75,0.799154
3,0.091500,0.243324,0.843049,0.746032,0.791579
4,0.037300,0.225724,0.839506,0.809524,0.824242
5,0.025700,0.257734,0.852941,0.805556,0.828571


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3838.11 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.76      0.82       316

   micro avg       0.88      0.76      0.82       316
   macro avg       0.88      0.76      0.82       316
weighted avg       0.88      0.76      0.82       316

Precision Score: 0.8827838827838828
Recall Score: 0.7626582278481012
F1 Score: 0.8183361629881153
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4188.35 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4404.96 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.191942,0.822642,0.754325,0.787004
2,0.224400,0.180229,0.842697,0.778547,0.809353
3,0.153900,0.174498,0.865169,0.799308,0.830935
4,0.095900,0.182869,0.84083,0.84083,0.84083
5,0.067900,0.199222,0.83737,0.83737,0.83737


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3703.03 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.77      0.80       346

   micro avg       0.84      0.77      0.80       346
   macro avg       0.84      0.77      0.80       346
weighted avg       0.84      0.77      0.80       346

Precision Score: 0.8369905956112853
Recall Score: 0.7716763005780347
F1 Score: 0.8030075187969926
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4718.02 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4142.79 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.158621,0.821782,0.783019,0.801932
2,0.183600,0.152034,0.835749,0.816038,0.825776
3,0.108300,0.189798,0.835749,0.816038,0.825776
4,0.054900,0.199698,0.824074,0.839623,0.831776
5,0.037200,0.216283,0.824645,0.820755,0.822695


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3551.84 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.82      0.84       279

   micro avg       0.86      0.82      0.84       279
   macro avg       0.86      0.82      0.84       279
weighted avg       0.86      0.82      0.84       279

Precision Score: 0.8641509433962264
Recall Score: 0.8207885304659498
F1 Score: 0.8419117647058825
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4947.62 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4031.33 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.120732,0.836634,0.79717,0.816425
2,0.174700,0.149519,0.801843,0.820755,0.811189
3,0.100100,0.196196,0.809302,0.820755,0.814988
4,0.041500,0.214591,0.813636,0.84434,0.828704
5,0.029300,0.223136,0.823256,0.834906,0.82904


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3909.56 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.78      0.82       279

   micro avg       0.87      0.78      0.82       279
   macro avg       0.87      0.78      0.82       279
weighted avg       0.87      0.78      0.82       279

Precision Score: 0.8656126482213439
Recall Score: 0.7849462365591398
F1 Score: 0.8233082706766918
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5137.22 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4212.36 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.152716,0.835897,0.768868,0.800983
2,0.177900,0.131368,0.845,0.79717,0.820388
3,0.110300,0.153732,0.833333,0.825472,0.829384
4,0.056700,0.214831,0.828431,0.79717,0.8125
5,0.037300,0.21799,0.846154,0.830189,0.838095


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3980.12 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.81      0.84       279

   micro avg       0.87      0.81      0.84       279
   macro avg       0.87      0.81      0.84       279
weighted avg       0.87      0.81      0.84       279

Precision Score: 0.8664122137404581
Recall Score: 0.8136200716845878
F1 Score: 0.8391866913123844
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5254.19 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4596.42 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140078,0.818548,0.799213,0.808765
2,0.208000,0.163722,0.843049,0.740157,0.78826
3,0.104600,0.198602,0.873832,0.73622,0.799145
4,0.054800,0.207215,0.814815,0.779528,0.796781
5,0.039600,0.216795,0.829167,0.783465,0.805668


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3705.60 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.81      0.77      0.79       315

   micro avg       0.81      0.77      0.79       315
   macro avg       0.81      0.77      0.79       315
weighted avg       0.81      0.77      0.79       315

Precision Score: 0.8114478114478114
Recall Score: 0.765079365079365
F1 Score: 0.7875816993464052
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4546.56 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3788.66 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.113728,0.828054,0.792208,0.809735
2,0.190200,0.145504,0.789683,0.861472,0.824017
3,0.087200,0.198338,0.834081,0.805195,0.819383
4,0.035500,0.21509,0.818565,0.839827,0.82906
5,0.018600,0.263873,0.84,0.818182,0.828947


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3266.92 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.78      0.83       288

   micro avg       0.90      0.78      0.83       288
   macro avg       0.90      0.78      0.83       288
weighted avg       0.90      0.78      0.83       288

Precision Score: 0.8995983935742972
Recall Score: 0.7777777777777778
F1 Score: 0.8342644320297952
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4498.23 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3360.07 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.129026,0.808,0.795276,0.801587
2,0.190300,0.153711,0.833333,0.826772,0.83004
3,0.095500,0.203675,0.836735,0.807087,0.821643
4,0.038200,0.20933,0.830769,0.850394,0.840467
5,0.023900,0.23232,0.825095,0.854331,0.839458


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3005.47 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.79      0.84       315

   micro avg       0.91      0.79      0.84       315
   macro avg       0.91      0.79      0.84       315
weighted avg       0.91      0.79      0.84       315

Precision Score: 0.9084249084249084
Recall Score: 0.7873015873015873
F1 Score: 0.8435374149659863
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [11]:
# Same 5-epoch sweep over `models` with both random seeds (rn1, rn2) set to 42;
# for each checkpoint, print a header line and call ate_model on the review data.
# Hardware of the recorded run - GPU: NVIDIA A30
for model in models:
    print(f'training and results for {model}:')
    ate_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=5,
    )
    print()  # blank line between consecutive model reports

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6400.11 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3816.95 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142313,0.8159,0.747126,0.78
2,0.188000,0.158496,0.820225,0.83908,0.829545
3,0.079900,0.257712,0.82449,0.773946,0.798419
4,0.023100,0.28216,0.819277,0.781609,0.8
5,0.014100,0.325987,0.815261,0.777778,0.796078


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5779.78 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.92      0.79      0.85       323

   micro avg       0.92      0.79      0.85       323
   macro avg       0.92      0.79      0.85       323
weighted avg       0.92      0.79      0.85       323

Precision Score: 0.9172661870503597
Recall Score: 0.7894736842105263
F1 Score: 0.848585690515807
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7444.13 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6361.59 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.145685,0.834746,0.775591,0.804082
2,0.203800,0.157646,0.840816,0.811024,0.825651
3,0.097100,0.219875,0.850622,0.807087,0.828283
4,0.041600,0.228635,0.826923,0.846457,0.836576
5,0.020800,0.263164,0.839216,0.84252,0.840864


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5508.02 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.76      0.82       315

   micro avg       0.89      0.76      0.82       315
   macro avg       0.89      0.76      0.82       315
weighted avg       0.89      0.76      0.82       315

Precision Score: 0.8880597014925373
Recall Score: 0.7555555555555555
F1 Score: 0.8164665523156089
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6518.88 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5539.03 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.137741,0.831933,0.785714,0.808163
2,0.191500,0.169057,0.853333,0.761905,0.805031
3,0.090200,0.24635,0.849315,0.738095,0.789809
4,0.036800,0.235265,0.855895,0.777778,0.814969
5,0.024000,0.268049,0.859031,0.77381,0.814196


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5387.74 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.76      0.82       316

   micro avg       0.88      0.76      0.82       316
   macro avg       0.88      0.76      0.82       316
weighted avg       0.88      0.76      0.82       316

Precision Score: 0.8763636363636363
Recall Score: 0.7626582278481012
F1 Score: 0.8155668358714044
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6204.99 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5260.98 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.166674,0.852459,0.719723,0.780488
2,0.226000,0.133717,0.8125,0.854671,0.833052
3,0.164800,0.148331,0.833898,0.851211,0.842466
4,0.097000,0.190572,0.837545,0.802768,0.819788
5,0.074500,0.193155,0.821549,0.844291,0.832765


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5176.40 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.78      0.81       346

   micro avg       0.84      0.78      0.81       346
   macro avg       0.84      0.78      0.81       346
weighted avg       0.84      0.78      0.81       346

Precision Score: 0.838006230529595
Recall Score: 0.7774566473988439
F1 Score: 0.8065967016491755
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6645.63 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5480.48 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.185822,0.845745,0.75,0.795
2,0.181500,0.14218,0.843434,0.787736,0.814634
3,0.103400,0.221468,0.827225,0.745283,0.784119
4,0.051600,0.246947,0.82199,0.740566,0.779156
5,0.035500,0.238256,0.819512,0.792453,0.805755


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5704.18 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.68      0.78       279

   micro avg       0.90      0.68      0.78       279
   macro avg       0.90      0.68      0.78       279
weighted avg       0.90      0.68      0.78       279

Precision Score: 0.9009433962264151
Recall Score: 0.6845878136200717
F1 Score: 0.7780040733197556
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5814.25 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4496.60 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.121842,0.835,0.787736,0.81068
2,0.174700,0.155838,0.810427,0.806604,0.808511
3,0.100100,0.192741,0.816425,0.79717,0.806683
4,0.042400,0.215976,0.804651,0.816038,0.810304
5,0.031400,0.224245,0.810427,0.806604,0.808511


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5507.60 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.78      0.82       279

   micro avg       0.85      0.78      0.82       279
   macro avg       0.85      0.78      0.82       279
weighted avg       0.85      0.78      0.82       279

Precision Score: 0.8549019607843137
Recall Score: 0.7813620071684588
F1 Score: 0.8164794007490637
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7076.95 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5879.92 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.148909,0.831633,0.768868,0.79902
2,0.177900,0.150692,0.831633,0.768868,0.79902
3,0.109500,0.170914,0.806604,0.806604,0.806604
4,0.053200,0.215131,0.834951,0.811321,0.822967
5,0.037700,0.23284,0.827751,0.816038,0.821853


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5666.55 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.81      0.84       279

   micro avg       0.87      0.81      0.84       279
   macro avg       0.87      0.81      0.84       279
weighted avg       0.87      0.81      0.84       279

Precision Score: 0.8730769230769231
Recall Score: 0.8136200716845878
F1 Score: 0.8423005565862709
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 8271.38 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6928.43 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140078,0.818548,0.799213,0.808765
2,0.208000,0.163722,0.843049,0.740157,0.78826
3,0.104600,0.198602,0.873832,0.73622,0.799145
4,0.054800,0.207215,0.814815,0.779528,0.796781
5,0.039600,0.216794,0.829167,0.783465,0.805668


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6149.48 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.81      0.77      0.79       315

   micro avg       0.81      0.77      0.79       315
   macro avg       0.81      0.77      0.79       315
weighted avg       0.81      0.77      0.79       315

Precision Score: 0.8114478114478114
Recall Score: 0.765079365079365
F1 Score: 0.7875816993464052
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7467.60 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6492.03 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.113731,0.828054,0.792208,0.809735
2,0.190200,0.14541,0.789683,0.861472,0.824017
3,0.087200,0.198744,0.835556,0.813853,0.824561
4,0.035600,0.214815,0.818565,0.839827,0.82906
5,0.018900,0.263924,0.837719,0.82684,0.832244


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5316.11 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.92      0.75      0.83       288

   micro avg       0.92      0.75      0.83       288
   macro avg       0.92      0.75      0.83       288
weighted avg       0.92      0.75      0.83       288

Precision Score: 0.9191489361702128
Recall Score: 0.75
F1 Score: 0.8260038240917782
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O',

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6737.52 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5419.83 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.130743,0.802372,0.799213,0.800789
2,0.190700,0.15397,0.823529,0.826772,0.825147
3,0.094100,0.199375,0.825911,0.80315,0.814371
4,0.039700,0.215434,0.834646,0.834646,0.834646
5,0.023900,0.240411,0.829457,0.84252,0.835938


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5812.76 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.79      0.84       315

   micro avg       0.91      0.79      0.84       315
   macro avg       0.91      0.79      0.84       315
weighted avg       0.91      0.79      0.84       315

Precision Score: 0.9054545454545454
Recall Score: 0.7904761904761904
F1 Score: 0.8440677966101695
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [None]:
# 6-epoch run of the sweep: fine-tune every checkpoint listed in `models` on the
# `data` DataFrame, one after another, with identical settings for each.
for model in models:
    # Banner line so each checkpoint's section can be located in the long cell output.
    print(f'training and results for {model}:')
    # rn1/rn2 are echoed in the output as "random seeds: 42, 42"; what each seed
    # controls is decided inside ate_model (presumably provided by the star import
    # from functions.ate_model_train_OB -- TODO confirm). The per-epoch metric table
    # and the test-set classification report shown below are emitted during this
    # call; its return value is not used here.
    ate_model(data, model, rn1=42, rn2=42, epochs=6)
    print()  # blank line separating the output of consecutive checkpoints
# GPU: NVIDIA GeForce RTX 2080 Ti
# NOTE(review): the CUDA check cell at the top of this notebook reports an
# NVIDIA A30 -- confirm which GPU actually produced the outputs of this cell.

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4078.98 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3269.87 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.139645,0.820408,0.770115,0.794466
2,0.189100,0.176285,0.845833,0.777778,0.810379
3,0.080800,0.218479,0.826415,0.83908,0.8327
4,0.026500,0.240427,0.824219,0.808429,0.816248
5,0.013100,0.336769,0.835391,0.777778,0.805556
6,0.013100,0.350261,0.828125,0.812261,0.820116


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3272.89 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.79      0.83       323

   micro avg       0.87      0.79      0.83       323
   macro avg       0.87      0.79      0.83       323
weighted avg       0.87      0.79      0.83       323

Precision Score: 0.8673469387755102
Recall Score: 0.7894736842105263
F1 Score: 0.8265802269043759
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4049.13 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3439.85 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.151488,0.830508,0.771654,0.8
2,0.198400,0.146218,0.846774,0.826772,0.836653
3,0.099900,0.184518,0.855372,0.814961,0.834677
4,0.038800,0.216478,0.819549,0.858268,0.838462
5,0.021200,0.272177,0.836576,0.846457,0.841487
6,0.021200,0.2884,0.841897,0.838583,0.840237


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3200.30 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.81      0.85       315

   micro avg       0.89      0.81      0.85       315
   macro avg       0.89      0.81      0.85       315
weighted avg       0.89      0.81      0.85       315

Precision Score: 0.89198606271777
Recall Score: 0.8126984126984127
F1 Score: 0.8504983388704319
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3660.62 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3077.39 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.14202,0.831169,0.761905,0.795031
2,0.190400,0.17106,0.835443,0.785714,0.809816
3,0.091700,0.224151,0.84322,0.789683,0.815574
4,0.030500,0.244362,0.827309,0.81746,0.822355
5,0.017200,0.306587,0.844538,0.797619,0.820408
6,0.017200,0.316476,0.844538,0.797619,0.820408


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3054.48 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.78      0.84       316

   micro avg       0.90      0.78      0.84       316
   macro avg       0.90      0.78      0.84       316
weighted avg       0.90      0.78      0.84       316

Precision Score: 0.9014598540145985
Recall Score: 0.7816455696202531
F1 Score: 0.8372881355932204
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3983.70 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3404.82 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.167835,0.824818,0.782007,0.802842
2,0.228400,0.185494,0.822064,0.799308,0.810526
3,0.151300,0.161602,0.840426,0.820069,0.830123
4,0.098300,0.201202,0.803333,0.83391,0.818336
5,0.061700,0.2372,0.805281,0.844291,0.824324
6,0.061700,0.256965,0.812081,0.83737,0.824532


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3012.56 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.76      0.81       346

   micro avg       0.86      0.76      0.81       346
   macro avg       0.86      0.76      0.81       346
weighted avg       0.86      0.76      0.81       346

Precision Score: 0.8571428571428571
Recall Score: 0.7630057803468208
F1 Score: 0.8073394495412844
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4161.47 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3551.46 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.148966,0.831633,0.768868,0.79902
2,0.184200,0.129755,0.841584,0.801887,0.821256
3,0.106700,0.173518,0.833333,0.778302,0.804878
4,0.056400,0.203573,0.794643,0.839623,0.816514
5,0.035200,0.278865,0.802817,0.806604,0.804706
6,0.035200,0.275809,0.803653,0.830189,0.816705


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3124.52 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.71      0.79       279

   micro avg       0.89      0.71      0.79       279
   macro avg       0.89      0.71      0.79       279
weighted avg       0.89      0.71      0.79       279

Precision Score: 0.8914027149321267
Recall Score: 0.7060931899641577
F1 Score: 0.788
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O'

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4163.66 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3101.23 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136868,0.808612,0.79717,0.80285
2,0.181700,0.151986,0.845771,0.801887,0.823245
3,0.099700,0.19552,0.82381,0.816038,0.819905
4,0.047600,0.215909,0.803738,0.811321,0.807512
5,0.028400,0.258574,0.828431,0.79717,0.8125
6,0.028400,0.273697,0.815166,0.811321,0.813239


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3264.70 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.78      0.83       279

   micro avg       0.89      0.78      0.83       279
   macro avg       0.89      0.78      0.83       279
weighted avg       0.89      0.78      0.83       279

Precision Score: 0.8934426229508197
Recall Score: 0.7813620071684588
F1 Score: 0.8336520076481835
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4120.93 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3391.41 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.153581,0.825,0.778302,0.800971
2,0.183100,0.15076,0.802691,0.84434,0.822989
3,0.112900,0.182726,0.826087,0.806604,0.816229
4,0.056600,0.200577,0.820628,0.863208,0.841379
5,0.037500,0.250213,0.79638,0.830189,0.812933
6,0.037500,0.270521,0.789238,0.830189,0.809195


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3212.17 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.82      0.85       279

   micro avg       0.88      0.82      0.85       279
   macro avg       0.88      0.82      0.85       279
weighted avg       0.88      0.82      0.85       279

Precision Score: 0.8773946360153256
Recall Score: 0.8207885304659498
F1 Score: 0.8481481481481481
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4342.83 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3759.30 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.14471,0.850877,0.76378,0.804979
2,0.202000,0.153486,0.837607,0.771654,0.803279
3,0.102500,0.205724,0.866359,0.740157,0.798301
4,0.048000,0.218037,0.829787,0.767717,0.797546
5,0.035300,0.252324,0.843478,0.76378,0.801653
6,0.035300,0.256023,0.839662,0.783465,0.810591


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3244.13 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.77      0.83       315

   micro avg       0.90      0.77      0.83       315
   macro avg       0.90      0.77      0.83       315
weighted avg       0.90      0.77      0.83       315

Precision Score: 0.9
Recall Score: 0.7714285714285715
F1 Score: 0.8307692307692307
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3925.59 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3415.58 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.115655,0.816667,0.848485,0.832272
2,0.187700,0.127008,0.81893,0.861472,0.839662
3,0.087200,0.216644,0.834821,0.809524,0.821978
4,0.037900,0.214082,0.834061,0.82684,0.830435
5,0.017400,0.28477,0.832618,0.839827,0.836207
6,0.017400,0.283766,0.824268,0.852814,0.838298


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3119.56 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.78      0.82       288

   micro avg       0.87      0.78      0.82       288
   macro avg       0.87      0.78      0.82       288
weighted avg       0.87      0.78      0.82       288

Precision Score: 0.8692307692307693
Recall Score: 0.7847222222222222
F1 Score: 0.8248175182481752
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3988.59 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3444.10 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140835,0.832618,0.76378,0.796715
2,0.195200,0.145978,0.833333,0.866142,0.849421
3,0.094000,0.249574,0.889381,0.791339,0.8375
4,0.032800,0.247657,0.841897,0.838583,0.840237


In [7]:
# Stand-alone run of deepset/gbert-base for 6 epochs with random seeds rn1=42, rn2=42.
# NOTE(review): the recorded output of the preceding loop stops after epoch 4 for
# this model and shows no test report, so this cell looks like a re-run of that
# unfinished training — confirm before relying on either set of numbers.
ate_model(data, "deepset/gbert-base", rn1=42, rn2=42, epochs=6)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6092.65 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6037.98 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.133192,0.844156,0.767717,0.804124
2,0.187100,0.138036,0.855372,0.814961,0.834677
3,0.090400,0.192427,0.877729,0.791339,0.832298
4,0.029500,0.215252,0.868085,0.80315,0.834356
5,0.019800,0.242905,0.851406,0.834646,0.842942
6,0.019800,0.251559,0.866667,0.818898,0.842105


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5502.28 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.77      0.82       315

   micro avg       0.88      0.77      0.82       315
   macro avg       0.88      0.77      0.82       315
weighted avg       0.88      0.77      0.82       315

Precision Score: 0.8836363636363637
Recall Score: 0.7714285714285715
F1 Score: 0.8237288135593221
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [12]:
# Sweep over every checkpoint listed in `models`: train each one for 6 epochs with
# random seeds rn1=42, rn2=42. The recorded output for each model shows a per-epoch
# table (training/validation loss, precision, recall, F1) followed by a
# classification report on the test data.
# NOTE(review): this cell carries execution count In [12] but sits between In [7]
# and In [8]; the notebook was run out of order, so verify with Restart & Run All.
for model in models:
    print(f'training and results for {model}:')
    ate_model(data, model, rn1=42, rn2=42, epochs=6)
    # Blank line to separate consecutive models in the output.
    print()
# GPU used for this run: NVIDIA A30

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7412.31 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6472.20 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.139645,0.820408,0.770115,0.794466
2,0.189100,0.176285,0.845833,0.777778,0.810379
3,0.080800,0.218479,0.826415,0.83908,0.8327
4,0.026500,0.240427,0.824219,0.808429,0.816248
5,0.013100,0.336771,0.835391,0.777778,0.805556
6,0.013100,0.350263,0.828125,0.812261,0.820116


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5043.01 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.79      0.83       323

   micro avg       0.87      0.79      0.83       323
   macro avg       0.87      0.79      0.83       323
weighted avg       0.87      0.79      0.83       323

Precision Score: 0.8673469387755102
Recall Score: 0.7894736842105263
F1 Score: 0.8265802269043759
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7607.38 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6533.99 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142299,0.812766,0.751969,0.781186
2,0.199000,0.14359,0.835294,0.838583,0.836935
3,0.101100,0.177972,0.842105,0.88189,0.861538
4,0.039000,0.229381,0.821705,0.834646,0.828125
5,0.024500,0.25924,0.818182,0.850394,0.833977
6,0.024500,0.268867,0.838583,0.838583,0.838583


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5356.58 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.80      0.83       315

   micro avg       0.87      0.80      0.83       315
   macro avg       0.87      0.80      0.83       315
weighted avg       0.87      0.80      0.83       315

Precision Score: 0.8719723183391004
Recall Score: 0.8
F1 Score: 0.8344370860927153
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6486.96 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5317.64 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.146375,0.830435,0.757937,0.792531
2,0.190800,0.169808,0.844156,0.77381,0.807453
3,0.091300,0.223278,0.831276,0.801587,0.816162
4,0.031500,0.248159,0.830645,0.81746,0.824
5,0.019000,0.292255,0.836066,0.809524,0.822581
6,0.019000,0.306869,0.845188,0.801587,0.822811


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5376.77 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.76      0.82       316

   micro avg       0.89      0.76      0.82       316
   macro avg       0.89      0.76      0.82       316
weighted avg       0.89      0.76      0.82       316

Precision Score: 0.8892988929889298
Recall Score: 0.7626582278481012
F1 Score: 0.8211243611584327
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6273.46 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5523.64 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.138271,0.801394,0.795848,0.798611
2,0.220500,0.144805,0.849817,0.802768,0.825623
3,0.158200,0.144564,0.859712,0.82699,0.843034
4,0.088900,0.184028,0.798046,0.847751,0.822148
5,0.067600,0.206823,0.808581,0.847751,0.827703
6,0.067600,0.23282,0.816327,0.83045,0.823328


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5313.86 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.77      0.81       346

   micro avg       0.85      0.77      0.81       346
   macro avg       0.85      0.77      0.81       346
weighted avg       0.85      0.77      0.81       346

Precision Score: 0.8498402555910544
Recall Score: 0.7687861271676301
F1 Score: 0.8072837632776936
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7095.37 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5851.25 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.150005,0.832487,0.773585,0.801956
2,0.184700,0.151872,0.845361,0.773585,0.807882
3,0.115200,0.171141,0.827103,0.834906,0.830986
4,0.056700,0.1938,0.809524,0.882075,0.844244
5,0.041200,0.248002,0.82243,0.830189,0.826291
6,0.041200,0.262757,0.81448,0.849057,0.831409


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4851.22 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.83      0.85       279

   micro avg       0.87      0.83      0.85       279
   macro avg       0.87      0.83      0.85       279
weighted avg       0.87      0.83      0.85       279

Precision Score: 0.868421052631579
Recall Score: 0.8279569892473119
F1 Score: 0.8477064220183487
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7309.21 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6018.51 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136023,0.8125,0.79717,0.804762
2,0.181200,0.147919,0.826923,0.811321,0.819048
3,0.098100,0.198319,0.814286,0.806604,0.810427
4,0.048700,0.217074,0.822115,0.806604,0.814286
5,0.031000,0.242993,0.815166,0.811321,0.813239
6,0.031000,0.264496,0.814286,0.806604,0.810427


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5580.73 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.78      0.82       279

   micro avg       0.88      0.78      0.82       279
   macro avg       0.88      0.78      0.82       279
weighted avg       0.88      0.78      0.82       279

Precision Score: 0.875
Recall Score: 0.7777777777777778
F1 Score: 0.823529411764706
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O',

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7260.53 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5983.06 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.164725,0.823232,0.768868,0.795122
2,0.183500,0.140214,0.809955,0.84434,0.82679
3,0.110600,0.189895,0.795652,0.863208,0.828054
4,0.052300,0.222863,0.802752,0.825472,0.813953
5,0.034000,0.272853,0.783784,0.820755,0.801843
6,0.034000,0.294934,0.792793,0.830189,0.81106


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5477.06 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.82      0.84       279

   micro avg       0.85      0.82      0.84       279
   macro avg       0.85      0.82      0.84       279
weighted avg       0.85      0.82      0.84       279

Precision Score: 0.8513011152416357
Recall Score: 0.8207885304659498
F1 Score: 0.8357664233576642
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 8142.36 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6853.33 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.14471,0.850877,0.76378,0.804979
2,0.202000,0.153486,0.837607,0.771654,0.803279
3,0.102500,0.205724,0.866359,0.740157,0.798301
4,0.048000,0.218037,0.829787,0.767717,0.797546
5,0.035300,0.252324,0.843478,0.76378,0.801653
6,0.035300,0.256023,0.839662,0.783465,0.810591


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6249.55 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.77      0.83       315

   micro avg       0.90      0.77      0.83       315
   macro avg       0.90      0.77      0.83       315
weighted avg       0.90      0.77      0.83       315

Precision Score: 0.9
Recall Score: 0.7714285714285715
F1 Score: 0.8307692307692307
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7433.09 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6355.48 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.115651,0.816667,0.848485,0.832272
2,0.187700,0.127292,0.815574,0.861472,0.837895
3,0.087300,0.212509,0.834821,0.809524,0.821978
4,0.037900,0.213372,0.831169,0.831169,0.831169
5,0.017700,0.281785,0.829787,0.844156,0.83691
6,0.017700,0.283614,0.823529,0.848485,0.835821


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5086.97 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.78      0.82       288

   micro avg       0.87      0.78      0.82       288
   macro avg       0.87      0.78      0.82       288
weighted avg       0.87      0.78      0.82       288

Precision Score: 0.8659003831417624
Recall Score: 0.7847222222222222
F1 Score: 0.8233151183970856
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7356.21 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6373.46 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142978,0.830435,0.751969,0.789256
2,0.195200,0.146292,0.830827,0.870079,0.85
3,0.094200,0.248587,0.87069,0.795276,0.831276
4,0.032800,0.230214,0.83004,0.826772,0.828402
5,0.020200,0.262754,0.820611,0.846457,0.833333
6,0.020200,0.296022,0.837398,0.811024,0.824


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5696.20 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.82      0.83       315

   micro avg       0.84      0.82      0.83       315
   macro avg       0.84      0.82      0.83       315
weighted avg       0.84      0.82      0.83       315

Precision Score: 0.8403908794788274
Recall Score: 0.819047619047619
F1 Score: 0.8295819935691318
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

In [8]:
# Sweep over every checkpoint listed in `models`: train each one for 7 epochs with
# random seeds rn1=42, rn2=42. The recorded output for each model shows a per-epoch
# table (training/validation loss, precision, recall, F1) followed by a
# classification report on the test data.
for model in models:
    print(f'training and results for {model}:')
    ate_model(data, model, rn1=42, rn2=42, epochs=7)
    # Blank line to separate consecutive models in the output.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6915.31 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5357.59 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.152189,0.838298,0.754789,0.794355
2,0.191800,0.140404,0.84898,0.796935,0.822134
3,0.081200,0.261907,0.795181,0.758621,0.776471
4,0.022200,0.306328,0.82,0.785441,0.802348
5,0.015600,0.333418,0.79845,0.789272,0.793834
6,0.015600,0.370123,0.820408,0.770115,0.794466
7,0.003500,0.369736,0.81746,0.789272,0.803119


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5603.98 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.92      0.76      0.84       323

   micro avg       0.92      0.76      0.84       323
   macro avg       0.92      0.76      0.84       323
weighted avg       0.92      0.76      0.84       323

Precision Score: 0.9216417910447762
Recall Score: 0.7647058823529411
F1 Score: 0.8358714043993233
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7737.67 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6531.97 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.160344,0.855204,0.744094,0.795789
2,0.201500,0.139365,0.847107,0.807087,0.826613
3,0.097900,0.201237,0.820312,0.826772,0.823529
4,0.042700,0.201669,0.809701,0.854331,0.831418
5,0.021100,0.283584,0.833992,0.830709,0.832347
6,0.021100,0.323753,0.845188,0.795276,0.819473
7,0.004700,0.340902,0.83871,0.818898,0.828685


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5996.57 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.79      0.84       315

   micro avg       0.89      0.79      0.84       315
   macro avg       0.89      0.79      0.84       315
weighted avg       0.89      0.79      0.84       315

Precision Score: 0.8861209964412812
Recall Score: 0.7904761904761904
F1 Score: 0.8355704697986578
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6346.32 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5648.18 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.151462,0.825328,0.75,0.785863
2,0.189900,0.168428,0.87963,0.753968,0.811966
3,0.091400,0.237234,0.863014,0.75,0.802548
4,0.032800,0.242232,0.845833,0.805556,0.825203
5,0.018200,0.292831,0.85,0.809524,0.829268
6,0.018200,0.310768,0.836653,0.833333,0.83499
7,0.006400,0.315201,0.85124,0.81746,0.834008


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5074.78 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.74      0.81       316

   micro avg       0.90      0.74      0.81       316
   macro avg       0.90      0.74      0.81       316
weighted avg       0.90      0.74      0.81       316

Precision Score: 0.9
Recall Score: 0.740506329113924
F1 Score: 0.8125000000000001
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', '

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5106.23 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3941.94 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.160727,0.790541,0.809689,0.8
2,0.225700,0.158051,0.826241,0.806228,0.816112
3,0.153600,0.185047,0.846154,0.799308,0.822064
4,0.097600,0.166811,0.818182,0.871972,0.844221
5,0.067000,0.202995,0.825503,0.851211,0.83816
6,0.067000,0.234241,0.832765,0.844291,0.838488
7,0.033400,0.246795,0.81457,0.851211,0.832487


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4947.77 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.82      0.85       346

   micro avg       0.88      0.82      0.85       346
   macro avg       0.88      0.82      0.85       346
weighted avg       0.88      0.82      0.85       346

Precision Score: 0.8761609907120743
Recall Score: 0.8179190751445087
F1 Score: 0.8460388639760837
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6597.73 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5639.01 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.180544,0.828125,0.75,0.787129
2,0.184100,0.153192,0.802885,0.787736,0.795238
3,0.106300,0.181219,0.807882,0.773585,0.790361
4,0.052100,0.196765,0.79646,0.849057,0.821918
5,0.033800,0.269548,0.817308,0.801887,0.809524
6,0.033800,0.24264,0.828054,0.863208,0.845266
7,0.010500,0.287287,0.836538,0.820755,0.828571


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5459.31 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.84      0.84       279

   micro avg       0.84      0.84      0.84       279
   macro avg       0.84      0.84      0.84       279
weighted avg       0.84      0.84      0.84       279

Precision Score: 0.8447653429602888
Recall Score: 0.8387096774193549
F1 Score: 0.841726618705036
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6604.86 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5278.48 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136535,0.819048,0.811321,0.815166
2,0.186200,0.168617,0.821256,0.801887,0.811456
3,0.100400,0.200951,0.795455,0.825472,0.810185
4,0.052800,0.216185,0.814815,0.830189,0.82243
5,0.028600,0.258158,0.829493,0.849057,0.839161
6,0.028600,0.287721,0.839806,0.816038,0.827751
7,0.015300,0.276672,0.830275,0.853774,0.84186


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5191.41 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.77      0.81       279

   micro avg       0.86      0.77      0.81       279
   macro avg       0.86      0.77      0.81       279
weighted avg       0.86      0.77      0.81       279

Precision Score: 0.856
Recall Score: 0.7670250896057348
F1 Score: 0.8090737240075615
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O'

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7145.13 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5473.19 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.190992,0.857923,0.740566,0.794937
2,0.180400,0.113842,0.819383,0.877358,0.84738
3,0.114000,0.179953,0.828431,0.79717,0.8125
4,0.059400,0.156169,0.840909,0.872642,0.856481
5,0.039300,0.228704,0.821918,0.849057,0.835267
6,0.039300,0.238615,0.824324,0.863208,0.843318
7,0.017600,0.262896,0.837321,0.825472,0.831354


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5137.98 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.81      0.83       279

   micro avg       0.87      0.81      0.83       279
   macro avg       0.87      0.81      0.83       279
weighted avg       0.87      0.81      0.83       279

Precision Score: 0.8653846153846154
Recall Score: 0.8064516129032258
F1 Score: 0.8348794063079776
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7393.88 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5954.47 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.139097,0.823529,0.771654,0.796748
2,0.205700,0.155902,0.827586,0.755906,0.790123
3,0.105200,0.212884,0.876777,0.728346,0.795699
4,0.049400,0.211798,0.855263,0.767717,0.809129
5,0.035000,0.248456,0.86758,0.748031,0.803383
6,0.035000,0.253412,0.839827,0.76378,0.8
7,0.017100,0.270706,0.85022,0.759843,0.802495


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5605.74 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.76      0.81       315

   micro avg       0.87      0.76      0.81       315
   macro avg       0.87      0.76      0.81       315
weighted avg       0.87      0.76      0.81       315

Precision Score: 0.8695652173913043
Recall Score: 0.7619047619047619
F1 Score: 0.8121827411167514
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7164.48 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5702.77 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.104758,0.827586,0.831169,0.829374
2,0.190200,0.145665,0.794677,0.904762,0.846154
3,0.088600,0.210362,0.84186,0.78355,0.811659
4,0.037600,0.224878,0.831111,0.809524,0.820175
5,0.019400,0.272657,0.824034,0.831169,0.827586
6,0.019400,0.309032,0.819742,0.82684,0.823276
7,0.005200,0.320906,0.821277,0.835498,0.828326


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5240.70 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.83      0.81      0.82       288

   micro avg       0.83      0.81      0.82       288
   macro avg       0.83      0.81      0.82       288
weighted avg       0.83      0.81      0.82       288

Precision Score: 0.8268551236749117
Recall Score: 0.8125
F1 Score: 0.8196147110332749
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6913.13 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5509.28 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.120722,0.821429,0.814961,0.818182
2,0.190500,0.147927,0.844828,0.771654,0.806584
3,0.089500,0.235073,0.865801,0.787402,0.824742
4,0.032900,0.243635,0.838057,0.814961,0.826347
5,0.020200,0.280862,0.815789,0.854331,0.834615
6,0.020200,0.318454,0.808989,0.850394,0.829175
7,0.008000,0.329083,0.8125,0.818898,0.815686


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5193.43 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.82      0.85       315

   micro avg       0.88      0.82      0.85       315
   macro avg       0.88      0.82      0.85       315
weighted avg       0.88      0.82      0.85       315

Precision Score: 0.8831615120274914
Recall Score: 0.8158730158730159
F1 Score: 0.8481848184818481
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [9]:
# Run the 8-epoch setting: call ate_model once for every checkpoint name in
# `models`, always on the same `data` DataFrame.
# NOTE(review): ate_model is presumably provided by the star-import from
# functions.ate_model_train_OB; its internals are not visible here — confirm.
for model in models:
    # Banner so each checkpoint's section can be located in the long cell output.
    print(f'training and results for {model}:')
    # rn1/rn2 are echoed in the run output as "random seeds: 42, 42";
    # which step each seed controls is not visible from this cell — TODO confirm.
    ate_model(data, model, rn1=42, rn2=42, epochs=8)
    # Blank separator line between consecutive checkpoints' output.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7458.87 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6466.76 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.13901,0.817073,0.770115,0.792899
2,0.186700,0.148873,0.833333,0.804598,0.818713
3,0.077900,0.211869,0.833333,0.766284,0.798403
4,0.027200,0.255072,0.844262,0.789272,0.815842
5,0.011000,0.341162,0.846809,0.762452,0.802419
6,0.011000,0.356213,0.838057,0.793103,0.814961
7,0.001700,0.395481,0.833333,0.766284,0.798403
8,0.002500,0.401347,0.833333,0.766284,0.798403


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 1799.42 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.76      0.83       323

   micro avg       0.91      0.76      0.83       323
   macro avg       0.91      0.76      0.83       323
weighted avg       0.91      0.76      0.83       323

Precision Score: 0.9077490774907749
Recall Score: 0.7616099071207431
F1 Score: 0.8282828282828283
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6987.36 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6026.64 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.138354,0.789062,0.795276,0.792157
2,0.201800,0.155322,0.842742,0.822835,0.832669
3,0.099900,0.226823,0.836,0.822835,0.829365
4,0.036000,0.238081,0.799296,0.893701,0.843866
5,0.017100,0.306837,0.824903,0.834646,0.829746
6,0.017100,0.321137,0.832685,0.84252,0.837573
7,0.006000,0.328803,0.835249,0.858268,0.846602
8,0.003600,0.331772,0.833333,0.866142,0.849421


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5780.40 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.78      0.82       315

   micro avg       0.86      0.78      0.82       315
   macro avg       0.86      0.78      0.82       315
weighted avg       0.86      0.78      0.82       315

Precision Score: 0.8636363636363636
Recall Score: 0.7841269841269841
F1 Score: 0.8219633943427621
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6284.87 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5594.62 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136919,0.832618,0.769841,0.8
2,0.186900,0.169789,0.857143,0.761905,0.806723
3,0.091300,0.213695,0.858333,0.81746,0.837398
4,0.033600,0.227878,0.829365,0.829365,0.829365
5,0.019000,0.272597,0.834008,0.81746,0.825651
6,0.019000,0.31931,0.84,0.833333,0.836653
7,0.003700,0.326533,0.830769,0.857143,0.84375
8,0.002000,0.336904,0.815094,0.857143,0.83559


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4880.21 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.77      0.83       316

   micro avg       0.89      0.77      0.83       316
   macro avg       0.89      0.77      0.83       316
weighted avg       0.89      0.77      0.83       316

Precision Score: 0.8901098901098901
Recall Score: 0.7689873417721519
F1 Score: 0.8251273344651952
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6159.84 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4688.81 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.183908,0.841699,0.754325,0.79562
2,0.214100,0.157779,0.840149,0.782007,0.810036
3,0.141300,0.13372,0.839465,0.868512,0.853741
4,0.085800,0.180644,0.813505,0.875433,0.843333
5,0.064500,0.204385,0.807074,0.868512,0.836667
6,0.064500,0.262711,0.833333,0.813149,0.823117
7,0.028700,0.26757,0.808777,0.892734,0.848684
8,0.019600,0.25187,0.834437,0.871972,0.852792


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4688.53 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.75      0.80       346

   micro avg       0.85      0.75      0.80       346
   macro avg       0.85      0.75      0.80       346
weighted avg       0.85      0.75      0.80       346

Precision Score: 0.8501628664495114
Recall Score: 0.7543352601156069
F1 Score: 0.7993874425727412
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6582.69 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5188.30 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.203948,0.834197,0.759434,0.795062
2,0.183300,0.137782,0.81448,0.849057,0.831409
3,0.111100,0.198892,0.79386,0.853774,0.822727
4,0.055500,0.180993,0.813953,0.825472,0.819672
5,0.039900,0.269558,0.8,0.830189,0.814815
6,0.039900,0.306176,0.785388,0.811321,0.798144
7,0.013400,0.349309,0.8,0.792453,0.796209
8,0.007400,0.358056,0.8,0.792453,0.796209


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5216.67 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.77      0.82       279

   micro avg       0.88      0.77      0.82       279
   macro avg       0.88      0.77      0.82       279
weighted avg       0.88      0.77      0.82       279

Precision Score: 0.8770491803278688
Recall Score: 0.7670250896057348
F1 Score: 0.8183556405353728
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6974.54 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5495.91 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.118536,0.829384,0.825472,0.827423
2,0.180500,0.153299,0.829016,0.754717,0.790123
3,0.101800,0.181281,0.803493,0.867925,0.834467
4,0.051500,0.18668,0.78903,0.882075,0.832962
5,0.037800,0.219964,0.802575,0.882075,0.840449
6,0.037800,0.271882,0.799145,0.882075,0.838565
7,0.017100,0.287528,0.812785,0.839623,0.825986
8,0.005100,0.307646,0.79386,0.853774,0.822727


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4866.72 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.84      0.84       279

   micro avg       0.84      0.84      0.84       279
   macro avg       0.84      0.84      0.84       279
weighted avg       0.84      0.84      0.84       279

Precision Score: 0.8422939068100358
Recall Score: 0.8422939068100358
F1 Score: 0.8422939068100358
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7135.23 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5569.39 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.180668,0.834171,0.783019,0.807786
2,0.183300,0.158114,0.809302,0.820755,0.814988
3,0.111800,0.180783,0.806306,0.84434,0.824885
4,0.058400,0.206683,0.821101,0.84434,0.832558
5,0.036200,0.254359,0.789916,0.886792,0.835556
6,0.036200,0.302936,0.794393,0.801887,0.798122
7,0.014400,0.300831,0.788793,0.863208,0.824324
8,0.008400,0.306916,0.808219,0.834906,0.821346


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5587.07 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.81      0.83       279

   micro avg       0.85      0.81      0.83       279
   macro avg       0.85      0.81      0.83       279
weighted avg       0.85      0.81      0.83       279

Precision Score: 0.8533834586466166
Recall Score: 0.8136200716845878
F1 Score: 0.8330275229357799
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7459.77 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6451.60 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.144885,0.829167,0.783465,0.805668
2,0.208200,0.169266,0.835498,0.759843,0.795876
3,0.102900,0.199689,0.83913,0.759843,0.797521
4,0.048100,0.234594,0.834783,0.755906,0.793388
5,0.031300,0.266809,0.827731,0.775591,0.800813
6,0.031300,0.284576,0.825911,0.80315,0.814371
7,0.014700,0.304624,0.827869,0.795276,0.811245
8,0.005400,0.314196,0.834711,0.795276,0.814516


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6237.34 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.75      0.80       315

   micro avg       0.86      0.75      0.80       315
   macro avg       0.86      0.75      0.80       315
weighted avg       0.86      0.75      0.80       315

Precision Score: 0.864963503649635
Recall Score: 0.7523809523809524
F1 Score: 0.8047538200339559
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7469.85 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6373.84 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.113751,0.835749,0.748918,0.789954
2,0.189800,0.150912,0.816,0.883117,0.848233
3,0.088500,0.207059,0.818565,0.839827,0.82906
4,0.037600,0.233957,0.807531,0.835498,0.821277
5,0.019100,0.288852,0.807377,0.852814,0.829474
6,0.019100,0.313917,0.807531,0.835498,0.821277
7,0.003900,0.321759,0.830435,0.82684,0.828633
8,0.004500,0.320276,0.8107,0.852814,0.831224


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4886.79 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.78      0.82       288

   micro avg       0.87      0.78      0.82       288
   macro avg       0.87      0.78      0.82       288
weighted avg       0.87      0.78      0.82       288

Precision Score: 0.8653846153846154
Recall Score: 0.78125
F1 Score: 0.8211678832116789
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', '

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6244.88 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5425.52 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.135569,0.827731,0.775591,0.800813
2,0.192400,0.144324,0.863636,0.822835,0.842742
3,0.087100,0.244297,0.852941,0.799213,0.825203
4,0.031700,0.238428,0.850806,0.830709,0.840637
5,0.020400,0.267196,0.852,0.838583,0.845238
6,0.020400,0.314748,0.84252,0.84252,0.84252
7,0.007800,0.332497,0.855967,0.818898,0.837022
8,0.002500,0.328087,0.854839,0.834646,0.844622


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5191.10 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.92      0.80      0.85       315

   micro avg       0.92      0.80      0.85       315
   macro avg       0.92      0.80      0.85       315
weighted avg       0.92      0.80      0.85       315

Precision Score: 0.9163636363636364
Recall Score: 0.8
F1 Score: 0.8542372881355933
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 

In [10]:
# 10-epoch sweep: run ate_model once per checkpoint name listed in `models`.
# rn1/rn2 are the values ate_model echoes as its "random seeds" in the log line
# "Training results for <model> with 10 epochs and random seeds: 42, 42".
ten_epoch_settings = {"rn1": 42, "rn2": 42, "epochs": 10}

for model in models:
    # Header line so each model's long training log is easy to locate in the output.
    print(f'training and results for {model}:')
    ate_model(data, model, **ten_epoch_settings)
    # Blank separator line between consecutive model runs.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6732.72 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5363.29 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.155934,0.817021,0.735632,0.774194
2,0.189800,0.155716,0.822134,0.796935,0.809339
3,0.081400,0.238581,0.798507,0.819923,0.809074
4,0.023800,0.315924,0.807229,0.770115,0.788235
5,0.014300,0.370962,0.824219,0.808429,0.816248
6,0.014300,0.419882,0.828571,0.777778,0.802372
7,0.006900,0.42602,0.832653,0.781609,0.806324
8,0.001900,0.43518,0.833992,0.808429,0.821012
9,0.001500,0.448943,0.828685,0.796935,0.8125
10,0.000300,0.456059,0.828685,0.796935,0.8125


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5208.86 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.78      0.84       323

   micro avg       0.91      0.78      0.84       323
   macro avg       0.91      0.78      0.84       323
weighted avg       0.91      0.78      0.84       323

Precision Score: 0.9100719424460432
Recall Score: 0.7832817337461301
F1 Score: 0.8419301164725459
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7420.82 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6523.02 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.146035,0.813765,0.791339,0.802395
2,0.198500,0.159785,0.820312,0.826772,0.823529
3,0.094500,0.217841,0.8125,0.818898,0.815686
4,0.036300,0.303409,0.765886,0.901575,0.82821
5,0.020300,0.307205,0.814229,0.811024,0.812623
6,0.020300,0.360567,0.836134,0.783465,0.808943
7,0.007500,0.401393,0.805344,0.830709,0.817829
8,0.005400,0.399152,0.798561,0.874016,0.834586
9,0.001400,0.397701,0.821293,0.850394,0.83559
10,0.001700,0.415791,0.808824,0.866142,0.836502


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5250.54 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.81      0.84       315

   micro avg       0.87      0.81      0.84       315
   macro avg       0.87      0.81      0.84       315
weighted avg       0.87      0.81      0.84       315

Precision Score: 0.8668941979522184
Recall Score: 0.8063492063492064
F1 Score: 0.8355263157894738
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6091.12 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5431.29 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.141636,0.828326,0.765873,0.795876
2,0.191900,0.184094,0.855263,0.77381,0.8125
3,0.091400,0.224732,0.83682,0.793651,0.814664
4,0.032800,0.225308,0.818533,0.84127,0.829746
5,0.020300,0.352512,0.806324,0.809524,0.807921
6,0.020300,0.343862,0.836653,0.833333,0.83499
7,0.005000,0.395735,0.830645,0.81746,0.824
8,0.002600,0.419514,0.836,0.829365,0.832669
9,0.002400,0.425919,0.840164,0.813492,0.826613
10,0.000800,0.417686,0.836,0.829365,0.832669


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5068.59 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.77      0.83       316

   micro avg       0.89      0.77      0.83       316
   macro avg       0.89      0.77      0.83       316
weighted avg       0.89      0.77      0.83       316

Precision Score: 0.8933823529411765
Recall Score: 0.7689873417721519
F1 Score: 0.826530612244898
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6466.26 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4839.54 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.148574,0.808362,0.802768,0.805556
2,0.213400,0.146242,0.821678,0.813149,0.817391
3,0.145800,0.157183,0.857143,0.851211,0.854167
4,0.098700,0.226581,0.779141,0.878893,0.826016
5,0.066100,0.246026,0.819398,0.847751,0.833333
6,0.066100,0.252373,0.854545,0.813149,0.833333
7,0.042000,0.295753,0.788644,0.865052,0.825083
8,0.030200,0.316428,0.789969,0.871972,0.828947
9,0.022800,0.32278,0.813333,0.844291,0.828523
10,0.017100,0.330756,0.814815,0.83737,0.825939


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4819.46 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.78      0.82       346

   micro avg       0.86      0.78      0.82       346
   macro avg       0.86      0.78      0.82       346
weighted avg       0.86      0.78      0.82       346

Precision Score: 0.8575949367088608
Recall Score: 0.7832369942196532
F1 Score: 0.8187311178247735
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7193.98 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5693.42 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.183387,0.833333,0.778302,0.804878
2,0.181500,0.165714,0.814815,0.830189,0.82243
3,0.114300,0.211667,0.826087,0.806604,0.816229
4,0.057300,0.189074,0.781893,0.896226,0.835165
5,0.037300,0.265611,0.829146,0.778302,0.80292
6,0.037300,0.264248,0.842365,0.806604,0.824096
7,0.017000,0.393104,0.846154,0.726415,0.781726
8,0.009200,0.321706,0.845771,0.801887,0.823245
9,0.006500,0.346467,0.854167,0.773585,0.811881
10,0.004600,0.341956,0.864583,0.783019,0.821782


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5541.19 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.83      0.84      0.84       279

   micro avg       0.83      0.84      0.84       279
   macro avg       0.83      0.84      0.84       279
weighted avg       0.83      0.84      0.84       279

Precision Score: 0.8303886925795053
Recall Score: 0.8422939068100358
F1 Score: 0.8362989323843417
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7095.67 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5619.26 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.150694,0.819095,0.768868,0.793187
2,0.185400,0.221328,0.816832,0.778302,0.797101
3,0.100400,0.166658,0.827586,0.792453,0.809639
4,0.050300,0.182153,0.810427,0.806604,0.808511
5,0.031400,0.25804,0.778261,0.84434,0.809955
6,0.031400,0.286487,0.815166,0.811321,0.813239
7,0.011000,0.299594,0.848485,0.792453,0.819512
8,0.005700,0.306467,0.826291,0.830189,0.828235
9,0.002900,0.353155,0.852941,0.820755,0.836538
10,0.002100,0.34407,0.837321,0.825472,0.831354


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5457.36 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.75      0.80       279

   micro avg       0.87      0.75      0.80       279
   macro avg       0.87      0.75      0.80       279
weighted avg       0.87      0.75      0.80       279

Precision Score: 0.8702928870292888
Recall Score: 0.7455197132616488
F1 Score: 0.8030888030888031
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6901.22 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5348.66 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.19364,0.845745,0.75,0.795
2,0.177400,0.13588,0.807339,0.830189,0.818605
3,0.107200,0.192823,0.834171,0.783019,0.807786
4,0.051600,0.188162,0.835749,0.816038,0.825776
5,0.038000,0.27838,0.806452,0.825472,0.815851
6,0.038000,0.306811,0.815668,0.834906,0.825175
7,0.011500,0.321121,0.841026,0.773585,0.805897
8,0.005900,0.387451,0.835897,0.768868,0.800983
9,0.006900,0.385824,0.827411,0.768868,0.797066
10,0.002000,0.386976,0.828283,0.773585,0.8


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5561.00 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.81      0.83       279

   micro avg       0.85      0.81      0.83       279
   macro avg       0.85      0.81      0.83       279
weighted avg       0.85      0.81      0.83       279

Precision Score: 0.8522727272727273
Recall Score: 0.8064516129032258
F1 Score: 0.8287292817679558
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7835.69 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6758.64 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for distilbert/distilbert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.144258,0.827731,0.775591,0.800813
2,0.205500,0.162938,0.821739,0.744094,0.780992
3,0.101200,0.226657,0.859813,0.724409,0.786325
4,0.045200,0.229085,0.802281,0.830709,0.816248
5,0.034400,0.262624,0.816,0.80315,0.809524
6,0.034400,0.305888,0.817797,0.759843,0.787755
7,0.012800,0.360086,0.790698,0.80315,0.796875
8,0.004100,0.359875,0.795367,0.811024,0.803119
9,0.004100,0.366117,0.792308,0.811024,0.801556
10,0.001400,0.370863,0.792308,0.811024,0.801556


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6199.02 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.81      0.83       315

   micro avg       0.86      0.81      0.83       315
   macro avg       0.86      0.81      0.83       315
weighted avg       0.86      0.81      0.83       315

Precision Score: 0.8639455782312925
Recall Score: 0.8063492063492064
F1 Score: 0.8341543513957307
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7522.51 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5951.54 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.109198,0.817021,0.831169,0.824034
2,0.189900,0.142552,0.794466,0.87013,0.830579
3,0.088000,0.226248,0.847291,0.744589,0.792627
4,0.037200,0.1855,0.822314,0.861472,0.841438
5,0.023100,0.278918,0.836207,0.839827,0.838013
6,0.023100,0.290126,0.84375,0.818182,0.830769
7,0.003700,0.302058,0.826446,0.865801,0.845666
8,0.005100,0.321133,0.821138,0.874459,0.84696
9,0.000700,0.33972,0.836207,0.839827,0.838013
10,0.000400,0.342359,0.834746,0.852814,0.843683


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5654.42 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.77      0.82       288

   micro avg       0.88      0.77      0.82       288
   macro avg       0.88      0.77      0.82       288
weighted avg       0.88      0.77      0.82       288

Precision Score: 0.8809523809523809
Recall Score: 0.7708333333333334
F1 Score: 0.8222222222222222
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5366.29 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5882.94 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.143593,0.812749,0.80315,0.807921
2,0.189900,0.182269,0.814229,0.811024,0.812623
3,0.088600,0.229563,0.868996,0.783465,0.824017
4,0.033300,0.266637,0.868085,0.80315,0.834356
5,0.020400,0.35947,0.858407,0.76378,0.808333
6,0.020400,0.353546,0.821012,0.830709,0.825832
7,0.008400,0.367762,0.865801,0.787402,0.824742
8,0.002700,0.382766,0.825397,0.818898,0.822134
9,0.000900,0.393986,0.830645,0.811024,0.820717
10,0.002400,0.402136,0.822835,0.822835,0.822835


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5893.07 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.77      0.83       315

   micro avg       0.90      0.77      0.83       315
   macro avg       0.90      0.77      0.83       315
weighted avg       0.90      0.77      0.83       315

Precision Score: 0.8992537313432836
Recall Score: 0.765079365079365
F1 Score: 0.8267581475128645
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

In [11]:
# Fine-tune and evaluate every checkpoint listed in `models` with the standard
# ATE routine, requesting 12 training epochs and passing 42 for both rn1 and rn2
# (the helper's log output reports these two values as the "random seeds").
# `ate_model` is presumably provided by the star import from
# functions.ate_model_train_OB in the first cell — verify.
# NOTE(review): the saved output of this cell announces "12 epochs" for each
# model, yet every per-epoch metrics table stops at epoch 10. Confirm whether
# the stored output is truncated/stale or whether the helper caps the epochs.
for model in models:
    # Header line so each model's log block can be told apart in the output.
    print(f'training and results for {model}:')
    ate_model(data, model, rn1=42, rn2=42, epochs=12)
    # Blank separator line between consecutive models.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7451.28 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6214.97 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for google-bert/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.152613,0.826271,0.747126,0.784708
2,0.192700,0.157459,0.836653,0.804598,0.820312
3,0.084000,0.256476,0.814516,0.773946,0.793713
4,0.025200,0.246603,0.83682,0.766284,0.8
5,0.013700,0.389158,0.823045,0.766284,0.793651
6,0.013700,0.425314,0.824786,0.739464,0.779798
7,0.005900,0.390681,0.819277,0.781609,0.8
8,0.001300,0.439771,0.814516,0.773946,0.793713
9,0.001200,0.459176,0.829787,0.747126,0.78629
10,0.000800,0.450369,0.82449,0.773946,0.798419


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5578.33 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.91      0.77      0.83       323

   micro avg       0.91      0.77      0.83       323
   macro avg       0.91      0.77      0.83       323
weighted avg       0.91      0.77      0.83       323

Precision Score: 0.9054545454545454
Recall Score: 0.7708978328173375
F1 Score: 0.8327759197324415
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7524.63 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6510.99 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.151376,0.828452,0.779528,0.803245
2,0.207300,0.17472,0.871245,0.799213,0.833676
3,0.099300,0.242942,0.806569,0.870079,0.837121
4,0.041200,0.247713,0.81203,0.850394,0.830769
5,0.026200,0.344759,0.821012,0.830709,0.825832
6,0.026200,0.369353,0.865546,0.811024,0.837398
7,0.008900,0.398198,0.80292,0.866142,0.833333
8,0.005800,0.403215,0.811111,0.862205,0.835878
9,0.002700,0.428254,0.828794,0.838583,0.833659
10,0.000700,0.455041,0.832653,0.80315,0.817635


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5870.99 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.74      0.81       315

   micro avg       0.90      0.74      0.81       315
   macro avg       0.90      0.74      0.81       315
weighted avg       0.90      0.74      0.81       315

Precision Score: 0.8992248062015504
Recall Score: 0.7365079365079366
F1 Score: 0.8097731239092497
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5844.37 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5023.83 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-uncased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140491,0.841202,0.777778,0.808247
2,0.187300,0.170501,0.854701,0.793651,0.823045
3,0.089600,0.225716,0.840816,0.81746,0.828974
4,0.033800,0.243995,0.804598,0.833333,0.818713
5,0.023500,0.326639,0.817797,0.765873,0.790984
6,0.023500,0.404941,0.83913,0.765873,0.80083
7,0.005500,0.39478,0.796154,0.821429,0.808594
8,0.002300,0.400745,0.8,0.825397,0.8125
9,0.001200,0.424725,0.827869,0.801587,0.814516
10,0.000500,0.440652,0.816733,0.813492,0.815109


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5120.82 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.70      0.79       316

   micro avg       0.90      0.70      0.79       316
   macro avg       0.90      0.70      0.79       316
weighted avg       0.90      0.70      0.79       316

Precision Score: 0.9016393442622951
Recall Score: 0.6962025316455697
F1 Score: 0.7857142857142858
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6703.16 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5028.96 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for FacebookAI/xlm-roberta-base with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.151191,0.838235,0.788927,0.812834
2,0.212000,0.158098,0.847015,0.785467,0.815081
3,0.152400,0.192857,0.848921,0.816609,0.832451
4,0.090200,0.195004,0.805195,0.858131,0.830821
5,0.068700,0.223434,0.834483,0.83737,0.835924
6,0.068700,0.232482,0.83557,0.861592,0.848382
7,0.038400,0.2774,0.815534,0.871972,0.842809
8,0.025900,0.290079,0.8125,0.854671,0.833052
9,0.015100,0.301551,0.839721,0.83391,0.836806
10,0.014000,0.30255,0.81759,0.868512,0.842282


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 4777.00 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.87      0.79      0.83       346

   micro avg       0.87      0.79      0.83       346
   macro avg       0.87      0.79      0.83       346
weighted avg       0.87      0.79      0.83       346

Precision Score: 0.8694267515923567
Recall Score: 0.7890173410404624
F1 Score: 0.8272727272727274
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 5074.76 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4112.94 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_best with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.180583,0.838889,0.712264,0.770408
2,0.181000,0.144356,0.831818,0.863208,0.847222
3,0.109100,0.203244,0.800866,0.872642,0.835214
4,0.058800,0.205852,0.817757,0.825472,0.821596
5,0.031700,0.332093,0.793722,0.834906,0.813793
6,0.031700,0.388515,0.830769,0.764151,0.796069
7,0.015300,0.349024,0.821256,0.801887,0.811456
8,0.008800,0.398207,0.831633,0.768868,0.79902
9,0.007200,0.433267,0.809045,0.759434,0.783455
10,0.004300,0.456135,0.838542,0.759434,0.79703


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5674.22 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.74      0.81       279

   micro avg       0.88      0.74      0.81       279
   macro avg       0.88      0.74      0.81       279
weighted avg       0.88      0.74      0.81       279

Precision Score: 0.8846153846153846
Recall Score: 0.7419354838709677
F1 Score: 0.8070175438596492
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7385.42 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6257.38 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_filtered_base_best with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.129242,0.79638,0.830189,0.812933
2,0.176900,0.144196,0.794521,0.820755,0.807425
3,0.103800,0.184937,0.804444,0.853774,0.828375
4,0.052700,0.186701,0.784753,0.825472,0.804598
5,0.035600,0.244386,0.786611,0.886792,0.833703
6,0.035600,0.258669,0.810573,0.867925,0.838269
7,0.012100,0.315011,0.830846,0.787736,0.808717
8,0.006900,0.291145,0.821101,0.84434,0.832558
9,0.003600,0.337856,0.816901,0.820755,0.818824
10,0.000900,0.358809,0.817757,0.825472,0.821596


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5862.54 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.76      0.80       279

   micro avg       0.85      0.76      0.80       279
   macro avg       0.85      0.76      0.80       279
weighted avg       0.85      0.76      0.80       279

Precision Score: 0.848
Recall Score: 0.7598566308243727
F1 Score: 0.8015122873345936
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O'

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7427.53 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5773.11 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for TUM/GottBERT_base_last with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.162489,0.835,0.787736,0.81068
2,0.179100,0.136849,0.819005,0.853774,0.836028
3,0.107500,0.146438,0.813278,0.924528,0.865342
4,0.057300,0.196739,0.814286,0.806604,0.810427
5,0.038400,0.275449,0.795349,0.806604,0.800937
6,0.038400,0.282135,0.816038,0.816038,0.816038
7,0.019200,0.273606,0.8,0.924528,0.857768
8,0.011800,0.351198,0.825871,0.783019,0.803874
9,0.003400,0.349286,0.841584,0.801887,0.821256
10,0.005100,0.332842,0.813953,0.825472,0.819672


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5850.04 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.83      0.83      0.83       279

   micro avg       0.83      0.83      0.83       279
   macro avg       0.83      0.83      0.83       279
weighted avg       0.83      0.83      0.83       279

Precision Score: 0.8345323741007195
Recall Score: 0.8315412186379928
F1 Score: 0.8330341113105925
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 8166.69 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6971.98 examples/s]


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}


  trainer = Trainer(


Training results for distilbert/distilbert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.138312,0.808765,0.799213,0.80396
2,0.209300,0.160013,0.846847,0.740157,0.789916
3,0.098600,0.190818,0.821577,0.779528,0.8
4,0.045500,0.225302,0.839662,0.783465,0.810591
5,0.030200,0.259969,0.804688,0.811024,0.807843
6,0.030200,0.29463,0.816733,0.807087,0.811881
7,0.012000,0.322202,0.8125,0.818898,0.815686
8,0.005300,0.356956,0.82716,0.791339,0.808853
9,0.002200,0.362176,0.799242,0.830709,0.814672
10,0.001500,0.380425,0.801556,0.811024,0.806262


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5916.78 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.86      0.80      0.83       315

   micro avg       0.86      0.80      0.83       315
   macro avg       0.86      0.80      0.83       315
weighted avg       0.86      0.80      0.83       315

Precision Score: 0.863013698630137
Recall Score: 0.8
F1 Score: 0.8303130148270181
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', '

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7577.16 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6025.70 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for GerMedBERT/medbert-512 with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.110658,0.837209,0.779221,0.807175
2,0.189600,0.157869,0.829787,0.844156,0.83691
3,0.092900,0.224962,0.825112,0.796537,0.810573
4,0.038200,0.229189,0.843318,0.792208,0.816964
5,0.025700,0.248979,0.815126,0.839827,0.827292
6,0.025700,0.316327,0.84434,0.774892,0.808126
7,0.006900,0.338732,0.824786,0.835498,0.830108
8,0.005200,0.325793,0.814346,0.835498,0.824786
9,0.003500,0.345409,0.842105,0.831169,0.836601
10,0.000900,0.358279,0.848889,0.82684,0.837719


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5794.73 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.76      0.82       288

   micro avg       0.89      0.76      0.82       288
   macro avg       0.89      0.76      0.82       288
weighted avg       0.89      0.76      0.82       288

Precision Score: 0.8943089430894309
Recall Score: 0.7638888888888888
F1 Score: 0.8239700374531834
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4887.60 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3427.30 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for deepset/gbert-base with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.139483,0.859031,0.767717,0.810811
2,0.187000,0.161027,0.843137,0.846457,0.844794
3,0.090300,0.227668,0.844,0.830709,0.837302
4,0.033200,0.249084,0.871681,0.775591,0.820833
5,0.021500,0.325955,0.822134,0.818898,0.820513
6,0.021500,0.334239,0.838843,0.799213,0.818548
7,0.009400,0.37715,0.832653,0.80315,0.817635
8,0.005100,0.371767,0.851406,0.834646,0.842942
9,0.001900,0.366353,0.854251,0.830709,0.842315
10,0.001500,0.371999,0.85259,0.84252,0.847525


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5920.31 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.83      0.86       315

   micro avg       0.90      0.83      0.86       315
   macro avg       0.90      0.83      0.86       315
weighted avg       0.90      0.83      0.86       315

Precision Score: 0.9
Recall Score: 0.8285714285714286
F1 Score: 0.8628099173553719
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 

### 2. Train category-aware ATE Models for 5, 6, 7, 8, 10, 12 epochs

In [5]:
# Fine-tune and evaluate every checkpoint listed in `models` with the
# category-aware ATE routine (`ate_cat_model`), requesting 5 training epochs
# and passing 42 for both rn1 and rn2 (reported as the random seeds in the log).
# NOTE(review): this cell has execution count 5 although it follows cell 11,
# and count 5 is also used by the first standard-ATE training cell, so it was
# run after a kernel restart or out of order. It depends on `data` and `models`
# from the loading/config cells — confirm the notebook survives
# Restart Kernel -> Run All.
# NOTE(review): the saved output prints class weights for 7 labels (ids 0-6)
# but "Expected label IDs" 0-7; worth checking the label mapping in the helper.
for model in models:
    # Header line so each model's log block can be told apart in the output.
    print(f'training and results for {model}:')
    ate_cat_model(data, model, rn1=42, rn2=42, epochs=5)
    # Blank separator line between consecutive models.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6339.03 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5290.15 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.222765,0.703057,0.616858,0.657143
2,0.329900,0.19429,0.738776,0.693487,0.715415
3,0.133200,0.232997,0.721374,0.724138,0.722753
4,0.048700,0.256593,0.770833,0.708812,0.738523
5,0.027200,0.2898,0.777778,0.697318,0.735354


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6010.04 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.84      0.90      0.87        52
    Krankenhaus       0.86      0.68      0.76       119
       Personal       0.64      0.64      0.64        14
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.74      0.42      0.54        33
 mediz. Service       0.82      0.79      0.81        87

      micro avg       0.83      0.73      0.78       323
      macro avg       0.81      0.73      0.76       323
   weighted avg       0.83      0.73      0.77       323

Precision Score: 0.8315789473684211
Recall Score: 0.7337461300309598
F1 Score: 0.7796052631578948
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7804.36 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6793.32 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-cased for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.221487,0.686916,0.57874,0.628205
2,0.353600,0.201327,0.703252,0.681102,0.692
3,0.153100,0.226097,0.780172,0.712598,0.744856
4,0.068000,0.255953,0.754864,0.76378,0.759295
5,0.036200,0.27047,0.763052,0.748031,0.755467


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5844.76 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.81      0.95      0.87        55
    Krankenhaus       0.86      0.65      0.74       117
       Personal       0.64      0.54      0.58        13
 Pflegepersonal       1.00      0.89      0.94        18
anderer Service       0.69      0.57      0.62        35
 mediz. Service       0.81      0.75      0.78        77

      micro avg       0.82      0.73      0.77       315
      macro avg       0.80      0.72      0.76       315
   weighted avg       0.82      0.73      0.77       315

Precision Score: 0.8178571428571428
Recall Score: 0.726984126984127
F1 Score: 0.7697478991596639
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4651.11 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5245.35 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-uncased for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226346,0.730769,0.603175,0.66087
2,0.324000,0.23641,0.754545,0.65873,0.70339
3,0.147900,0.233186,0.785425,0.769841,0.777555
4,0.063900,0.276949,0.768293,0.75,0.759036
5,0.041700,0.284904,0.776423,0.757937,0.767068


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5683.11 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.81      0.95      0.88        63
    Krankenhaus       0.90      0.51      0.65       112
       Personal       0.64      0.64      0.64        14
 Pflegepersonal       1.00      0.95      0.97        19
anderer Service       0.63      0.35      0.45        34
 mediz. Service       0.85      0.77      0.81        74

      micro avg       0.84      0.67      0.75       316
      macro avg       0.81      0.70      0.73       316
   weighted avg       0.84      0.67      0.73       316

Precision Score: 0.8352941176470589
Recall Score: 0.6740506329113924
F1 Score: 0.7460595446584938
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6207.71 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5650.29 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.300229,0.645455,0.491349,0.557957
2,0.419300,0.224869,0.730769,0.657439,0.692168
3,0.232800,0.216382,0.72449,0.737024,0.730703
4,0.149900,0.208943,0.725424,0.740484,0.732877
5,0.107600,0.224733,0.741259,0.733564,0.737391


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5545.50 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.82      0.95      0.88        59
    Krankenhaus       0.81      0.73      0.77       120
       Personal       0.62      0.67      0.65        15
 Pflegepersonal       0.96      0.92      0.94        24
anderer Service       0.62      0.33      0.43        45
 mediz. Service       0.68      0.72      0.70        83

      micro avg       0.77      0.73      0.74       346
      macro avg       0.75      0.72      0.73       346
   weighted avg       0.76      0.73      0.73       346

Precision Score: 0.7652439024390244
Recall Score: 0.7254335260115607
F1 Score: 0.7448071216617211
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 5181.21 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5583.04 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_best for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.26122,0.711111,0.603774,0.653061
2,0.317600,0.194135,0.717172,0.669811,0.692683
3,0.164300,0.22751,0.716981,0.716981,0.716981
4,0.084300,0.234508,0.760204,0.70283,0.730392
5,0.055600,0.273006,0.731707,0.707547,0.719424


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5896.72 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      0.98      0.95        52
    Krankenhaus       0.92      0.74      0.82       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.55      0.37      0.44        30
 mediz. Service       0.70      0.72      0.71        67

      micro avg       0.82      0.75      0.79       279
      macro avg       0.80      0.76      0.77       279
   weighted avg       0.82      0.75      0.78       279

Precision Score: 0.8235294117647058
Recall Score: 0.7526881720430108
F1 Score: 0.7865168539325843
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7329.84 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6275.92 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.193376,0.766667,0.650943,0.704082
2,0.323400,0.170987,0.76555,0.754717,0.760095
3,0.161700,0.169006,0.781553,0.759434,0.770335
4,0.083200,0.206142,0.794872,0.731132,0.761671
5,0.054900,0.189984,0.803922,0.773585,0.788462


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5702.50 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.89      0.96      0.93        52
    Krankenhaus       0.93      0.71      0.80       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       1.00      0.93      0.96        14
anderer Service       0.73      0.37      0.49        30
 mediz. Service       0.75      0.73      0.74        67

      micro avg       0.85      0.74      0.79       279
      macro avg       0.83      0.74      0.77       279
   weighted avg       0.85      0.74      0.78       279

Precision Score: 0.8512396694214877
Recall Score: 0.7383512544802867
F1 Score: 0.7907869481765835
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7293.53 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6160.20 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.230513,0.713514,0.622642,0.664987
2,0.314100,0.183804,0.77,0.726415,0.747573
3,0.171800,0.226143,0.775,0.731132,0.752427
4,0.083600,0.290549,0.791878,0.735849,0.762836
5,0.054200,0.290997,0.80102,0.740566,0.769608


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5439.32 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.96      0.98      0.97        52
    Krankenhaus       0.91      0.78      0.84       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       0.93      1.00      0.97        14
anderer Service       0.75      0.40      0.52        30
 mediz. Service       0.67      0.76      0.71        67

      micro avg       0.83      0.78      0.81       279
      macro avg       0.82      0.78      0.79       279
   weighted avg       0.84      0.78      0.80       279

Precision Score: 0.8320610687022901
Recall Score: 0.7813620071684588
F1 Score: 0.8059149722735676
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7950.43 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6567.93 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.241931,0.632558,0.535433,0.579957
2,0.397300,0.222672,0.715556,0.633858,0.672234
3,0.172500,0.243208,0.748792,0.610236,0.672451
4,0.096700,0.243671,0.75,0.685039,0.716049
5,0.069700,0.255057,0.73617,0.681102,0.707566


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6145.15 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.89      0.93      0.91        55
    Krankenhaus       0.94      0.56      0.71       117
       Personal       0.60      0.46      0.52        13
 Pflegepersonal       0.94      0.89      0.91        18
anderer Service       0.75      0.34      0.47        35
 mediz. Service       0.68      0.68      0.68        77

      micro avg       0.82      0.64      0.72       315
      macro avg       0.80      0.64      0.70       315
   weighted avg       0.83      0.64      0.71       315

Precision Score: 0.8218623481781376
Recall Score: 0.6444444444444445
F1 Score: 0.7224199288256228
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7561.98 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5863.32 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.202197,0.685279,0.584416,0.630841
2,0.320200,0.171362,0.70354,0.688312,0.695842
3,0.146400,0.216528,0.753623,0.675325,0.712329
4,0.063300,0.23398,0.728889,0.709957,0.719298
5,0.040300,0.249327,0.732143,0.709957,0.720879


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5597.67 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.98      0.96      0.97        54
    Krankenhaus       0.89      0.76      0.82       105
       Personal       0.93      0.88      0.90        16
 Pflegepersonal       0.93      0.93      0.93        15
anderer Service       0.56      0.40      0.47        35
 mediz. Service       0.79      0.60      0.68        63

      micro avg       0.86      0.74      0.79       288
      macro avg       0.85      0.76      0.80       288
   weighted avg       0.85      0.74      0.79       288

Precision Score: 0.8617886178861789
Recall Score: 0.7361111111111112
F1 Score: 0.7940074906367042
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7338.76 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5535.62 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.229855,0.685185,0.582677,0.629787
2,0.337800,0.188612,0.784689,0.645669,0.708423
3,0.147400,0.217004,0.758621,0.692913,0.72428
4,0.059300,0.246612,0.769565,0.69685,0.731405
5,0.038100,0.255069,0.755102,0.728346,0.741483


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5930.81 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.95      0.91        55
    Krankenhaus       0.90      0.62      0.73       117
       Personal       0.80      0.62      0.70        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.69      0.51      0.59        35
 mediz. Service       0.88      0.73      0.79        77

      micro avg       0.87      0.71      0.78       315
      macro avg       0.86      0.73      0.78       315
   weighted avg       0.87      0.71      0.77       315

Precision Score: 0.87109375
Recall Score: 0.707936507936508
F1 Score: 0.7810858143607706
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', '

In [6]:
# Second sweep: fine-tune and evaluate every checkpoint listed in `models`
# (defined in cell 4) on `data` (the CSV loaded in cell 3), this time with
# epochs=6 and both seed arguments fixed to 42.
# For each checkpoint the helper's logged output below shows a per-epoch
# validation table (loss / precision / recall / F1) followed by a
# classification report on the test data.
# `ate_cat_model` is not defined in this notebook; presumably it comes from the
# star import of functions.ate_model_train_OB -- TODO confirm.
# NOTE(review): the previous cell calls `ate_model`, this one `ate_cat_model`,
# although both sit under the same "standard ATE models" heading -- confirm the
# different helper is intended and not a copy/paste slip.
for model in models:
    # Header line so each checkpoint's log section can be located in the output.
    print(f'training and results for {model}:')
    # rn1 / rn2 are echoed by the helper as "random seeds 42, 42"; what each one
    # seeds (data split vs. training) is not visible here -- verify in the helper.
    ate_cat_model(data, model, rn1=42, rn2=42, epochs=6)
    # Blank line to separate consecutive checkpoints in the output.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7518.54 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6282.81 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.223776,0.703704,0.582375,0.637317
2,0.325700,0.207315,0.698276,0.62069,0.657201
3,0.134500,0.220422,0.767347,0.720307,0.743083
4,0.052200,0.257904,0.775424,0.701149,0.736419
5,0.024000,0.298461,0.760684,0.681992,0.719192
6,0.024000,0.310659,0.757322,0.693487,0.724


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5765.21 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.96      0.91        52
    Krankenhaus       0.92      0.61      0.73       119
       Personal       0.67      0.43      0.52        14
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.68      0.45      0.55        33
 mediz. Service       0.77      0.80      0.79        87

      micro avg       0.84      0.71      0.77       323
      macro avg       0.82      0.70      0.74       323
   weighted avg       0.84      0.71      0.76       323

Precision Score: 0.8363636363636363
Recall Score: 0.7120743034055728
F1 Score: 0.7692307692307693
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6679.78 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6379.79 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-cased for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.221036,0.704545,0.610236,0.654008
2,0.342700,0.201095,0.752988,0.744094,0.748515
3,0.163100,0.229705,0.722846,0.759843,0.740883
4,0.075200,0.2306,0.792373,0.73622,0.763265
5,0.045100,0.276974,0.738971,0.791339,0.764259
6,0.045100,0.282252,0.756654,0.783465,0.769826


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5869.38 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.95      0.91        55
    Krankenhaus       0.91      0.73      0.81       117
       Personal       0.64      0.54      0.58        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.69      0.57      0.62        35
 mediz. Service       0.80      0.78      0.79        77

      micro avg       0.85      0.77      0.80       315
      macro avg       0.81      0.75      0.78       315
   weighted avg       0.85      0.77      0.80       315

Precision Score: 0.8456140350877193
Recall Score: 0.765079365079365
F1 Score: 0.8033333333333333
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6615.79 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5823.50 examples/s]


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}


  trainer = Trainer(


Training dbmdz/bert-base-german-uncased for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226915,0.702703,0.619048,0.658228
2,0.325900,0.229937,0.727273,0.666667,0.695652
3,0.151100,0.21535,0.77551,0.753968,0.764588
4,0.063100,0.242977,0.776371,0.730159,0.752556
5,0.039600,0.256752,0.744361,0.785714,0.764479
6,0.039600,0.292526,0.752033,0.734127,0.742972


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5474.12 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.94      0.94        63
    Krankenhaus       0.91      0.61      0.73       112
       Personal       0.71      0.71      0.71        14
 Pflegepersonal       1.00      0.95      0.97        19
anderer Service       0.58      0.32      0.42        34
 mediz. Service       0.87      0.74      0.80        74

      micro avg       0.88      0.70      0.78       316
      macro avg       0.83      0.71      0.76       316
   weighted avg       0.87      0.70      0.77       316

Precision Score: 0.876984126984127
Recall Score: 0.6993670886075949
F1 Score: 0.778169014084507
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O',

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6794.07 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5069.16 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.26635,0.637168,0.49827,0.559223
2,0.418100,0.207464,0.776892,0.67474,0.722222
3,0.215500,0.208947,0.762238,0.754325,0.758261
4,0.132000,0.222055,0.779661,0.795848,0.787671
5,0.093000,0.262956,0.732087,0.813149,0.770492
6,0.093000,0.26263,0.77551,0.788927,0.782161


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5177.34 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.93      0.90        59
    Krankenhaus       0.79      0.70      0.74       120
       Personal       0.77      0.67      0.71        15
 Pflegepersonal       1.00      0.96      0.98        24
anderer Service       0.42      0.38      0.40        45
 mediz. Service       0.91      0.75      0.82        83

      micro avg       0.80      0.73      0.76       346
      macro avg       0.80      0.73      0.76       346
   weighted avg       0.80      0.73      0.76       346

Precision Score: 0.8019169329073482
Recall Score: 0.7254335260115607
F1 Score: 0.7617602427921094
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7032.22 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5380.66 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_best for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.233617,0.715116,0.580189,0.640625
2,0.319700,0.199339,0.758242,0.650943,0.700508
3,0.168800,0.224806,0.730233,0.740566,0.735363
4,0.082200,0.268663,0.712195,0.688679,0.70024
5,0.054000,0.294921,0.707763,0.731132,0.719258
6,0.054000,0.333623,0.70283,0.70283,0.70283


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5607.14 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.84      0.98      0.90        52
    Krankenhaus       0.94      0.60      0.73       104
       Personal       0.64      0.75      0.69        12
 Pflegepersonal       0.93      0.93      0.93        14
anderer Service       0.59      0.33      0.43        30
 mediz. Service       0.66      0.81      0.72        67

      micro avg       0.78      0.71      0.75       279
      macro avg       0.77      0.73      0.73       279
   weighted avg       0.80      0.71      0.74       279

Precision Score: 0.7834645669291339
Recall Score: 0.7132616487455197
F1 Score: 0.7467166979362102
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7199.02 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5477.79 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.199197,0.689119,0.627358,0.65679
2,0.324200,0.176801,0.744681,0.660377,0.7
3,0.168500,0.16511,0.753363,0.792453,0.772414
4,0.087500,0.175075,0.778894,0.731132,0.754258
5,0.059400,0.223883,0.741627,0.731132,0.736342
6,0.059400,0.243691,0.748792,0.731132,0.739857


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5582.92 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.96      0.98      0.97        52
    Krankenhaus       0.86      0.72      0.79       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       1.00      0.93      0.96        14
anderer Service       0.64      0.30      0.41        30
 mediz. Service       0.64      0.81      0.72        67

      micro avg       0.80      0.76      0.78       279
      macro avg       0.80      0.75      0.76       279
   weighted avg       0.80      0.76      0.77       279

Precision Score: 0.7992424242424242
Recall Score: 0.7562724014336918
F1 Score: 0.7771639042357273
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7283.05 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6052.65 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.248563,0.728916,0.570755,0.640212
2,0.336300,0.213689,0.719577,0.641509,0.678304
3,0.171200,0.214071,0.733645,0.740566,0.737089
4,0.083000,0.235917,0.722222,0.735849,0.728972
5,0.059200,0.291103,0.700855,0.773585,0.735426
6,0.059200,0.31813,0.69163,0.740566,0.715262


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5440.22 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.91      0.98      0.94        52
    Krankenhaus       0.88      0.61      0.72       104
       Personal       0.75      0.75      0.75        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.50      0.33      0.40        30
 mediz. Service       0.65      0.82      0.73        67

      micro avg       0.78      0.72      0.75       279
      macro avg       0.78      0.75      0.76       279
   weighted avg       0.79      0.72      0.74       279

Precision Score: 0.7829457364341085
Recall Score: 0.7240143369175627
F1 Score: 0.7523277467411544
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7189.94 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5366.48 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.232994,0.648402,0.559055,0.600423
2,0.388600,0.21897,0.688525,0.661417,0.674699
3,0.173900,0.232267,0.75,0.649606,0.696203
4,0.091800,0.240864,0.757202,0.724409,0.740443
5,0.062200,0.266235,0.759494,0.708661,0.733198
6,0.062200,0.280191,0.741667,0.700787,0.720648


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5909.59 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.91      0.93      0.92        55
    Krankenhaus       0.92      0.61      0.73       117
       Personal       0.62      0.62      0.62        13
 Pflegepersonal       1.00      0.89      0.94        18
anderer Service       0.78      0.40      0.53        35
 mediz. Service       0.65      0.71      0.68        77

      micro avg       0.81      0.68      0.74       315
      macro avg       0.81      0.69      0.74       315
   weighted avg       0.83      0.68      0.74       315

Precision Score: 0.8113207547169812
Recall Score: 0.6825396825396826
F1 Score: 0.7413793103448275
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6287.34 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3952.83 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.190511,0.707692,0.597403,0.647887
2,0.320100,0.167146,0.733333,0.714286,0.723684
3,0.148900,0.226236,0.763285,0.683983,0.721461
4,0.063200,0.248713,0.741784,0.683983,0.711712
5,0.036500,0.277474,0.724891,0.718615,0.721739
6,0.036500,0.286391,0.732456,0.722944,0.727669


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5524.09 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      0.98      0.99        54
    Krankenhaus       0.86      0.81      0.83       105
       Personal       1.00      0.62      0.77        16
 Pflegepersonal       0.79      1.00      0.88        15
anderer Service       0.58      0.43      0.49        35
 mediz. Service       0.85      0.70      0.77        63

      micro avg       0.86      0.77      0.81       288
      macro avg       0.85      0.76      0.79       288
   weighted avg       0.85      0.77      0.81       288

Precision Score: 0.8571428571428571
Recall Score: 0.7708333333333334
F1 Score: 0.8117001828153565
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7190.27 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5768.07 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.238001,0.67907,0.574803,0.622601
2,0.330400,0.187735,0.74026,0.673228,0.705155
3,0.143500,0.21213,0.792531,0.751969,0.771717
4,0.056900,0.230783,0.768595,0.732283,0.75
5,0.033400,0.25467,0.741176,0.744094,0.742633
6,0.033400,0.252418,0.767068,0.751969,0.759443


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5821.54 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.85      0.95      0.90        55
    Krankenhaus       0.90      0.76      0.82       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.80      0.34      0.48        35
 mediz. Service       0.85      0.74      0.79        77

      micro avg       0.86      0.75      0.80       315
      macro avg       0.84      0.74      0.77       315
   weighted avg       0.86      0.75      0.79       315

Precision Score: 0.8644688644688645
Recall Score: 0.7492063492063492
F1 Score: 0.8027210884353742
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [7]:
# Run ate_cat_model once per checkpoint name in `models`, requesting 7 epochs.
# The training log printed by each call reports rn1/rn2 as "random seeds 42, 42".
# NOTE(review): ate_cat_model is presumably provided by the star import of
# functions.ate_model_train_OB at the top of the notebook; which seed rn1 and
# rn2 each control (e.g. data split vs. training) is not visible here -- confirm
# in that module.
for model in models:
    # Header line so each checkpoint's log can be located in the long cell output.
    print(f'training and results for {model}:')
    ate_cat_model(data, model, rn1=42, rn2=42, epochs=7)
    # Blank line to separate the output of consecutive checkpoints.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7147.64 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5989.15 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.223721,0.690909,0.582375,0.632017
2,0.332800,0.191899,0.756198,0.701149,0.727634
3,0.136400,0.240071,0.738956,0.704981,0.721569
4,0.048900,0.290695,0.748899,0.651341,0.696721
5,0.025500,0.332393,0.763485,0.704981,0.733068
6,0.025500,0.348485,0.76824,0.685824,0.724696
7,0.005900,0.359795,0.760504,0.693487,0.725451


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5772.76 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.80      0.92      0.86        52
    Krankenhaus       0.88      0.64      0.74       119
       Personal       0.67      0.57      0.62        14
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.68      0.45      0.55        33
 mediz. Service       0.89      0.78      0.83        87

      micro avg       0.84      0.72      0.78       323
      macro avg       0.80      0.72      0.75       323
   weighted avg       0.84      0.72      0.77       323

Precision Score: 0.8436363636363636
Recall Score: 0.718266253869969
F1 Score: 0.7759197324414716
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7562.79 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6526.34 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-cased for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.247812,0.694444,0.590551,0.638298
2,0.355400,0.173476,0.750973,0.759843,0.755382
3,0.161400,0.20179,0.771654,0.771654,0.771654
4,0.068400,0.226542,0.775591,0.775591,0.775591
5,0.038200,0.249443,0.752896,0.767717,0.760234
6,0.038200,0.263661,0.770428,0.779528,0.774951
7,0.015200,0.28532,0.772549,0.775591,0.774067


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5847.64 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.91      0.95      0.93        55
    Krankenhaus       0.88      0.63      0.74       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.58      0.40      0.47        35
 mediz. Service       0.80      0.78      0.79        77

      micro avg       0.84      0.72      0.77       315
      macro avg       0.81      0.73      0.77       315
   weighted avg       0.83      0.72      0.77       315

Precision Score: 0.837037037037037
Recall Score: 0.7174603174603175
F1 Score: 0.7726495726495727
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 5309.82 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4917.06 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-uncased for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.220856,0.729064,0.587302,0.650549
2,0.335300,0.209449,0.765487,0.686508,0.723849
3,0.148400,0.243748,0.837963,0.718254,0.773504
4,0.063400,0.251579,0.771318,0.789683,0.780392
5,0.034300,0.287322,0.779167,0.742063,0.760163
6,0.034300,0.301348,0.810924,0.765873,0.787755
7,0.011500,0.321273,0.781893,0.753968,0.767677


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5568.38 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.94      0.91        63
    Krankenhaus       0.93      0.60      0.73       112
       Personal       0.71      0.86      0.77        14
 Pflegepersonal       1.00      0.95      0.97        19
anderer Service       0.54      0.38      0.45        34
 mediz. Service       0.82      0.76      0.79        74

      micro avg       0.85      0.71      0.77       316
      macro avg       0.81      0.75      0.77       316
   weighted avg       0.85      0.71      0.76       316

Precision Score: 0.8458646616541353
Recall Score: 0.7120253164556962
F1 Score: 0.7731958762886597
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6750.88 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5530.49 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.283419,0.570896,0.529412,0.549372
2,0.453900,0.204128,0.693141,0.66436,0.678445
3,0.229600,0.222915,0.71223,0.685121,0.698413
4,0.138300,0.23301,0.762238,0.754325,0.758261
5,0.107200,0.25528,0.716612,0.761246,0.738255
6,0.107200,0.249093,0.749129,0.743945,0.746528
7,0.062600,0.264,0.751724,0.754325,0.753022


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5415.29 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.81      0.88      0.85        59
    Krankenhaus       0.78      0.67      0.72       120
       Personal       0.71      0.67      0.69        15
 Pflegepersonal       1.00      0.88      0.93        24
anderer Service       0.52      0.31      0.39        45
 mediz. Service       0.70      0.72      0.71        83

      micro avg       0.75      0.68      0.72       346
      macro avg       0.75      0.69      0.71       346
   weighted avg       0.75      0.68      0.71       346

Precision Score: 0.7547770700636943
Recall Score: 0.684971098265896
F1 Score: 0.7181818181818183
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6888.93 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5810.16 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_best for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.244206,0.753086,0.575472,0.652406
2,0.338300,0.199671,0.71028,0.716981,0.713615
3,0.172700,0.247849,0.689815,0.70283,0.696262
4,0.089100,0.271641,0.757426,0.721698,0.73913
5,0.060800,0.315351,0.75,0.707547,0.728155
6,0.060800,0.349454,0.740933,0.674528,0.706173
7,0.027500,0.369082,0.742105,0.665094,0.701493


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5160.97 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.98      0.98      0.98        52
    Krankenhaus       0.84      0.80      0.82       104
       Personal       0.75      0.75      0.75        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.67      0.33      0.44        30
 mediz. Service       0.72      0.78      0.75        67

      micro avg       0.83      0.78      0.81       279
      macro avg       0.83      0.77      0.79       279
   weighted avg       0.82      0.78      0.80       279

Precision Score: 0.8295454545454546
Recall Score: 0.7849462365591398
F1 Score: 0.8066298342541436
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7297.44 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5606.54 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.206075,0.68306,0.589623,0.632911
2,0.329200,0.162027,0.742857,0.735849,0.739336
3,0.162000,0.158456,0.787736,0.787736,0.787736
4,0.084600,0.189384,0.723982,0.754717,0.73903
5,0.049300,0.246227,0.700461,0.716981,0.708625
6,0.049300,0.281494,0.769608,0.740566,0.754808
7,0.020100,0.288761,0.759615,0.745283,0.752381


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5609.05 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.98      0.96        52
    Krankenhaus       0.82      0.62      0.71       104
       Personal       0.64      0.75      0.69        12
 Pflegepersonal       1.00      0.93      0.96        14
anderer Service       0.71      0.33      0.45        30
 mediz. Service       0.63      0.78      0.69        67

      micro avg       0.78      0.72      0.75       279
      macro avg       0.79      0.73      0.75       279
   weighted avg       0.79      0.72      0.74       279

Precision Score: 0.7782101167315175
Recall Score: 0.7168458781362007
F1 Score: 0.746268656716418
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7137.86 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5071.04 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.283365,0.724359,0.533019,0.61413
2,0.356300,0.188622,0.756345,0.70283,0.728606
3,0.183700,0.187824,0.742081,0.773585,0.757506
4,0.090300,0.220014,0.731915,0.811321,0.769575
5,0.061800,0.278749,0.744292,0.768868,0.756381
6,0.061800,0.283812,0.75,0.735849,0.742857
7,0.028200,0.30758,0.752381,0.745283,0.748815


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5565.20 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.89      0.98      0.94        52
    Krankenhaus       0.86      0.79      0.82       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       0.82      1.00      0.90        14
anderer Service       0.58      0.37      0.45        30
 mediz. Service       0.66      0.76      0.71        67

      micro avg       0.78      0.78      0.78       279
      macro avg       0.75      0.77      0.76       279
   weighted avg       0.78      0.78      0.78       279

Precision Score: 0.7841726618705036
Recall Score: 0.7813620071684588
F1 Score: 0.7827648114901257
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 8050.58 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6057.75 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.235504,0.701923,0.574803,0.632035
2,0.394800,0.20747,0.70339,0.653543,0.677551
3,0.179500,0.229822,0.771028,0.649606,0.705128
4,0.091800,0.23703,0.758065,0.740157,0.749004
5,0.062200,0.250153,0.743083,0.740157,0.741617
6,0.062200,0.26161,0.762846,0.759843,0.761341
7,0.034200,0.276165,0.762097,0.744094,0.752988


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6326.06 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.94      0.54      0.68       117
       Personal       0.75      0.46      0.57        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.71      0.43      0.54        35
 mediz. Service       0.63      0.75      0.69        77

      micro avg       0.79      0.67      0.73       315
      macro avg       0.81      0.68      0.72       315
   weighted avg       0.82      0.67      0.72       315

Precision Score: 0.793233082706767
Recall Score: 0.6698412698412698
F1 Score: 0.7263339070567986
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7460.04 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6497.21 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.208702,0.653266,0.562771,0.604651
2,0.319400,0.188286,0.709402,0.718615,0.713978
3,0.146400,0.233942,0.75,0.688312,0.717833
4,0.071400,0.247464,0.764423,0.688312,0.724374
5,0.035900,0.262795,0.740426,0.753247,0.746781
6,0.035900,0.293373,0.748879,0.722944,0.735683
7,0.011900,0.299455,0.744589,0.744589,0.744589


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5497.76 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.98      0.96      0.97        54
    Krankenhaus       0.81      0.78      0.80       105
       Personal       0.93      0.88      0.90        16
 Pflegepersonal       0.93      0.93      0.93        15
anderer Service       0.65      0.43      0.52        35
 mediz. Service       0.85      0.63      0.73        63

      micro avg       0.85      0.75      0.80       288
      macro avg       0.86      0.77      0.81       288
   weighted avg       0.85      0.75      0.79       288

Precision Score: 0.8543307086614174
Recall Score: 0.7534722222222222
F1 Score: 0.8007380073800738
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7400.48 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5151.14 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.23892,0.709845,0.53937,0.612975
2,0.327400,0.200221,0.76652,0.685039,0.723493
3,0.144800,0.232318,0.766234,0.69685,0.729897
4,0.057800,0.245373,0.75,0.732283,0.741036
5,0.034800,0.260837,0.736059,0.779528,0.75717
6,0.034800,0.271196,0.746154,0.76378,0.754864
7,0.011000,0.28108,0.753846,0.771654,0.762646


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5440.43 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.93      0.90        55
    Krankenhaus       0.90      0.72      0.80       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.81      0.49      0.61        35
 mediz. Service       0.76      0.78      0.77        77

      micro avg       0.84      0.76      0.80       315
      macro avg       0.83      0.76      0.78       315
   weighted avg       0.84      0.76      0.79       315

Precision Score: 0.8409893992932862
Recall Score: 0.7555555555555555
F1 Score: 0.7959866220735786
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [8]:
# Sweep over every checkpoint name in `models`, calling ate_cat_model once per
# checkpoint with 8 epochs; rn1 and rn2 are both fixed to 42 (the run log
# reports them as the random seeds).
# NOTE(review): ate_cat_model presumably comes from the star import of
# functions.ate_model_train_OB -- confirm.
fit_options = {"rn1": 42, "rn2": 42, "epochs": 8}
for model in models:
    print(f'training and results for {model}:')
    ate_cat_model(data, model, **fit_options)
    print()  # blank line separating consecutive runs in the cell output

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7548.95 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6472.89 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.224393,0.753555,0.609195,0.673729
2,0.329700,0.214753,0.73913,0.651341,0.692464
3,0.131100,0.241609,0.721374,0.724138,0.722753
4,0.049100,0.246813,0.757937,0.731801,0.744639
5,0.025200,0.311609,0.776423,0.731801,0.753452
6,0.025200,0.337981,0.777778,0.724138,0.75
7,0.004800,0.360644,0.753036,0.712644,0.732283
8,0.003100,0.366825,0.760331,0.704981,0.73161


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5099.64 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.89      0.98      0.94        52
    Krankenhaus       0.90      0.73      0.81       119
       Personal       0.73      0.57      0.64        14
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.79      0.45      0.58        33
 mediz. Service       0.79      0.75      0.77        87

      micro avg       0.85      0.75      0.80       323
      macro avg       0.83      0.74      0.77       323
   weighted avg       0.85      0.75      0.79       323

Precision Score: 0.8526315789473684
Recall Score: 0.7523219814241486
F1 Score: 0.7993421052631579
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6905.54 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5715.78 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-cased for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.212496,0.662338,0.602362,0.630928
2,0.342500,0.190888,0.784141,0.700787,0.740125
3,0.149700,0.206817,0.749049,0.775591,0.762089
4,0.068900,0.233784,0.784387,0.830709,0.806883
5,0.034500,0.293668,0.756,0.744094,0.75
6,0.034500,0.290505,0.794677,0.822835,0.808511
7,0.010900,0.326328,0.785425,0.76378,0.774451
8,0.006300,0.328294,0.776923,0.795276,0.785992


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5903.56 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      0.95      0.92        55
    Krankenhaus       0.88      0.60      0.71       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.77      0.49      0.60        35
 mediz. Service       0.71      0.84      0.77        77

      micro avg       0.82      0.73      0.77       315
      macro avg       0.83      0.75      0.78       315
   weighted avg       0.83      0.73      0.77       315

Precision Score: 0.8214285714285714
Recall Score: 0.7301587301587301
F1 Score: 0.773109243697479
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6769.18 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5990.25 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-uncased for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.218303,0.753769,0.595238,0.665188
2,0.326400,0.229591,0.767857,0.68254,0.722689
3,0.150500,0.242295,0.808889,0.722222,0.763103
4,0.065600,0.253982,0.759036,0.75,0.754491
5,0.037900,0.288597,0.752988,0.75,0.751491
6,0.037900,0.352901,0.806452,0.694444,0.746269
7,0.012900,0.382492,0.779736,0.702381,0.73904
8,0.007000,0.349885,0.757937,0.757937,0.757937


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5387.95 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.83      0.94      0.88        63
    Krankenhaus       0.98      0.49      0.65       112
       Personal       0.70      0.50      0.58        14
 Pflegepersonal       0.94      0.84      0.89        19
anderer Service       0.65      0.32      0.43        34
 mediz. Service       0.95      0.70      0.81        74

      micro avg       0.88      0.63      0.74       316
      macro avg       0.84      0.63      0.71       316
   weighted avg       0.89      0.63      0.72       316

Precision Score: 0.8849557522123894
Recall Score: 0.6329113924050633
F1 Score: 0.7380073800738007
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6608.24 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5449.39 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.249704,0.664,0.574394,0.615955
2,0.402600,0.289264,0.704036,0.543253,0.613281
3,0.240700,0.215119,0.72,0.747405,0.733447
4,0.140800,0.214492,0.751701,0.764706,0.758148
5,0.098600,0.289303,0.70347,0.771626,0.735974
6,0.098600,0.292044,0.756184,0.740484,0.748252
7,0.058000,0.308531,0.716981,0.788927,0.751236
8,0.030000,0.316598,0.72549,0.768166,0.746218


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5177.90 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.92      0.89        59
    Krankenhaus       0.86      0.74      0.79       120
       Personal       0.67      0.67      0.67        15
 Pflegepersonal       0.85      0.96      0.90        24
anderer Service       0.52      0.33      0.41        45
 mediz. Service       0.83      0.72      0.77        83

      micro avg       0.81      0.73      0.77       346
      macro avg       0.77      0.72      0.74       346
   weighted avg       0.80      0.73      0.76       346

Precision Score: 0.8122977346278317
Recall Score: 0.7254335260115607
F1 Score: 0.766412213740458
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6935.61 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5744.22 examples/s]


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}


  trainer = Trainer(


Training TUM/GottBERT_base_best for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.254277,0.746753,0.542453,0.628415
2,0.320800,0.217545,0.739796,0.683962,0.710784
3,0.166200,0.206095,0.748837,0.759434,0.754098
4,0.084400,0.207507,0.748858,0.773585,0.761021
5,0.056900,0.274669,0.721973,0.759434,0.74023
6,0.056900,0.306921,0.735294,0.707547,0.721154
7,0.022600,0.322672,0.7343,0.716981,0.725537
8,0.014200,0.325861,0.732719,0.75,0.741259


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5648.45 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.96      0.95        52
    Krankenhaus       0.86      0.79      0.82       104
       Personal       0.73      0.67      0.70        12
 Pflegepersonal       0.88      1.00      0.93        14
anderer Service       0.79      0.37      0.50        30
 mediz. Service       0.70      0.87      0.77        67

      micro avg       0.82      0.80      0.81       279
      macro avg       0.82      0.77      0.78       279
   weighted avg       0.83      0.80      0.80       279

Precision Score: 0.8198529411764706
Recall Score: 0.7992831541218638
F1 Score: 0.8094373865698731
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7159.09 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6055.41 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.21218,0.789157,0.617925,0.693122
2,0.311100,0.166924,0.748815,0.745283,0.747045
3,0.155700,0.179459,0.739496,0.830189,0.782222
4,0.079300,0.210767,0.736842,0.792453,0.763636
5,0.053500,0.235268,0.724891,0.783019,0.752834
6,0.053500,0.237617,0.8125,0.79717,0.804762
7,0.021600,0.244271,0.761062,0.811321,0.785388
8,0.012600,0.258484,0.781395,0.792453,0.786885


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5500.73 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.96      0.98      0.97        52
    Krankenhaus       0.95      0.69      0.80       104
       Personal       0.75      0.75      0.75        12
 Pflegepersonal       1.00      0.93      0.96        14
anderer Service       0.55      0.37      0.44        30
 mediz. Service       0.76      0.78      0.77        67

      micro avg       0.86      0.75      0.80       279
      macro avg       0.83      0.75      0.78       279
   weighted avg       0.86      0.75      0.79       279

Precision Score: 0.859504132231405
Recall Score: 0.7455197132616488
F1 Score: 0.7984644913627639
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7057.41 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5844.23 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.237486,0.769231,0.566038,0.652174
2,0.315800,0.200506,0.728643,0.683962,0.705596
3,0.163900,0.229121,0.728507,0.759434,0.743649
4,0.087200,0.227746,0.796954,0.740566,0.767726
5,0.055500,0.294504,0.683128,0.783019,0.72967
6,0.055500,0.335479,0.787129,0.75,0.768116
7,0.021300,0.338404,0.710526,0.764151,0.736364
8,0.015400,0.372497,0.730769,0.716981,0.72381


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5172.02 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      0.98      0.99        52
    Krankenhaus       0.89      0.70      0.78       104
       Personal       0.65      0.92      0.76        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.67      0.40      0.50        30
 mediz. Service       0.81      0.76      0.78        67

      micro avg       0.87      0.76      0.81       279
      macro avg       0.84      0.79      0.80       279
   weighted avg       0.86      0.76      0.80       279

Precision Score: 0.8653061224489796
Recall Score: 0.7598566308243727
F1 Score: 0.8091603053435114
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7681.46 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6424.69 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.238099,0.64977,0.555118,0.598726
2,0.393000,0.221648,0.748837,0.633858,0.686567
3,0.168900,0.220918,0.783019,0.653543,0.712446
4,0.090800,0.250111,0.767544,0.688976,0.726141
5,0.059300,0.253673,0.757812,0.76378,0.760784
6,0.059300,0.260656,0.773109,0.724409,0.747967
7,0.031000,0.274106,0.756198,0.720472,0.737903
8,0.017900,0.277571,0.757202,0.724409,0.740443


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5974.12 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.95      0.59      0.73       117
       Personal       0.80      0.31      0.44        13
 Pflegepersonal       0.81      0.94      0.87        18
anderer Service       0.59      0.37      0.46        35
 mediz. Service       0.60      0.70      0.65        77

      micro avg       0.77      0.66      0.71       315
      macro avg       0.77      0.64      0.67       315
   weighted avg       0.79      0.66      0.70       315

Precision Score: 0.7712177121771218
Recall Score: 0.6634920634920635
F1 Score: 0.7133105802047781
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6547.47 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5239.18 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.191477,0.678049,0.601732,0.637615
2,0.326500,0.196002,0.725322,0.731602,0.728448
3,0.151600,0.259563,0.765625,0.636364,0.695035
4,0.067900,0.248632,0.746606,0.714286,0.730088
5,0.037200,0.302711,0.709163,0.770563,0.738589
6,0.037200,0.316546,0.757709,0.744589,0.751092
7,0.010300,0.331569,0.74569,0.748918,0.7473
8,0.005700,0.341815,0.746725,0.74026,0.743478


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3087.63 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      0.96      0.98        54
    Krankenhaus       0.85      0.79      0.82       105
       Personal       0.88      0.94      0.91        16
 Pflegepersonal       1.00      0.93      0.97        15
anderer Service       0.61      0.49      0.54        35
 mediz. Service       0.86      0.59      0.70        63

      micro avg       0.87      0.76      0.81       288
      macro avg       0.87      0.78      0.82       288
   weighted avg       0.86      0.76      0.80       288

Precision Score: 0.8650793650793651
Recall Score: 0.7569444444444444
F1 Score: 0.8074074074074075
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7524.79 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6362.93 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.227271,0.712329,0.614173,0.659619
2,0.316000,0.182275,0.748936,0.692913,0.719836
3,0.145200,0.207398,0.771318,0.783465,0.777344
4,0.057100,0.246431,0.743295,0.76378,0.753398
5,0.034700,0.263632,0.746377,0.811024,0.777358
6,0.034700,0.258587,0.768939,0.799213,0.783784
7,0.012600,0.275203,0.752768,0.80315,0.777143
8,0.007500,0.273943,0.750929,0.795276,0.772467


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5798.97 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.91      0.95      0.93        55
    Krankenhaus       0.90      0.78      0.83       117
       Personal       0.70      0.54      0.61        13
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.80      0.57      0.67        35
 mediz. Service       0.88      0.74      0.80        77

      micro avg       0.88      0.77      0.82       315
      macro avg       0.85      0.75      0.79       315
   weighted avg       0.88      0.77      0.82       315

Precision Score: 0.8808664259927798
Recall Score: 0.7746031746031746
F1 Score: 0.8243243243243243
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [9]:
# Run the category-aware ATE pipeline once per checkpoint in `models`,
# with both seeds fixed at 42 and a 10-epoch budget.
# NOTE(review): `ate_cat_model` presumably comes from the star import of
# functions.ate_model_train_OB in the first cell — confirm.
for model in models:
    # Header line marking where this checkpoint's log section starts.
    print('training and results for {}:'.format(model))
    ate_cat_model(data, model, epochs=10, rn1=42, rn2=42)
    # Blank separator line between consecutive runs.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7329.19 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6346.06 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.228611,0.665179,0.570881,0.614433
2,0.326000,0.203977,0.740741,0.689655,0.714286
3,0.137800,0.225556,0.736,0.704981,0.720157
4,0.052700,0.268761,0.714286,0.689655,0.701754
5,0.025600,0.341682,0.723577,0.681992,0.70217
6,0.025600,0.351785,0.751004,0.716475,0.733333
7,0.005300,0.387418,0.738095,0.712644,0.725146
8,0.001800,0.412987,0.732283,0.712644,0.72233
9,0.000900,0.413352,0.733068,0.704981,0.71875
10,0.000600,0.422537,0.737903,0.701149,0.719057


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5746.93 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.84      0.94      0.89        52
    Krankenhaus       0.92      0.55      0.69       119
       Personal       0.75      0.64      0.69        14
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.73      0.48      0.58        33
 mediz. Service       0.78      0.80      0.79        87

      micro avg       0.84      0.70      0.76       323
      macro avg       0.84      0.73      0.77       323
   weighted avg       0.85      0.70      0.75       323

Precision Score: 0.8376383763837638
Recall Score: 0.7027863777089783
F1 Score: 0.7643097643097643
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7266.70 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5447.36 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-cased for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.232776,0.672489,0.606299,0.637681
2,0.350000,0.191989,0.77459,0.744094,0.759036
3,0.159300,0.208185,0.762452,0.783465,0.772816
4,0.067400,0.274858,0.713725,0.716535,0.715128
5,0.036200,0.363171,0.720165,0.688976,0.704225
6,0.036200,0.346212,0.727626,0.73622,0.731898
7,0.011000,0.411253,0.738589,0.700787,0.719192
8,0.006100,0.364038,0.729323,0.76378,0.746154
9,0.003100,0.387222,0.72963,0.775591,0.751908
10,0.002200,0.383074,0.747082,0.755906,0.751468


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5839.42 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      0.96      0.93        55
    Krankenhaus       0.86      0.54      0.66       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.76      0.37      0.50        35
 mediz. Service       0.63      0.81      0.71        77

      micro avg       0.78      0.69      0.73       315
      macro avg       0.79      0.72      0.74       315
   weighted avg       0.80      0.69      0.72       315

Precision Score: 0.7777777777777778
Recall Score: 0.6888888888888889
F1 Score: 0.7306397306397305
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6750.29 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5903.35 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-uncased for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226525,0.73399,0.59127,0.654945
2,0.326000,0.221462,0.766667,0.638889,0.69697
3,0.147300,0.225847,0.800847,0.75,0.77459
4,0.064600,0.26149,0.761905,0.761905,0.761905
5,0.038600,0.297467,0.765873,0.765873,0.765873
6,0.038600,0.317757,0.776423,0.757937,0.767068
7,0.012900,0.326984,0.767717,0.77381,0.770751
8,0.005800,0.342967,0.768,0.761905,0.76494
9,0.001600,0.348449,0.777778,0.805556,0.791423
10,0.001800,0.345536,0.77821,0.793651,0.785855


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5440.98 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.97      0.94      0.95        63
    Krankenhaus       0.90      0.74      0.81       112
       Personal       0.71      0.86      0.77        14
 Pflegepersonal       1.00      0.95      0.97        19
anderer Service       0.60      0.44      0.51        34
 mediz. Service       0.83      0.80      0.81        74

      micro avg       0.87      0.78      0.82       316
      macro avg       0.83      0.79      0.81       316
   weighted avg       0.86      0.78      0.82       316

Precision Score: 0.8661971830985915
Recall Score: 0.7784810126582279
F1 Score: 0.82
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', '

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6491.39 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5412.49 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.2779,0.737705,0.467128,0.572034
2,0.452500,0.229581,0.70412,0.650519,0.676259
3,0.231100,0.232594,0.727273,0.719723,0.723478
4,0.131400,0.238561,0.71987,0.764706,0.741611
5,0.100600,0.269071,0.725552,0.795848,0.759076
6,0.100600,0.290488,0.77193,0.761246,0.766551
7,0.055400,0.290133,0.730769,0.788927,0.758735
8,0.035500,0.296211,0.768707,0.782007,0.7753
9,0.024200,0.300635,0.73871,0.792388,0.764608
10,0.013100,0.309139,0.753247,0.802768,0.777219


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5225.78 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.95      0.90        59
    Krankenhaus       0.81      0.69      0.75       120
       Personal       0.83      0.67      0.74        15
 Pflegepersonal       1.00      1.00      1.00        24
anderer Service       0.47      0.31      0.37        45
 mediz. Service       0.63      0.70      0.66        83

      micro avg       0.75      0.71      0.73       346
      macro avg       0.77      0.72      0.74       346
   weighted avg       0.75      0.71      0.72       346

Precision Score: 0.7538461538461538
Recall Score: 0.708092485549133
F1 Score: 0.7302533532041728
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6946.03 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6083.07 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_best for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.253152,0.746988,0.584906,0.656085
2,0.328700,0.196819,0.742424,0.693396,0.717073
3,0.175600,0.224701,0.707547,0.707547,0.707547
4,0.087200,0.237126,0.707207,0.740566,0.723502
5,0.057500,0.319452,0.698347,0.79717,0.744493
6,0.057500,0.296376,0.759259,0.773585,0.766355
7,0.024400,0.345963,0.736364,0.764151,0.75
8,0.011300,0.365086,0.712446,0.783019,0.746067
9,0.009600,0.384838,0.723982,0.754717,0.73903
10,0.006700,0.383792,0.737089,0.740566,0.738824


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5684.77 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.98      0.98      0.98        52
    Krankenhaus       0.86      0.72      0.79       104
       Personal       0.85      0.92      0.88        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.65      0.57      0.61        30
 mediz. Service       0.67      0.70      0.69        67

      micro avg       0.82      0.77      0.79       279
      macro avg       0.84      0.81      0.82       279
   weighted avg       0.82      0.77      0.79       279

Precision Score: 0.8206106870229007
Recall Score: 0.7706093189964157
F1 Score: 0.7948243992606283
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6386.50 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4927.13 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.209153,0.737143,0.608491,0.666667
2,0.332600,0.164789,0.801105,0.683962,0.737913
3,0.166700,0.153189,0.768559,0.830189,0.798186
4,0.078400,0.16068,0.799043,0.787736,0.793349
5,0.055000,0.2809,0.791444,0.698113,0.741855
6,0.055000,0.255733,0.765766,0.801887,0.78341
7,0.019300,0.2311,0.790698,0.801887,0.796253
8,0.011000,0.288328,0.792746,0.721698,0.755556
9,0.006100,0.294164,0.800995,0.759434,0.779661
10,0.003800,0.297626,0.789474,0.778302,0.783848


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5673.69 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.98      0.96        52
    Krankenhaus       0.89      0.64      0.75       104
       Personal       0.64      0.75      0.69        12
 Pflegepersonal       0.93      0.93      0.93        14
anderer Service       0.71      0.33      0.45        30
 mediz. Service       0.73      0.82      0.77        67

      micro avg       0.83      0.73      0.78       279
      macro avg       0.81      0.74      0.76       279
   weighted avg       0.84      0.73      0.77       279

Precision Score: 0.8333333333333334
Recall Score: 0.7347670250896058
F1 Score: 0.7809523809523811
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7271.11 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6038.41 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.260249,0.727848,0.542453,0.621622
2,0.326300,0.183578,0.73545,0.65566,0.693267
3,0.174200,0.237138,0.731707,0.707547,0.719424
4,0.086500,0.219128,0.763285,0.745283,0.754177
5,0.058200,0.303666,0.728507,0.759434,0.743649
6,0.058200,0.321826,0.760766,0.75,0.755344
7,0.022600,0.358756,0.709957,0.773585,0.740406
8,0.011800,0.397418,0.740385,0.726415,0.733333
9,0.006700,0.382272,0.739336,0.735849,0.737589
10,0.004400,0.400775,0.754808,0.740566,0.747619


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5651.36 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.98      0.96        52
    Krankenhaus       0.85      0.65      0.74       104
       Personal       0.69      0.75      0.72        12
 Pflegepersonal       1.00      1.00      1.00        14
anderer Service       0.54      0.43      0.48        30
 mediz. Service       0.66      0.73      0.70        67

      micro avg       0.79      0.73      0.76       279
      macro avg       0.78      0.76      0.77       279
   weighted avg       0.79      0.73      0.75       279

Precision Score: 0.7876447876447876
Recall Score: 0.7311827956989247
F1 Score: 0.758364312267658


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-mediz. Service', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-Krankenhaus', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'B-Krankenhaus', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Results saved to testresult/BO/ate_cat/10_epochs/TUM_GottBERT_base_last_ate_cat_test_results.txt
Confusion matrix saved to testresult/ate_cat/10_epochs/

=== Performance Metrics ===
GPU: NVIDIA A30
Average epoch time: 15.96s
Total training time: 159.59s
Peak GPU memory: 3443.0MB
Average batch time: 0.0386s
Training complete. Model directory deleted 

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7327.25 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6079.57 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.244423,0.65625,0.57874,0.615063
2,0.392400,0.225741,0.721739,0.653543,0.68595
3,0.172200,0.256577,0.776744,0.65748,0.712154
4,0.087700,0.265688,0.735294,0.688976,0.711382
5,0.055900,0.295494,0.734127,0.728346,0.731225
6,0.055900,0.312018,0.77686,0.740157,0.758065
7,0.024400,0.318799,0.750973,0.759843,0.755382
8,0.014700,0.341944,0.759657,0.69685,0.726899
9,0.011500,0.340218,0.761317,0.728346,0.744467
10,0.005500,0.345061,0.765432,0.732283,0.748491


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6115.89 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      0.95      0.92        55
    Krankenhaus       0.93      0.59      0.72       117
       Personal       0.67      0.46      0.55        13
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.68      0.37      0.48        35
 mediz. Service       0.60      0.69      0.64        77

      micro avg       0.79      0.67      0.72       315
      macro avg       0.78      0.67      0.71       315
   weighted avg       0.80      0.67      0.71       315

Precision Score: 0.7865168539325843
Recall Score: 0.6666666666666666
F1 Score: 0.7216494845360824
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7342.26 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5428.16 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.195355,0.694581,0.61039,0.64977
2,0.318600,0.175375,0.733624,0.727273,0.730435
3,0.150200,0.266156,0.787234,0.640693,0.706444
4,0.072100,0.254554,0.734513,0.718615,0.726477
5,0.039200,0.281222,0.699588,0.735931,0.7173
6,0.039200,0.300259,0.77512,0.701299,0.736364
7,0.010200,0.333692,0.719298,0.709957,0.714597
8,0.007000,0.349973,0.708861,0.727273,0.717949
9,0.001900,0.359183,0.742991,0.688312,0.714607
10,0.001000,0.35731,0.734234,0.705628,0.719647


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5065.04 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.98      0.96        54
    Krankenhaus       0.96      0.70      0.81       105
       Personal       0.79      0.69      0.73        16
 Pflegepersonal       0.93      0.87      0.90        15
anderer Service       0.62      0.43      0.51        35
 mediz. Service       0.86      0.60      0.71        63

      micro avg       0.89      0.70      0.79       288
      macro avg       0.85      0.71      0.77       288
   weighted avg       0.88      0.70      0.78       288

Precision Score: 0.8903508771929824
Recall Score: 0.7048611111111112
F1 Score: 0.7868217054263567
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6773.19 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5391.89 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.212765,0.789216,0.633858,0.703057
2,0.324100,0.201206,0.802817,0.673228,0.732334
3,0.147400,0.207265,0.7875,0.744094,0.765182
4,0.062800,0.262759,0.814815,0.692913,0.748936
5,0.038500,0.275988,0.7393,0.748031,0.74364
6,0.038500,0.273156,0.795745,0.73622,0.764826
7,0.015600,0.308652,0.750958,0.771654,0.761165
8,0.007800,0.317833,0.772549,0.775591,0.774067
9,0.004800,0.330199,0.765625,0.771654,0.768627
10,0.001700,0.328842,0.767442,0.779528,0.773438


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5867.04 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.93      0.89        55
    Krankenhaus       0.92      0.72      0.81       117
       Personal       0.64      0.69      0.67        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.78      0.51      0.62        35
 mediz. Service       0.76      0.74      0.75        77

      micro avg       0.84      0.75      0.79       315
      macro avg       0.80      0.76      0.77       315
   weighted avg       0.84      0.75      0.79       315

Precision Score: 0.8368794326241135
Recall Score: 0.7492063492063492
F1 Score: 0.7906197654941373
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [10]:
# Run the category-aware ATE pipeline once per checkpoint in `models`,
# with both seeds fixed at 42 and a 12-epoch budget.
# NOTE(review): `ate_cat_model` presumably comes from the star import of
# functions.ate_model_train_OB in the first cell — confirm.
for model in models:
    # Header line marking where this checkpoint's log section starts.
    print('training and results for {}:'.format(model))
    ate_cat_model(data, model, epochs=12, rn1=42, rn2=42)
    # Blank separator line between consecutive runs.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7348.36 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6294.67 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training google-bert/bert-base-german-cased for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.21118,0.723502,0.601533,0.656904
2,0.332900,0.194936,0.74359,0.666667,0.70303
3,0.132800,0.260421,0.737903,0.701149,0.719057
4,0.052000,0.264661,0.763158,0.666667,0.711656
5,0.027700,0.351439,0.710744,0.659004,0.683897
6,0.027700,0.382957,0.726087,0.639847,0.680244
7,0.006600,0.406302,0.718876,0.685824,0.701961
8,0.002500,0.411426,0.695817,0.701149,0.698473
9,0.000600,0.409907,0.724806,0.716475,0.720617
10,0.000900,0.437056,0.701613,0.666667,0.683694


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5745.39 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.96      0.91        52
    Krankenhaus       0.91      0.67      0.77       119
       Personal       0.67      0.57      0.62        14
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.72      0.39      0.51        33
 mediz. Service       0.77      0.83      0.80        87

      micro avg       0.84      0.74      0.79       323
      macro avg       0.81      0.73      0.76       323
   weighted avg       0.84      0.74      0.78       323

Precision Score: 0.8362369337979094
Recall Score: 0.7430340557275542
F1 Score: 0.7868852459016394
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7292.18 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5278.55 examples/s]


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}


  trainer = Trainer(


Training dbmdz/bert-base-german-cased for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.252347,0.752381,0.622047,0.681034
2,0.339800,0.20858,0.765217,0.692913,0.727273
3,0.159100,0.195095,0.809717,0.787402,0.798403
4,0.067400,0.239675,0.814516,0.795276,0.804781
5,0.037200,0.233406,0.788235,0.791339,0.789784
6,0.037200,0.291862,0.820408,0.791339,0.805611
7,0.013700,0.335676,0.812766,0.751969,0.781186
8,0.004200,0.308814,0.785156,0.791339,0.788235
9,0.003300,0.333823,0.798419,0.795276,0.796844
10,0.001400,0.344803,0.8,0.787402,0.793651


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5715.30 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      0.93      0.96        55
    Krankenhaus       0.89      0.55      0.68       117
       Personal       0.71      0.92      0.80        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.67      0.40      0.50        35
 mediz. Service       0.73      0.79      0.76        77

      micro avg       0.84      0.70      0.76       315
      macro avg       0.83      0.76      0.78       315
   weighted avg       0.84      0.70      0.75       315

Precision Score: 0.8390804597701149
Recall Score: 0.6952380952380952
F1 Score: 0.7604166666666666
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 5899.44 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5905.25 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training dbmdz/bert-base-german-uncased for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.234657,0.72381,0.603175,0.658009
2,0.320600,0.25597,0.766667,0.638889,0.69697
3,0.146100,0.26753,0.805556,0.690476,0.74359
4,0.063100,0.295659,0.782222,0.698413,0.737945
5,0.037800,0.287016,0.752768,0.809524,0.780115
6,0.037800,0.326704,0.786008,0.757937,0.771717
7,0.012900,0.369921,0.784232,0.75,0.766734
8,0.005500,0.404254,0.795745,0.742063,0.767967
9,0.002300,0.381755,0.792,0.785714,0.788845
10,0.002100,0.376032,0.792,0.785714,0.788845


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5501.93 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.95      0.92        63
    Krankenhaus       0.89      0.62      0.73       112
       Personal       0.75      0.64      0.69        14
 Pflegepersonal       0.90      0.95      0.92        19
anderer Service       0.68      0.38      0.49        34
 mediz. Service       0.82      0.72      0.76        74

      micro avg       0.85      0.71      0.77       316
      macro avg       0.82      0.71      0.75       316
   weighted avg       0.84      0.71      0.76       316

Precision Score: 0.8479087452471483
Recall Score: 0.7056962025316456
F1 Score: 0.770293609671848
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6677.93 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4989.81 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training FacebookAI/xlm-roberta-base for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.290972,0.601695,0.491349,0.540952
2,0.443000,0.210959,0.724806,0.647059,0.683729
3,0.238500,0.22031,0.7,0.750865,0.724541
4,0.144800,0.195964,0.744186,0.775087,0.759322
5,0.104200,0.270166,0.698718,0.754325,0.725458
6,0.104200,0.277574,0.771127,0.757785,0.764398
7,0.064700,0.291566,0.711974,0.761246,0.735786
8,0.045800,0.322243,0.711538,0.768166,0.738769
9,0.024400,0.330971,0.71519,0.782007,0.747107
10,0.013900,0.352433,0.708333,0.764706,0.735441


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5138.16 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.85      0.93      0.89        59
    Krankenhaus       0.83      0.64      0.72       120
       Personal       0.82      0.60      0.69        15
 Pflegepersonal       0.96      1.00      0.98        24
anderer Service       0.46      0.29      0.36        45
 mediz. Service       0.70      0.65      0.67        83

      micro avg       0.78      0.67      0.72       346
      macro avg       0.77      0.69      0.72       346
   weighted avg       0.76      0.67      0.71       346

Precision Score: 0.7759197324414716
Recall Score: 0.6705202312138728
F1 Score: 0.7193798449612403
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7356.06 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5649.91 examples/s]


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}


  trainer = Trainer(


Training TUM/GottBERT_base_best for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.229253,0.702381,0.556604,0.621053
2,0.318700,0.182888,0.768844,0.721698,0.744526
3,0.170200,0.232146,0.731707,0.707547,0.719424
4,0.087200,0.216653,0.734513,0.783019,0.757991
5,0.059000,0.298489,0.688797,0.783019,0.732892
6,0.059000,0.359363,0.773684,0.693396,0.731343
7,0.025400,0.38569,0.730392,0.70283,0.716346
8,0.013900,0.418151,0.71831,0.721698,0.72
9,0.009600,0.397341,0.722222,0.735849,0.728972
10,0.006500,0.443843,0.725118,0.721698,0.723404


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5363.16 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.98      0.96        52
    Krankenhaus       0.94      0.60      0.73       104
       Personal       0.67      0.83      0.74        12
 Pflegepersonal       0.93      1.00      0.97        14
anderer Service       0.55      0.40      0.46        30
 mediz. Service       0.70      0.78      0.74        67

      micro avg       0.82      0.72      0.77       279
      macro avg       0.79      0.76      0.77       279
   weighted avg       0.83      0.72      0.76       279

Precision Score: 0.8170731707317073
Recall Score: 0.7204301075268817
F1 Score: 0.7657142857142857
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6962.59 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5892.68 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_filtered_base_best for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.202441,0.738889,0.627358,0.678571
2,0.331200,0.165922,0.775,0.731132,0.752427
3,0.164900,0.172678,0.757709,0.811321,0.783599
4,0.086600,0.194903,0.760181,0.792453,0.775982
5,0.050900,0.283478,0.743961,0.726415,0.735084
6,0.050900,0.252856,0.824121,0.773585,0.798054
7,0.021400,0.309155,0.789474,0.707547,0.746269
8,0.011900,0.333994,0.77619,0.768868,0.772512
9,0.005600,0.3448,0.792079,0.754717,0.772947
10,0.007300,0.36931,0.779412,0.75,0.764423


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5600.24 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.96      0.92      0.94        52
    Krankenhaus       0.90      0.58      0.70       104
       Personal       0.60      0.75      0.67        12
 Pflegepersonal       1.00      0.93      0.96        14
anderer Service       0.61      0.37      0.46        30
 mediz. Service       0.64      0.67      0.66        67

      micro avg       0.80      0.67      0.73       279
      macro avg       0.78      0.70      0.73       279
   weighted avg       0.81      0.67      0.72       279

Precision Score: 0.7982832618025751
Recall Score: 0.6666666666666666
F1 Score: 0.7265625
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', '

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7250.15 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6046.08 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training TUM/GottBERT_base_last for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.260167,0.756579,0.542453,0.631868
2,0.327500,0.197668,0.742857,0.735849,0.739336
3,0.171900,0.276609,0.744898,0.688679,0.715686
4,0.082700,0.257806,0.727679,0.768868,0.747706
5,0.056900,0.334588,0.703704,0.806604,0.751648
6,0.056900,0.354588,0.755,0.712264,0.73301
7,0.022200,0.359107,0.726027,0.75,0.737819
8,0.011300,0.392136,0.722467,0.773585,0.747153
9,0.009600,0.390864,0.724299,0.731132,0.7277
10,0.006400,0.38669,0.743119,0.764151,0.753488


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5628.75 examples/s]


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.98      0.96        52
    Krankenhaus       0.90      0.68      0.78       104
       Personal       0.64      0.75      0.69        12
 Pflegepersonal       0.93      0.93      0.93        14
anderer Service       0.52      0.37      0.43        30
 mediz. Service       0.70      0.76      0.73        67

      micro avg       0.81      0.74      0.77       279
      macro avg       0.77      0.74      0.75       279
   weighted avg       0.81      0.74      0.77       279

Precision Score: 0.807843137254902
Recall Score: 0.7383512544802867
F1 Score: 0.7715355805243447
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 8152.35 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6793.87 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training distilbert/distilbert-base-german-cased for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.225392,0.696682,0.57874,0.632258
2,0.394000,0.222005,0.707424,0.637795,0.670807
3,0.171900,0.22577,0.795349,0.673228,0.729211
4,0.084700,0.254331,0.765217,0.692913,0.727273
5,0.049600,0.262513,0.769231,0.748031,0.758483
6,0.049600,0.293081,0.77381,0.767717,0.770751
7,0.023600,0.308278,0.751938,0.76378,0.757812
8,0.010700,0.326156,0.776,0.76378,0.769841
9,0.005500,0.342215,0.787234,0.728346,0.756646
10,0.004600,0.347001,0.784553,0.759843,0.772


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5475.52 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.89      0.88        55
    Krankenhaus       0.95      0.60      0.73       117
       Personal       0.67      0.31      0.42        13
 Pflegepersonal       0.78      1.00      0.88        18
anderer Service       0.70      0.40      0.51        35
 mediz. Service       0.68      0.71      0.70        77

      micro avg       0.80      0.67      0.73       315
      macro avg       0.77      0.65      0.69       315
   weighted avg       0.82      0.67      0.72       315

Precision Score: 0.8045977011494253
Recall Score: 0.6666666666666666
F1 Score: 0.7291666666666666
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 6743.71 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5419.55 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training GerMedBERT/medbert-512 for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.199581,0.695876,0.584416,0.635294
2,0.322200,0.16907,0.721739,0.718615,0.720174
3,0.151800,0.272486,0.778409,0.593074,0.673219
4,0.066500,0.244132,0.724444,0.705628,0.714912
5,0.037300,0.280367,0.719665,0.744589,0.731915
6,0.037300,0.314205,0.744292,0.705628,0.724444
7,0.014400,0.381934,0.720183,0.679654,0.699332
8,0.004600,0.344402,0.71308,0.731602,0.722222
9,0.001100,0.39947,0.742991,0.688312,0.714607
10,0.002700,0.396454,0.741784,0.683983,0.711712


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5598.55 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.98      0.96      0.97        54
    Krankenhaus       0.79      0.77      0.78       105
       Personal       0.57      0.50      0.53        16
 Pflegepersonal       0.71      1.00      0.83        15
anderer Service       0.60      0.51      0.55        35
 mediz. Service       0.77      0.65      0.71        63

      micro avg       0.79      0.75      0.77       288
      macro avg       0.74      0.73      0.73       288
   weighted avg       0.78      0.75      0.76       288

Precision Score: 0.7875457875457875
Recall Score: 0.7465277777777778
F1 Score: 0.766488413547237
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7545.24 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6561.72 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-mediz. Service' 'B-Personal' 'B-anderer Service'
 'B-Arzt' 'B-Pflegepersonal' 'O']
{0: 5.983217355710193, 1: 4.959959280624364, 2: 18.317042606516292, 3: 10.546176046176047, 4: 6.8239962651727355, 5: 14.014381591562799, 6: 0.1596439493228484}
Training deepset/gbert-base for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.227517,0.702326,0.594488,0.643923
2,0.329600,0.194439,0.745763,0.692913,0.718367
3,0.150800,0.210067,0.790514,0.787402,0.788955
4,0.056400,0.248432,0.787402,0.787402,0.787402
5,0.032400,0.308927,0.725979,0.80315,0.762617
6,0.032400,0.27139,0.78022,0.838583,0.808349
7,0.013700,0.273768,0.785992,0.795276,0.790607
8,0.006900,0.317414,0.759542,0.783465,0.771318
9,0.003500,0.322511,0.747331,0.826772,0.785047
10,0.001700,0.307005,0.771218,0.822835,0.79619


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5903.72 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.89      0.77      0.83       117
       Personal       0.60      0.69      0.64        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.64      0.40      0.49        35
 mediz. Service       0.83      0.78      0.81        77

      micro avg       0.84      0.77      0.80       315
      macro avg       0.80      0.76      0.77       315
   weighted avg       0.84      0.77      0.80       315

Precision Score: 0.8432055749128919
Recall Score: 0.7682539682539683
F1 Score: 0.8039867109634551
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

### k-fold cross-validation

In [7]:
# Cross-validate every checkpoint listed in `models` with the same settings:
# k=3 folds, 5 epochs per fold, rn1/rn2 (presumably the random seeds) set to 42.
# NOTE(review): in the recorded output the "newly initialized" classifier warning
# appears once per checkpoint, before fold 1 only, and the per-fold F1 rises from
# ~0.82 (fold 1) to ~0.98 and ~1.00 (folds 2-3). That pattern suggests the same
# model instance keeps training across folds, so later folds would be evaluated
# on data already seen -- TODO confirm that `ate_model_kfold` re-creates the
# model for each fold before trusting the averaged scores.
cv_settings = dict(rn1=42, rn2=42, k=3, epochs=5)
for model in models:
    print(f'training and results for {model}:')
    ate_model_kfold(data, model, **cv_settings)
    print()  # blank line separates the logs of consecutive checkpoints

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7441.49 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6574.33 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7629.62 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.164414,0.798578,0.742291,0.769406
2,0.170800,0.229732,0.770419,0.768722,0.76957
3,0.059700,0.300959,0.74498,0.817181,0.779412
4,0.059700,0.369277,0.780761,0.768722,0.774695
5,0.013800,0.405222,0.767442,0.799559,0.783172


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8483412322274881, Recall: 0.7902869757174393, F1: 0.8182857142857143
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7551.65 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7061.12 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6594.88 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.018858,0.99022,0.975904,0.98301
2,0.102300,0.013051,0.958333,0.99759,0.977568
3,0.034100,0.015013,0.973934,0.990361,0.982079
4,0.034100,0.020067,0.973872,0.987952,0.980861
5,0.007100,0.019085,0.976247,0.990361,0.983254


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9670781893004116, Recall: 0.9853249475890985, F1: 0.976116303219107
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7511.98 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6486.16 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6395.31 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.007094,1.0,0.995465,0.997727
2,0.030300,0.004172,1.0,0.997732,0.998865
3,0.012600,0.004661,1.0,0.997732,0.998865
4,0.012600,0.004804,1.0,0.997732,0.998865
5,0.001600,0.004909,1.0,0.997732,0.998865


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9980276134122288, Recall: 1.0, F1: 0.9990128331688055

=== Final Cross-Validation Results ===
Average Precision: 0.9378156783133761
Average Recall: 0.9252039744355126
Average F1 Score: 0.9311382835578756
Average epoch time: 13.23s ± 0.09s
Total training time: 3.3 minutes
Peak memory usage: 2605.0MB
Average batch time: 0.0374s ± 0.0003s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for dbmdz/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7683.44 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7050.66 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7834.28 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.179078,0.837398,0.703872,0.764851
2,0.185600,0.213568,0.768349,0.763098,0.765714
3,0.078900,0.293205,0.749503,0.85877,0.800425
4,0.078900,0.330617,0.771889,0.763098,0.767468
5,0.024600,0.35978,0.75378,0.794989,0.773836


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8299319727891157, Recall: 0.8375286041189931, F1: 0.8337129840546698
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7091.78 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7367.12 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6287.42 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.034674,0.949153,0.982456,0.965517
2,0.108000,0.047103,0.971503,0.93985,0.955414
3,0.048100,0.046873,0.969231,0.947368,0.958175
4,0.048100,0.045085,0.964377,0.949875,0.957071
5,0.015600,0.054593,0.945813,0.962406,0.954037


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9111111111111111, Recall: 0.9783080260303688, F1: 0.9435146443514644
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7630.27 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7068.77 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6559.30 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.025096,0.959551,0.993023,0.976
2,0.065200,0.025176,0.979118,0.981395,0.980256
3,0.030000,0.03017,0.963719,0.988372,0.97589
4,0.030000,0.036784,0.954955,0.986047,0.970252
5,0.007000,0.035975,0.965831,0.986047,0.975834


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9711934156378601, Recall: 0.9752066115702479, F1: 0.9731958762886598

=== Final Cross-Validation Results ===
Average Precision: 0.9040788331793622
Average Recall: 0.9303477472398699
Average F1 Score: 0.9168078348982647
Average epoch time: 13.09s ± 0.12s
Total training time: 3.3 minutes
Peak memory usage: 2625.0MB
Average batch time: 0.0371s ± 0.0004s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for dbmdz/bert-base-german-uncased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 5191.09 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6081.53 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6569.51 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.15629,0.791569,0.787879,0.78972
2,0.175700,0.184327,0.78458,0.806527,0.795402
3,0.075300,0.241417,0.74477,0.829837,0.785006
4,0.075300,0.291725,0.775463,0.780886,0.778165
5,0.025800,0.329046,0.7593,0.808858,0.783296


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8697916666666666, Recall: 0.759090909090909, F1: 0.8106796116504853
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6703.01 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6126.11 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5775.11 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.058762,0.898585,0.936118,0.916968
2,0.109600,0.093926,0.89486,0.941032,0.917365
3,0.041300,0.096675,0.919315,0.923833,0.921569
4,0.041300,0.104787,0.910843,0.928747,0.919708
5,0.015600,0.113971,0.911271,0.933661,0.92233


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9230769230769231, Recall: 0.9190371991247265, F1: 0.9210526315789473
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6158.18 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5990.69 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5666.62 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.029168,0.997619,0.972158,0.984724
2,0.056500,0.005106,0.993072,0.99768,0.99537
3,0.022000,0.012391,0.988426,0.990719,0.989571
4,0.022000,0.016743,0.986111,0.988399,0.987254
5,0.006100,0.011773,0.990719,0.990719,0.990719


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9880715705765407, Recall: 0.9979919678714859, F1: 0.9930069930069929

=== Final Cross-Validation Results ===
Average Precision: 0.9269800534400435
Average Recall: 0.8920400253623738
Average F1 Score: 0.9082464120788085
Average epoch time: 13.18s ± 0.03s
Total training time: 3.3 minutes
Peak memory usage: 2625.0MB
Average batch time: 0.0373s ± 0.0001s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for FacebookAI/xlm-roberta-base:


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 5799.88 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 3495.20 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6177.07 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.164127,0.797468,0.751491,0.773797
2,0.214000,0.169465,0.755682,0.793241,0.774006
3,0.137600,0.277082,0.687403,0.878728,0.771379
4,0.137600,0.246166,0.768224,0.817097,0.791908
5,0.072100,0.307405,0.724662,0.852883,0.783562


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8631346578366446, Recall: 0.7995910020449898, F1: 0.8301486199575372
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7878.19 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7190.38 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6276.34 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.076301,0.921325,0.911885,0.916581
2,0.145800,0.098059,0.840426,0.971311,0.901141
3,0.093200,0.094476,0.904031,0.965164,0.933598
4,0.093200,0.112473,0.892045,0.965164,0.927165
5,0.043300,0.097818,0.925049,0.961066,0.942714


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9276437847866419, Recall: 0.931098696461825, F1: 0.929368029739777
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7795.01 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7505.54 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5784.02 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.01972,0.991886,0.955078,0.973134
2,0.099100,0.018954,0.969112,0.980469,0.974757
3,0.050000,0.019619,0.970874,0.976562,0.97371
4,0.050000,0.031189,0.956522,0.988281,0.972142
5,0.017100,0.025642,0.969231,0.984375,0.976744


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9928057553956835, Recall: 0.9928057553956835, F1: 0.9928057553956835

=== Final Cross-Validation Results ===
Average Precision: 0.9278613993396566
Average Recall: 0.9078318179674995
Average F1 Score: 0.917440801697666
Average epoch time: 21.54s ± 0.41s
Total training time: 5.4 minutes
Peak memory usage: 5871.0MB
Average batch time: 0.0590s ± 0.0012s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for TUM/GottBERT_base_best:


Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 2529.22 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5918.88 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6671.47 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.149155,0.807808,0.696891,0.748261
2,0.172600,0.141662,0.8125,0.774611,0.793103
3,0.090800,0.229301,0.753333,0.878238,0.811005
4,0.090800,0.264821,0.774882,0.84715,0.809406
5,0.039400,0.300966,0.781022,0.831606,0.805521


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8036649214659686, Recall: 0.8342391304347826, F1: 0.8186666666666667
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8111.94 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7648.77 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6813.71 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.053581,0.925414,0.951705,0.938375
2,0.104400,0.075726,0.936963,0.928977,0.932953
3,0.050600,0.091022,0.912568,0.948864,0.930362
4,0.050600,0.089783,0.95614,0.928977,0.942363
5,0.017700,0.095252,0.937677,0.940341,0.939007


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9577114427860697, Recall: 0.941320293398533, F1: 0.9494451294697904
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7777.29 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7063.88 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6841.00 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.021032,0.960212,0.991781,0.975741
2,0.058300,0.016597,0.991643,0.975342,0.983425
3,0.031100,0.015137,0.991713,0.983562,0.98762
4,0.031100,0.020982,0.986339,0.989041,0.987688
5,0.004300,0.021922,0.986339,0.989041,0.987688


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9839080459770115, Recall: 0.9953488372093023, F1: 0.9895953757225434

=== Final Cross-Validation Results ===
Average Precision: 0.9150948034096832
Average Recall: 0.9236360870142059
Average F1 Score: 0.9192357239530001
Average epoch time: 13.71s ± 0.17s
Total training time: 3.4 minutes
Peak memory usage: 3409.0MB
Average batch time: 0.0386s ± 0.0005s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for TUM/GottBERT_filtered_base_best:


Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6982.67 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6148.93 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6995.68 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136131,0.827089,0.743523,0.783083
2,0.162900,0.161202,0.814016,0.782383,0.797886
3,0.082900,0.24675,0.740175,0.878238,0.803318
4,0.082900,0.275604,0.810585,0.753886,0.781208
5,0.031900,0.295568,0.765957,0.839378,0.800989


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.806615776081425, Recall: 0.8614130434782609, F1: 0.8331143232588699
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7572.32 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7567.69 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6428.90 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.051166,0.937853,0.943182,0.94051
2,0.099600,0.075988,0.935211,0.943182,0.93918
3,0.044700,0.078678,0.923497,0.960227,0.941504
4,0.044700,0.078038,0.965318,0.948864,0.95702
5,0.012500,0.076945,0.957265,0.954545,0.955903


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.969309462915601, Recall: 0.9266503667481663, F1: 0.9474999999999999
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8021.25 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7436.71 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6896.85 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.014207,0.975741,0.991781,0.983696
2,0.051400,0.010762,0.986301,0.986301,0.986301
3,0.027800,0.017562,0.99169,0.980822,0.986226
4,0.027800,0.017234,0.978378,0.991781,0.985034
5,0.007400,0.016298,0.978378,0.991781,0.985034


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.997624703087886, Recall: 0.9767441860465116, F1: 0.9870740305522914

=== Final Cross-Validation Results ===
Average Precision: 0.9245166473616374
Average Recall: 0.9216025320909796
Average F1 Score: 0.9225627846037204
Average epoch time: 13.97s ± 0.31s
Total training time: 3.5 minutes
Peak memory usage: 3443.0MB
Average batch time: 0.0394s ± 0.0009s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for TUM/GottBERT_base_last:


Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7261.78 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6465.86 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6577.56 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142974,0.796703,0.751295,0.773333
2,0.170500,0.21167,0.854785,0.670984,0.751814
3,0.096700,0.228571,0.774118,0.852332,0.811344
4,0.096700,0.295973,0.772959,0.784974,0.77892
5,0.033400,0.338744,0.770574,0.800518,0.78526


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8096514745308311, Recall: 0.8206521739130435, F1: 0.8151147098515519
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6566.16 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6721.06 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6193.32 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.042795,0.967647,0.934659,0.950867
2,0.104400,0.080136,0.943343,0.946023,0.944681
3,0.048500,0.072839,0.948718,0.946023,0.947368
4,0.048500,0.091387,0.948864,0.948864,0.948864
5,0.018900,0.087557,0.946176,0.948864,0.947518


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9407407407407408, Recall: 0.9315403422982885, F1: 0.9361179361179361
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7760.36 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7046.01 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6721.19 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.035774,0.949735,0.983562,0.966353
2,0.076000,0.046736,0.911168,0.983562,0.945982
3,0.040500,0.039076,0.977208,0.939726,0.958101
4,0.040500,0.067758,0.926893,0.972603,0.949198
5,0.010000,0.067026,0.936,0.961644,0.948649


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.963302752293578, Recall: 0.9767441860465116, F1: 0.9699769053117783

=== Final Cross-Validation Results ===
Average Precision: 0.9045649891883834
Average Recall: 0.9096455674192812
Average F1 Score: 0.9070698504270888
Average epoch time: 13.91s ± 0.14s
Total training time: 3.5 minutes
Peak memory usage: 3443.0MB
Average batch time: 0.0391s ± 0.0004s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for distilbert/distilbert-base-german-cased:


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7828.64 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7172.09 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7924.57 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.166955,0.763098,0.763098,0.763098
2,0.198200,0.188911,0.77221,0.77221,0.77221
3,0.091300,0.235071,0.722868,0.849658,0.781152
4,0.091300,0.251023,0.761161,0.776765,0.768884
5,0.045600,0.265973,0.767494,0.774487,0.770975


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8025751072961373, Recall: 0.8558352402745996, F1: 0.8283499446290143
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8477.41 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 8218.47 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7305.42 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.046199,0.968831,0.934837,0.951531
2,0.105600,0.056018,0.973404,0.917293,0.944516
3,0.052400,0.053112,0.957071,0.949875,0.953459
4,0.052400,0.063402,0.954887,0.954887,0.954887
5,0.021100,0.064282,0.959288,0.944862,0.95202


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9216101694915254, Recall: 0.9436008676789588, F1: 0.932475884244373
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8110.89 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6902.38 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6735.76 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.005492,0.990741,0.995349,0.993039
2,0.044000,0.004305,0.993023,0.993023,0.993023
3,0.023100,0.008525,0.981693,0.997674,0.989619
4,0.023100,0.005888,0.990762,0.997674,0.994206
5,0.008500,0.005475,0.993056,0.997674,0.99536


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 1.0, Recall: 0.9979338842975206, F1: 0.998965873836608

=== Final Cross-Validation Results ===
Average Precision: 0.9080617589292209
Average Recall: 0.932456664083693
Average F1 Score: 0.9199305675699985
Average epoch time: 7.44s ± 0.16s
Total training time: 1.9 minutes
Peak memory usage: 1701.0MB
Average batch time: 0.0210s ± 0.0004s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for GerMedBERT/medbert-512:


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6984.72 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5780.64 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6410.70 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.15883,0.798898,0.692124,0.741688
2,0.177700,0.19168,0.758389,0.809069,0.78291
3,0.070400,0.257181,0.762115,0.825776,0.792669
4,0.070400,0.31891,0.79198,0.754177,0.772616
5,0.019700,0.341037,0.769053,0.794749,0.78169


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8333333333333334, Recall: 0.7848101265822784, F1: 0.8083441981747067
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7054.67 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6522.78 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5989.89 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.043589,0.953608,0.951157,0.952381
2,0.099200,0.069186,0.940874,0.940874,0.940874
3,0.033000,0.046925,0.969072,0.966581,0.967825
4,0.033000,0.060008,0.961735,0.969152,0.965429
5,0.012700,0.063435,0.956743,0.966581,0.961637


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9695550351288056, Recall: 0.9452054794520548, F1: 0.9572254335260115
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7411.89 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6717.99 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6798.16 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.019633,0.968599,0.992574,0.98044
2,0.047000,0.010864,0.97343,0.997525,0.98533
3,0.021200,0.002495,0.995074,1.0,0.997531
4,0.021200,0.004475,0.992629,1.0,0.996301
5,0.003200,0.004381,0.992629,1.0,0.996301


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 0.9932126696832579, Recall: 0.9954648526077098, F1: 0.9943374858437145

=== Final Cross-Validation Results ===
Average Precision: 0.9320336793817989
Average Recall: 0.9084934862140144
Average F1 Score: 0.9199690391814775
Average epoch time: 13.19s ± 0.20s
Total training time: 3.3 minutes
Peak memory usage: 3061.0MB
Average batch time: 0.0374s ± 0.0006s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.

training and results for deepset/gbert-base:


Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6977.26 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6460.11 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7060.34 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.161256,0.804762,0.769932,0.786962
2,0.177500,0.189438,0.83038,0.747153,0.786571
3,0.076400,0.249218,0.743295,0.883827,0.807492
4,0.076400,0.263445,0.798186,0.801822,0.8
5,0.022300,0.291742,0.782241,0.842825,0.811404


Evaluating fold 1


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 1 Results - Precision: 0.8802992518703242, Recall: 0.8077803203661327, F1: 0.8424821002386633
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7563.43 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7423.55 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6673.67 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.021046,0.958537,0.984962,0.97157
2,0.094800,0.021718,0.967901,0.982456,0.975124
3,0.038600,0.033615,0.975124,0.982456,0.978777
4,0.038600,0.036577,0.972772,0.984962,0.978829
5,0.010900,0.037364,0.972705,0.982456,0.977556


Evaluating fold 2


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 2 Results - Precision: 0.9889867841409692, Recall: 0.9739696312364425, F1: 0.9814207650273225
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7594.52 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7112.79 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6661.88 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.002999,0.997674,0.997674,0.997674
2,0.029600,0.006367,0.997674,0.997674,0.997674
3,0.016900,0.00643,0.993056,0.997674,0.99536
4,0.016900,0.007164,0.99536,0.997674,0.996516
5,0.003100,0.008883,0.99536,0.997674,0.996516


Evaluating fold 3


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Fold 3 Results - Precision: 1.0, Recall: 0.9958677685950413, F1: 0.9979296066252588

=== Final Cross-Validation Results ===
Average Precision: 0.9564286786704311
Average Recall: 0.9258725733992055
Average F1 Score: 0.9406108239637483
Average epoch time: 13.19s ± 0.09s
Total training time: 3.3 minutes
Peak memory usage: 3077.0MB
Average batch time: 0.0374s ± 0.0003s
GPUs used: NVIDIA A30
Training complete. Model directory for fold 1 deleted to free memory.
Training complete. Model directory for fold 2 deleted to free memory.
Training complete. Model directory for fold 3 deleted to free memory.



In [8]:
# Run the category-aware ATE k-fold routine once for every checkpoint listed in `models`.
# Settings are identical for all checkpoints: k=3 folds and 5 epochs per fold.
# NOTE(review): rn1/rn2 are presumably random seeds for the data splits — confirm
# against functions/ate_model_train_OB.
for model in models:
    print(f'training and results for {model}:')
    ate_cat_model_kfold(
        data,
        model,
        k=3,
        epochs=5,
        rn1=42,
        rn2=42,
    )
    # Blank line to visually separate the logs of consecutive checkpoints.
    print()

training and results for google-bert/bert-base-german-cased:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7650.82 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6851.91 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7515.64 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.237318,0.727778,0.577093,0.643735
2,0.307000,0.262989,0.738208,0.689427,0.712984
3,0.104100,0.323069,0.690574,0.742291,0.715499
4,0.104100,0.354335,0.75174,0.713656,0.732203
5,0.026400,0.373438,0.716484,0.718062,0.717272


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.86      0.93      0.90        74
    Krankenhaus       0.79      0.69      0.74       131
       Personal       0.68      0.62      0.65        24
 Pflegepersonal       0.84      0.87      0.86        31
anderer Service       0.76      0.43      0.55        65
 mediz. Service       0.85      0.64      0.73       128

      micro avg       0.81      0.69      0.75       453
      macro avg       0.80      0.70      0.74       453
   weighted avg       0.81      0.69      0.74       453

Fold 1 Results - Precision: 0.814621409921671, Recall: 0.6887417218543046, F1: 0.7464114832535885
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7715.04 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7658.91 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6853.50 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.031665,0.956522,0.954217,0.955368
2,0.124300,0.021446,0.966346,0.968675,0.967509
3,0.051700,0.02898,0.973558,0.975904,0.974729
4,0.051700,0.02915,0.966427,0.971084,0.96875
5,0.008800,0.032194,0.954869,0.968675,0.961722


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.98      0.96        81
    Krankenhaus       0.97      0.96      0.96        94
       Personal       0.97      1.00      0.99        39
 Pflegepersonal       0.98      0.98      0.98        48
anderer Service       1.00      0.88      0.94        51
 mediz. Service       0.99      0.96      0.97       164

      micro avg       0.98      0.96      0.97       477
      macro avg       0.98      0.96      0.97       477
   weighted avg       0.98      0.96      0.97       477

Fold 2 Results - Precision: 0.9764957264957265, Recall: 0.9580712788259959, F1: 0.9671957671957672
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7085.84 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6377.44 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6000.89 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.00913,0.990909,0.988662,0.989784
2,0.042500,0.012117,0.984305,0.995465,0.989853
3,0.016300,0.008321,0.993213,0.995465,0.994337
4,0.016300,0.008519,0.99093,0.99093,0.99093
5,0.004400,0.008011,0.993197,0.993197,0.993197


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      1.00      1.00       124
    Krankenhaus       1.00      1.00      1.00       137
       Personal       1.00      1.00      1.00        16
 Pflegepersonal       1.00      0.97      0.99        36
anderer Service       1.00      1.00      1.00        35
 mediz. Service       1.00      1.00      1.00       158

      micro avg       1.00      1.00      1.00       506
      macro avg       1.00      1.00      1.00       506
   weighted avg       1.00      1.00      1.00       506

Fold 3 Results - Precision: 1.0, Recall: 0.9980237154150198, F1: 0.9990108803165183

=== Final Cross-Validation Results ===
Average Precision: 0.930372378805799
Average Recall: 0.8816122386984402
Average F1 Score: 0.9042060435886246
Average epoch time: 13.27s ± 0.23s
Total training time: 3.3 minutes
Peak mem

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6801.56 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6699.02 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7614.46 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.262493,0.708333,0.580866,0.638298
2,0.331300,0.259184,0.70726,0.687927,0.69746
3,0.124400,0.314788,0.700651,0.735763,0.717778
4,0.124400,0.338272,0.731884,0.690205,0.710434
5,0.052200,0.363588,0.697168,0.728929,0.712695


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.92      0.92      0.92        75
    Krankenhaus       0.80      0.80      0.80       128
       Personal       0.69      0.41      0.51        22
 Pflegepersonal       0.62      0.84      0.71        31
anderer Service       0.70      0.39      0.50        66
 mediz. Service       0.74      0.75      0.74       115

      micro avg       0.77      0.73      0.75       437
      macro avg       0.75      0.68      0.70       437
   weighted avg       0.77      0.73      0.74       437

Fold 1 Results - Precision: 0.7737226277372263, Recall: 0.7276887871853547, F1: 0.75
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7434.69 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6959.71 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6304.59 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.088692,0.871671,0.902256,0.8867
2,0.151400,0.117112,0.867647,0.887218,0.877323
3,0.068400,0.114442,0.890819,0.899749,0.895262
4,0.068400,0.114251,0.888337,0.897243,0.892768
5,0.020200,0.120885,0.8925,0.894737,0.893617


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      1.00      0.94        82
    Krankenhaus       0.91      0.98      0.94        91
       Personal       0.94      0.87      0.90        38
 Pflegepersonal       0.93      0.90      0.91        48
anderer Service       0.93      0.82      0.87        51
 mediz. Service       0.93      0.96      0.94       151

      micro avg       0.92      0.94      0.93       461
      macro avg       0.92      0.92      0.92       461
   weighted avg       0.92      0.94      0.93       461

Fold 2 Results - Precision: 0.9175475687103594, Recall: 0.9414316702819957, F1: 0.9293361884368309
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6957.35 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5597.07 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6209.37 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.01024,0.98829,0.981395,0.984831
2,0.071300,0.026597,0.945701,0.972093,0.958716
3,0.035300,0.028179,0.952703,0.983721,0.967963
4,0.035300,0.033446,0.952489,0.97907,0.965596
5,0.008700,0.030142,0.952489,0.97907,0.965596


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.99      1.00      1.00       122
    Krankenhaus       0.99      0.99      0.99       137
       Personal       0.94      1.00      0.97        17
 Pflegepersonal       0.97      0.97      0.97        33
anderer Service       1.00      0.97      0.98        32
 mediz. Service       1.00      0.97      0.99       143

      micro avg       0.99      0.98      0.99       484
      macro avg       0.98      0.98      0.98       484
   weighted avg       0.99      0.98      0.99       484

Fold 3 Results - Precision: 0.9896049896049897, Recall: 0.9834710743801653, F1: 0.9865284974093265

=== Final Cross-Validation Results ===
Average Precision: 0.8936250620175251
Average Recall: 0.8841971772825051
Average F1 Score: 0.8886215619487191
Average epoch time: 13.41s ± 0.22s
Total training time: 3.4 

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 5794.71 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5455.67 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5990.70 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.243572,0.67013,0.601399,0.633907
2,0.310700,0.268538,0.787966,0.641026,0.706941
3,0.125400,0.274023,0.661795,0.738928,0.698238
4,0.125400,0.28988,0.763682,0.715618,0.738869
5,0.053000,0.321191,0.712719,0.757576,0.734463


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.91      0.92        81
    Krankenhaus       0.80      0.67      0.73       129
       Personal       0.54      0.59      0.57        22
 Pflegepersonal       0.76      0.76      0.76        29
anderer Service       0.70      0.37      0.48        62
 mediz. Service       0.90      0.60      0.72       117

      micro avg       0.82      0.65      0.73       440
      macro avg       0.77      0.65      0.70       440
   weighted avg       0.82      0.65      0.72       440

Fold 1 Results - Precision: 0.8228571428571428, Recall: 0.6545454545454545, F1: 0.729113924050633
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6777.72 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5948.71 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5619.90 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.045558,0.952261,0.931204,0.941615
2,0.134200,0.052476,0.950372,0.941032,0.945679
3,0.058700,0.062286,0.919903,0.931204,0.925519
4,0.058700,0.063961,0.945274,0.933661,0.939431
5,0.017300,0.066859,0.936275,0.938575,0.937423


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      1.00      0.96        92
    Krankenhaus       0.99      0.92      0.95        88
       Personal       0.97      0.91      0.94        35
 Pflegepersonal       0.93      0.90      0.91        48
anderer Service       0.93      0.80      0.86        54
 mediz. Service       0.96      0.96      0.96       140

      micro avg       0.96      0.93      0.94       457
      macro avg       0.95      0.91      0.93       457
   weighted avg       0.96      0.93      0.94       457

Fold 2 Results - Precision: 0.9550561797752809, Recall: 0.9299781181619255, F1: 0.942350332594235
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6795.24 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6397.99 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5854.05 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.024989,0.96789,0.979118,0.973472
2,0.069600,0.022265,0.974537,0.976798,0.975666
3,0.037900,0.032156,0.969977,0.974478,0.972222
4,0.037900,0.035959,0.965831,0.983759,0.974713
5,0.006300,0.034505,0.968037,0.983759,0.975834


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      1.00      1.00       132
    Krankenhaus       0.97      1.00      0.99       136
       Personal       1.00      0.94      0.97        17
 Pflegepersonal       1.00      0.94      0.97        34
anderer Service       0.97      1.00      0.99        34
 mediz. Service       0.96      0.99      0.98       145

      micro avg       0.98      0.99      0.99       498
      macro avg       0.98      0.98      0.98       498
   weighted avg       0.98      0.99      0.99       498

Fold 3 Results - Precision: 0.9782178217821782, Recall: 0.9919678714859438, F1: 0.9850448654037887

=== Final Cross-Validation Results ===
Average Precision: 0.9187103814715339
Average Recall: 0.8588304813977746
Average F1 Score: 0.8855030406828854
Average epoch time: 13.32s ± 0.15s
Total training time: 3.3 

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6197.97 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5839.62 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5861.70 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.283748,0.705556,0.50497,0.588644
2,0.406000,0.252001,0.702355,0.652087,0.676289
3,0.181900,0.312433,0.636213,0.761431,0.693213
4,0.181900,0.306125,0.707635,0.755467,0.730769
5,0.100500,0.351083,0.671329,0.763419,0.714419


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.83      0.89      0.86        79
    Krankenhaus       0.72      0.87      0.79       127
       Personal       0.46      0.42      0.44        26
 Pflegepersonal       0.94      0.86      0.90        37
anderer Service       0.68      0.39      0.49        83
 mediz. Service       0.77      0.72      0.75       137

      micro avg       0.75      0.72      0.74       489
      macro avg       0.74      0.69      0.70       489
   weighted avg       0.75      0.72      0.73       489

Fold 1 Results - Precision: 0.7547974413646056, Recall: 0.7239263803680982, F1: 0.7390396659707725
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 4756.17 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6913.01 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6447.96 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.129821,0.822811,0.827869,0.825332
2,0.175500,0.130585,0.831325,0.848361,0.839757
3,0.107600,0.137708,0.819578,0.875,0.846383
4,0.107600,0.116006,0.882231,0.875,0.878601
5,0.052600,0.121428,0.883436,0.885246,0.88434


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      1.00      0.95        99
    Krankenhaus       0.97      0.97      0.97       104
       Personal       0.90      0.86      0.88        44
 Pflegepersonal       0.98      0.89      0.93        63
anderer Service       0.83      0.73      0.78        62
 mediz. Service       0.87      0.92      0.89       165

      micro avg       0.91      0.91      0.91       537
      macro avg       0.91      0.90      0.90       537
   weighted avg       0.91      0.91      0.91       537

Fold 2 Results - Precision: 0.9059040590405905, Recall: 0.9143389199255121, F1: 0.9101019462465246
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7815.55 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7449.37 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6736.46 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.024765,0.963532,0.980469,0.971926
2,0.112100,0.020705,0.965451,0.982422,0.973863
3,0.065400,0.022237,0.961977,0.988281,0.974952
4,0.065400,0.028158,0.950758,0.980469,0.965385
5,0.022800,0.023703,0.963671,0.984375,0.973913


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.96      1.00      0.98       136
    Krankenhaus       0.99      0.99      0.99       141
       Personal       0.89      0.80      0.84        20
 Pflegepersonal       1.00      0.95      0.98        43
anderer Service       0.92      1.00      0.96        45
 mediz. Service       0.97      1.00      0.98       171

      micro avg       0.96      0.99      0.98       556
      macro avg       0.95      0.96      0.95       556
   weighted avg       0.97      0.99      0.97       556

Fold 3 Results - Precision: 0.9647887323943662, Recall: 0.9856115107913669, F1: 0.9750889679715302

=== Final Cross-Validation Results ===
Average Precision: 0.8751634109331873
Average Recall: 0.8746256036949923
Average F1 Score: 0.8747435267296092
Average epoch time: 21.61s ± 0.40s
Total training time: 5.4 

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6999.44 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6264.89 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6559.91 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.236775,0.731293,0.556995,0.632353
2,0.311300,0.224757,0.756098,0.642487,0.694678
3,0.148600,0.247723,0.68764,0.792746,0.736462
4,0.148600,0.259495,0.748718,0.756477,0.752577
5,0.066900,0.276931,0.721271,0.764249,0.742138


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.92      0.89        63
    Krankenhaus       0.77      0.73      0.75       111
       Personal       0.50      0.42      0.46        19
 Pflegepersonal       1.00      0.81      0.90        27
anderer Service       0.77      0.39      0.52        51
 mediz. Service       0.72      0.76      0.74        97

      micro avg       0.78      0.71      0.74       368
      macro avg       0.77      0.67      0.71       368
   weighted avg       0.78      0.71      0.74       368

Fold 1 Results - Precision: 0.775811209439528, Recall: 0.7146739130434783, F1: 0.743988684582744
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7834.61 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6845.25 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6737.55 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.082854,0.919162,0.872159,0.895044
2,0.142200,0.097967,0.887671,0.920455,0.903766
3,0.075000,0.101255,0.895317,0.923295,0.909091
4,0.075000,0.117176,0.901685,0.911932,0.90678
5,0.028500,0.109767,0.9,0.920455,0.910112


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      0.99      0.96        76
    Krankenhaus       0.97      0.99      0.98        87
       Personal       0.83      0.97      0.89        35
 Pflegepersonal       0.95      0.90      0.92        40
anderer Service       0.89      0.76      0.82        45
 mediz. Service       0.92      0.94      0.93       126

      micro avg       0.92      0.94      0.93       409
      macro avg       0.91      0.92      0.92       409
   weighted avg       0.92      0.94      0.93       409

Fold 2 Results - Precision: 0.9228915662650602, Recall: 0.9364303178484108, F1: 0.9296116504854369
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7722.14 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7309.58 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6461.78 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.017412,0.97043,0.989041,0.979647
2,0.073200,0.025338,0.954907,0.986301,0.97035
3,0.040800,0.033182,0.967742,0.986301,0.976934
4,0.040800,0.032743,0.962667,0.989041,0.975676
5,0.010100,0.034151,0.962667,0.989041,0.975676


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.99      1.00      1.00       107
    Krankenhaus       0.96      1.00      0.98       124
       Personal       1.00      0.94      0.97        16
 Pflegepersonal       1.00      1.00      1.00        29
anderer Service       1.00      0.97      0.98        29
 mediz. Service       0.99      0.98      0.99       125

      micro avg       0.98      0.99      0.99       430
      macro avg       0.99      0.98      0.99       430
   weighted avg       0.98      0.99      0.99       430

Fold 3 Results - Precision: 0.9838337182448037, Recall: 0.9906976744186047, F1: 0.9872537659327927

=== Final Cross-Validation Results ===
Average Precision: 0.8941788313164639
Average Recall: 0.880600635103498
Average F1 Score: 0.8869513670003245
Average epoch time: 13.98s ± 0.23s
Total training time: 3.5 m

Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7185.13 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6127.76 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7065.06 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.224291,0.737542,0.57513,0.646288
2,0.312100,0.217883,0.762463,0.673575,0.715268
3,0.142600,0.262052,0.680653,0.756477,0.716564
4,0.142600,0.30311,0.754875,0.702073,0.727517
5,0.061500,0.317908,0.711165,0.759067,0.734336


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.92      0.94        63
    Krankenhaus       0.79      0.83      0.81       111
       Personal       0.73      0.42      0.53        19
 Pflegepersonal       0.75      0.89      0.81        27
anderer Service       0.67      0.43      0.52        51
 mediz. Service       0.72      0.76      0.74        97

      micro avg       0.78      0.76      0.77       368
      macro avg       0.77      0.71      0.73       368
   weighted avg       0.78      0.76      0.76       368

Fold 1 Results - Precision: 0.7808988764044944, Recall: 0.7554347826086957, F1: 0.7679558011049723
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8154.55 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7710.54 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6727.00 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.105623,0.908228,0.815341,0.859281
2,0.133700,0.098942,0.897222,0.917614,0.907303
3,0.066300,0.105091,0.907303,0.917614,0.912429
4,0.066300,0.115734,0.917379,0.914773,0.916074
5,0.021700,0.115664,0.915966,0.928977,0.922426


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      1.00      0.97        76
    Krankenhaus       0.98      0.95      0.97        87
       Personal       0.89      0.91      0.90        35
 Pflegepersonal       0.95      0.97      0.96        40
anderer Service       0.89      0.76      0.82        45
 mediz. Service       0.93      0.97      0.95       126

      micro avg       0.94      0.94      0.94       409
      macro avg       0.93      0.93      0.93       409
   weighted avg       0.94      0.94      0.94       409

Fold 2 Results - Precision: 0.9391727493917275, Recall: 0.9437652811735942, F1: 0.9414634146341463
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8208.25 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7310.26 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6609.95 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.036652,0.942857,0.994521,0.968
2,0.075900,0.016494,0.97035,0.986301,0.978261
3,0.038700,0.020675,0.973262,0.99726,0.985115
4,0.038700,0.02241,0.973046,0.989041,0.980978
5,0.009100,0.021728,0.973046,0.989041,0.980978


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.99      1.00      1.00       107
    Krankenhaus       0.95      1.00      0.98       124
       Personal       1.00      1.00      1.00        16
 Pflegepersonal       1.00      0.97      0.98        29
anderer Service       0.91      1.00      0.95        29
 mediz. Service       0.99      0.97      0.98       125

      micro avg       0.97      0.99      0.98       430
      macro avg       0.97      0.99      0.98       430
   weighted avg       0.98      0.99      0.98       430

Fold 3 Results - Precision: 0.9747706422018348, Recall: 0.9883720930232558, F1: 0.9815242494226328

=== Final Cross-Validation Results ===
Average Precision: 0.8982807559993522
Average Recall: 0.8958573856018486
Average F1 Score: 0.8969811550539172
Average epoch time: 13.89s ± 0.16s
Total training time: 3.5 

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7235.80 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6203.28 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7042.46 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.260666,0.646688,0.531088,0.583215
2,0.374100,0.20723,0.739264,0.624352,0.676966
3,0.175300,0.258859,0.650685,0.738342,0.691748
4,0.175300,0.26565,0.719424,0.777202,0.747198
5,0.084900,0.301255,0.713936,0.756477,0.734591


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.94      0.91        63
    Krankenhaus       0.75      0.75      0.75       111
       Personal       0.57      0.42      0.48        19
 Pflegepersonal       0.96      0.81      0.88        27
anderer Service       0.68      0.37      0.48        51
 mediz. Service       0.64      0.73      0.68        97

      micro avg       0.74      0.71      0.73       368
      macro avg       0.75      0.67      0.70       368
   weighted avg       0.74      0.71      0.72       368

Fold 1 Results - Precision: 0.7401129943502824, Recall: 0.7119565217391305, F1: 0.7257617728531855
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8163.12 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7625.26 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6727.89 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.105188,0.824873,0.923295,0.871314
2,0.152200,0.101301,0.836317,0.928977,0.880215
3,0.084700,0.107942,0.85752,0.923295,0.889193
4,0.084700,0.115332,0.885154,0.897727,0.891396
5,0.032000,0.117941,0.87062,0.917614,0.893499


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.92      1.00      0.96        76
    Krankenhaus       0.86      0.94      0.90        87
       Personal       0.94      0.89      0.91        35
 Pflegepersonal       0.82      0.93      0.87        40
anderer Service       0.79      0.69      0.74        45
 mediz. Service       0.91      0.93      0.92       126

      micro avg       0.88      0.91      0.90       409
      macro avg       0.87      0.90      0.88       409
   weighted avg       0.88      0.91      0.90       409

Fold 2 Results - Precision: 0.8841607565011821, Recall: 0.9144254278728606, F1: 0.8990384615384617
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7966.16 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7509.46 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7056.76 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.020625,0.962366,0.980822,0.971506
2,0.086000,0.022006,0.956989,0.975342,0.966079
3,0.053100,0.040588,0.937337,0.983562,0.959893
4,0.053100,0.028388,0.949602,0.980822,0.96496
5,0.010300,0.026195,0.959677,0.978082,0.968792


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.99      0.98      0.99       107
    Krankenhaus       0.95      1.00      0.98       124
       Personal       0.94      1.00      0.97        16
 Pflegepersonal       0.96      0.93      0.95        29
anderer Service       0.97      1.00      0.98        29
 mediz. Service       0.99      0.98      0.99       125

      micro avg       0.97      0.99      0.98       430
      macro avg       0.97      0.98      0.98       430
   weighted avg       0.98      0.99      0.98       430

Fold 3 Results - Precision: 0.9747126436781609, Recall: 0.986046511627907, F1: 0.9803468208092486

=== Final Cross-Validation Results ===
Average Precision: 0.8663287981765418
Average Recall: 0.8708094870799661
Average F1 Score: 0.868382351733632
Average epoch time: 13.47s ± 0.09s
Total training time: 3.4 mi

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7994.30 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7203.76 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7915.90 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.26769,0.656915,0.562642,0.606135
2,0.391700,0.235984,0.706767,0.642369,0.673031
3,0.149100,0.27016,0.647303,0.710706,0.677524
4,0.149100,0.276616,0.730198,0.671982,0.699881
5,0.079000,0.288829,0.720195,0.67426,0.696471


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.87      0.87        75
    Krankenhaus       0.77      0.66      0.71       128
       Personal       0.57      0.36      0.44        22
 Pflegepersonal       0.77      0.77      0.77        31
anderer Service       0.72      0.39      0.51        66
 mediz. Service       0.67      0.68      0.67       115

      micro avg       0.75      0.65      0.70       437
      macro avg       0.73      0.62      0.66       437
   weighted avg       0.74      0.65      0.69       437

Fold 1 Results - Precision: 0.7480314960629921, Recall: 0.6521739130434783, F1: 0.6968215158924206
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8062.29 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7515.71 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7231.34 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.062375,0.927461,0.897243,0.912102
2,0.138700,0.076717,0.909326,0.879699,0.894268
3,0.073800,0.063025,0.915385,0.894737,0.904943
4,0.073800,0.075554,0.918367,0.902256,0.91024
5,0.033200,0.073607,0.927649,0.899749,0.913486


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.92      0.98      0.95        82
    Krankenhaus       0.88      0.93      0.90        91
       Personal       0.93      0.71      0.81        38
 Pflegepersonal       0.98      0.88      0.92        48
anderer Service       0.93      0.80      0.86        51
 mediz. Service       0.97      0.91      0.94       151

      micro avg       0.93      0.90      0.91       461
      macro avg       0.93      0.87      0.90       461
   weighted avg       0.94      0.90      0.91       461

Fold 2 Results - Precision: 0.9343891402714932, Recall: 0.89587852494577, F1: 0.9147286821705427
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 8159.54 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6932.87 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6546.58 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.010926,0.990544,0.974419,0.982415
2,0.065200,0.016147,0.986014,0.983721,0.984866
3,0.035100,0.011173,0.988426,0.993023,0.990719
4,0.035100,0.011125,0.990698,0.990698,0.990698
5,0.011800,0.011581,0.990698,0.990698,0.990698


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       1.00      0.99      1.00       122
    Krankenhaus       1.00      0.99      0.99       137
       Personal       1.00      1.00      1.00        17
 Pflegepersonal       1.00      1.00      1.00        33
anderer Service       1.00      1.00      1.00        32
 mediz. Service       0.99      0.99      0.99       143

      micro avg       1.00      0.99      0.99       484
      macro avg       1.00      0.99      1.00       484
   weighted avg       1.00      0.99      0.99       484

Fold 3 Results - Precision: 0.9979166666666667, Recall: 0.9896694214876033, F1: 0.9937759336099585

=== Final Cross-Validation Results ===
Average Precision: 0.8934457676670506
Average Recall: 0.8459072864922837
Average F1 Score: 0.8684420438909739
Average epoch time: 7.27s ± 0.07s
Total training time: 1.8 m

Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7219.28 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6652.97 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7659.63 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.256796,0.731861,0.553699,0.630435
2,0.314700,0.257816,0.735294,0.656325,0.693569
3,0.124100,0.291026,0.692841,0.71599,0.704225
4,0.124100,0.325889,0.735516,0.696897,0.715686
5,0.046500,0.360089,0.697941,0.727924,0.712617


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      0.88      0.89        65
    Krankenhaus       0.76      0.73      0.74       114
       Personal       0.59      0.52      0.55        25
 Pflegepersonal       0.77      0.80      0.79        30
anderer Service       0.70      0.46      0.56        69
 mediz. Service       0.81      0.60      0.69        92

      micro avg       0.78      0.67      0.72       395
      macro avg       0.76      0.66      0.70       395
   weighted avg       0.77      0.67      0.71       395

Fold 1 Results - Precision: 0.7787610619469026, Recall: 0.6683544303797468, F1: 0.7193460490463216
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7223.17 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6836.48 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 5984.07 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.058222,0.889157,0.948586,0.91791
2,0.133600,0.060433,0.935567,0.933162,0.934363
3,0.063100,0.050863,0.948187,0.940874,0.944516
4,0.063100,0.075324,0.957333,0.922879,0.939791
5,0.020000,0.062934,0.950262,0.933162,0.941634


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      1.00      0.97        77
    Krankenhaus       0.94      0.96      0.95       103
       Personal       0.97      0.88      0.93        43
 Pflegepersonal       0.98      0.98      0.98        49
anderer Service       0.96      0.90      0.93        49
 mediz. Service       0.94      0.98      0.96       117

      micro avg       0.95      0.96      0.96       438
      macro avg       0.96      0.95      0.95       438
   weighted avg       0.95      0.96      0.96       438

Fold 2 Results - Precision: 0.9524886877828054, Recall: 0.9611872146118722, F1: 0.9568181818181818
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6934.11 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 6675.35 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6377.19 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.018382,0.975248,0.975248,0.975248
2,0.064000,0.0102,0.982759,0.987624,0.985185
3,0.025600,0.025478,0.943128,0.985149,0.96368
4,0.025600,0.016028,0.96837,0.985149,0.976687
5,0.004000,0.015135,0.97066,0.982673,0.97663


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.97      1.00      0.98       118
    Krankenhaus       0.98      0.99      0.99       116
       Personal       0.94      0.88      0.91        17
 Pflegepersonal       1.00      1.00      1.00        40
anderer Service       1.00      1.00      1.00        38
 mediz. Service       0.98      0.98      0.98       112

      micro avg       0.98      0.99      0.98       441
      macro avg       0.98      0.98      0.98       441
   weighted avg       0.98      0.99      0.98       441

Fold 3 Results - Precision: 0.9797752808988764, Recall: 0.9886621315192744, F1: 0.984198645598194

=== Final Cross-Validation Results ===
Average Precision: 0.9036750102095281
Average Recall: 0.8727345921702977
Average F1 Score: 0.8867876254875658
Average epoch time: 12.85s ± 0.12s
Total training time: 3.2 m

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Starting fold 1/3


Map: 100%|██████████| 674/674 [00:00<00:00, 6826.59 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 5969.28 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7066.68 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 1


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.24147,0.6775,0.617312,0.646007
2,0.320000,0.235812,0.7425,0.676538,0.707986
3,0.118600,0.293347,0.672065,0.756264,0.711683
4,0.118600,0.290165,0.731544,0.744875,0.738149
5,0.046000,0.316202,0.716102,0.769932,0.742042


Evaluating fold 1


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.91      0.89        75
    Krankenhaus       0.81      0.70      0.75       128
       Personal       0.43      0.27      0.33        22
 Pflegepersonal       0.83      0.77      0.80        31
anderer Service       0.74      0.47      0.57        66
 mediz. Service       0.83      0.70      0.76       115

      micro avg       0.81      0.68      0.74       437
      macro avg       0.75      0.64      0.68       437
   weighted avg       0.80      0.68      0.73       437

Fold 1 Results - Precision: 0.8081081081081081, Recall: 0.6842105263157895, F1: 0.7410161090458489
Starting fold 2/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7982.95 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7825.02 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 6949.32 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 2


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.075002,0.873303,0.967419,0.917955
2,0.125600,0.042379,0.945,0.947368,0.946183
3,0.054700,0.046555,0.945545,0.957393,0.951432
4,0.054700,0.054789,0.935323,0.942356,0.938826
5,0.017600,0.054736,0.92665,0.949875,0.938119


Evaluating fold 2


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.90      1.00      0.95        82
    Krankenhaus       0.98      0.93      0.96        91
       Personal       0.97      0.87      0.92        38
 Pflegepersonal       1.00      0.98      0.99        48
anderer Service       0.92      0.88      0.90        51
 mediz. Service       0.98      0.97      0.98       151

      micro avg       0.96      0.95      0.96       461
      macro avg       0.96      0.94      0.95       461
   weighted avg       0.96      0.95      0.96       461

Fold 2 Results - Precision: 0.9585152838427947, Recall: 0.9522776572668112, F1: 0.9553862894450489
Starting fold 3/3


Map: 100%|██████████| 674/674 [00:00<00:00, 7889.22 examples/s]
Map: 100%|██████████| 168/168 [00:00<00:00, 7427.07 examples/s]
Map: 100%|██████████| 169/169 [00:00<00:00, 7037.98 examples/s]
  trainer = Trainer(


['B-mediz. Service' 'B-anderer Service' 'B-Arzt' 'B-Krankenhaus' 'O'
 'B-Personal' 'B-Pflegepersonal']
{0: 4.959959280624364, 1: 10.546176046176047, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 0.1596439493228484, 5: 18.317042606516292, 6: 14.014381591562799}
Training fold 3


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.012702,0.983871,0.993023,0.988426
2,0.055500,0.006445,0.981567,0.990698,0.986111
3,0.022400,0.010198,0.990741,0.995349,0.993039
4,0.022400,0.01278,0.986175,0.995349,0.990741
5,0.005400,0.010794,0.986175,0.995349,0.990741


Evaluating fold 3


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.99      1.00      1.00       122
    Krankenhaus       1.00      1.00      1.00       137
       Personal       1.00      1.00      1.00        17
 Pflegepersonal       1.00      1.00      1.00        33
anderer Service       1.00      1.00      1.00        32
 mediz. Service       0.99      0.99      0.99       143

      micro avg       0.99      1.00      0.99       484
      macro avg       1.00      1.00      1.00       484
   weighted avg       0.99      1.00      0.99       484

Fold 3 Results - Precision: 0.9938144329896907, Recall: 0.9958677685950413, F1: 0.9948400412796697

=== Final Cross-Validation Results ===
Average Precision: 0.9201459416468646
Average Recall: 0.877451984059214
Average F1 Score: 0.8970808132568558
Average epoch time: 12.83s ± 0.09s
Total training time: 3.2 m

### 3. Performance of the best models (standard ATE: dbmdz BERT cased; category-aware ATE: GBERT)

In [5]:
# Epoch sweep for the standard-ATE model built on the cased dbmdz German BERT.
# Both random seeds stay fixed at 42, so the epoch count is the only setting
# that varies between runs.
# GPU used for the recorded output: Tesla V100-PCIE-32GB
dbmdz_checkpoint = "dbmdz/bert-base-german-cased"
epoch_grid = (5, 6, 7, 8, 10, 12)

for epoch in epoch_grid:
    print(f'training and results for DBMDZ Bert for {epoch} epochs:')
    ate_model(data, dbmdz_checkpoint, rn1=42, rn2=42, epochs=epoch)
    print()  # blank line between consecutive runs

training and results for DBMDZ Bert for 5 epochs:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 928.72 examples/s] 
Map: 100%|██████████| 101/101 [00:00<00:00, 3498.46 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136408,0.808765,0.799213,0.80396
2,0.202300,0.166257,0.831224,0.775591,0.802444
3,0.096400,0.234171,0.82906,0.76378,0.795082
4,0.040000,0.269962,0.804598,0.826772,0.815534
5,0.023100,0.305297,0.808765,0.799213,0.80396


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3219.59 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.78      0.83       315

   micro avg       0.88      0.78      0.83       315
   macro avg       0.88      0.78      0.83       315
weighted avg       0.88      0.78      0.83       315

Precision Score: 0.8785714285714286
Recall Score: 0.780952380952381
F1 Score: 0.8268907563025211
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4074.92 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3255.75 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.141124,0.845188,0.795276,0.819473
2,0.202300,0.150045,0.844538,0.791339,0.817073
3,0.097100,0.205839,0.838583,0.838583,0.838583
4,0.041100,0.245695,0.81749,0.846457,0.831721
5,0.017200,0.287044,0.828244,0.854331,0.841085
6,0.017200,0.308725,0.830769,0.850394,0.840467


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3065.75 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.77      0.83       315

   micro avg       0.89      0.77      0.83       315
   macro avg       0.89      0.77      0.83       315
weighted avg       0.89      0.77      0.83       315

Precision Score: 0.8901098901098901
Recall Score: 0.7714285714285715
F1 Score: 0.8265306122448981
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4082.01 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3243.93 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.143036,0.809129,0.767717,0.787879
2,0.197700,0.153213,0.853556,0.80315,0.827586
3,0.095200,0.209843,0.857143,0.874016,0.865497
4,0.037200,0.262975,0.806818,0.838583,0.822394
5,0.020800,0.32322,0.823077,0.84252,0.832685
6,0.020800,0.367111,0.847107,0.807087,0.826613
7,0.007200,0.367124,0.833333,0.826772,0.83004


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3260.24 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.80      0.84       315

   micro avg       0.88      0.80      0.84       315
   macro avg       0.88      0.80      0.84       315
weighted avg       0.88      0.80      0.84       315

Precision Score: 0.8754325259515571
Recall Score: 0.8031746031746032
F1 Score: 0.8377483443708609
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3708.53 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3034.08 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.158024,0.866972,0.744094,0.800847
2,0.202100,0.147367,0.845833,0.799213,0.821862
3,0.095600,0.19551,0.843373,0.826772,0.83499
4,0.042800,0.203137,0.827839,0.889764,0.857685
5,0.020600,0.294637,0.849593,0.822835,0.836
6,0.020600,0.341956,0.883178,0.744094,0.807692
7,0.004700,0.360579,0.837398,0.811024,0.824
8,0.002100,0.355876,0.829365,0.822835,0.826087


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3246.98 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.82      0.83       315

   micro avg       0.84      0.82      0.83       315
   macro avg       0.84      0.82      0.83       315
weighted avg       0.84      0.82      0.83       315

Precision Score: 0.8354838709677419
Recall Score: 0.8222222222222222
F1 Score: 0.8288
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3991.98 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3447.07 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.136665,0.788104,0.834646,0.810707
2,0.200900,0.149008,0.848361,0.814961,0.831325
3,0.096400,0.226318,0.816176,0.874016,0.844106
4,0.036600,0.270627,0.77972,0.877953,0.825926
5,0.022400,0.330981,0.826923,0.846457,0.836576
6,0.022400,0.369434,0.83691,0.767717,0.800821
7,0.007200,0.363719,0.808989,0.850394,0.829175
8,0.004600,0.408145,0.804511,0.84252,0.823077
9,0.001400,0.410881,0.805147,0.862205,0.8327
10,0.002300,0.408051,0.805054,0.877953,0.839925


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3100.59 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.79      0.84       315

   micro avg       0.88      0.79      0.84       315
   macro avg       0.88      0.79      0.84       315
weighted avg       0.88      0.79      0.84       315

Precision Score: 0.8833922261484098
Recall Score: 0.7936507936507936
F1 Score: 0.8361204013377925
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 3930.97 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3227.52 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.157434,0.807377,0.775591,0.791165
2,0.196500,0.155365,0.843621,0.807087,0.82495
3,0.091300,0.222058,0.825726,0.783465,0.80404
4,0.035200,0.261567,0.795539,0.84252,0.818356
5,0.021500,0.301959,0.807229,0.791339,0.799205
6,0.021500,0.388211,0.78626,0.811024,0.79845
7,0.007600,0.430262,0.781609,0.80315,0.792233
8,0.003000,0.408844,0.807229,0.791339,0.799205
9,0.002200,0.406708,0.818565,0.76378,0.790224
10,0.002100,0.407874,0.795539,0.84252,0.818356


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 2983.81 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.74      0.81       315

   micro avg       0.89      0.74      0.81       315
   macro avg       0.89      0.74      0.81       315
weighted avg       0.89      0.74      0.81       315

Precision Score: 0.8893129770992366
Recall Score: 0.7396825396825397
F1 Score: 0.807625649913345
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

In [None]:
# Same epoch sweep for the cased dbmdz German BERT (standard ATE), with both
# random seeds held at 42 so that only the number of epochs changes per run.
# GPU used for the recorded output: NVIDIA GeForce RTX 2080 Ti
dbmdz_checkpoint = "dbmdz/bert-base-german-cased"
epoch_grid = (5, 6, 7, 8, 10, 12)

for epoch in epoch_grid:
    print(f'training and results for DBMDZ Bert for {epoch} epochs:')
    ate_model(data, dbmdz_checkpoint, rn1=42, rn2=42, epochs=epoch)
    print()  # blank line between consecutive runs

training and results for DBMDZ Bert for 5 epochs:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4505.23 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3428.63 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.143791,0.802419,0.783465,0.792829
2,0.206300,0.158432,0.8625,0.814961,0.838057
3,0.097600,0.234388,0.821293,0.850394,0.83559
4,0.038900,0.255738,0.823077,0.84252,0.832685
5,0.024400,0.295568,0.822394,0.838583,0.830409


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3553.61 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.93      0.74      0.82       315

   micro avg       0.93      0.74      0.82       315
   macro avg       0.93      0.74      0.82       315
weighted avg       0.93      0.74      0.82       315

Precision Score: 0.9282868525896414
Recall Score: 0.7396825396825397
F1 Score: 0.823321554770318
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4451.00 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3736.00 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.141138,0.84188,0.775591,0.807377
2,0.203100,0.145927,0.848,0.834646,0.84127
3,0.099100,0.217954,0.848,0.834646,0.84127
4,0.039200,0.242002,0.818868,0.854331,0.836224
5,0.017500,0.289393,0.819549,0.858268,0.838462
6,0.017500,0.305856,0.828244,0.854331,0.841085


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3521.20 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.88      0.72      0.79       315

   micro avg       0.88      0.72      0.79       315
   macro avg       0.88      0.72      0.79       315
weighted avg       0.88      0.72      0.79       315

Precision Score: 0.8828125
Recall Score: 0.7174603174603175
F1 Score: 0.7915936952714535
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O',

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4411.86 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3691.88 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.142294,0.8107,0.775591,0.792757
2,0.198000,0.146886,0.85259,0.84252,0.847525
3,0.095300,0.2189,0.84127,0.834646,0.837945
4,0.036700,0.254288,0.811538,0.830709,0.821012
5,0.022900,0.303113,0.816733,0.807087,0.811881
6,0.022900,0.351665,0.828571,0.799213,0.813627
7,0.006800,0.368225,0.815686,0.818898,0.817289


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3585.57 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.75      0.82       315

   micro avg       0.90      0.75      0.82       315
   macro avg       0.90      0.75      0.82       315
weighted avg       0.90      0.75      0.82       315

Precision Score: 0.9003831417624522
Recall Score: 0.746031746031746
F1 Score: 0.8159722222222222
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4493.67 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3684.08 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.154031,0.852018,0.748031,0.796646
2,0.201700,0.150253,0.847107,0.807087,0.826613
3,0.096600,0.200131,0.820312,0.826772,0.823529
4,0.043300,0.194709,0.837736,0.874016,0.855491
5,0.019400,0.292323,0.853659,0.826772,0.84
6,0.019400,0.345301,0.862832,0.767717,0.8125
7,0.005200,0.353689,0.855319,0.791339,0.822086
8,0.004200,0.358837,0.834008,0.811024,0.822355


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3514.64 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.82      0.83       315

   micro avg       0.85      0.82      0.83       315
   macro avg       0.85      0.82      0.83       315
weighted avg       0.85      0.82      0.83       315

Precision Score: 0.8481848184818482
Recall Score: 0.8158730158730159
F1 Score: 0.8317152103559872
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4449.04 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3716.53 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.137809,0.784387,0.830709,0.806883
2,0.201500,0.144645,0.856557,0.822835,0.839357
3,0.096900,0.213938,0.83908,0.862205,0.850485
4,0.036500,0.27514,0.792254,0.885827,0.836431
5,0.023300,0.322551,0.829268,0.80315,0.816
6,0.023300,0.36246,0.84898,0.818898,0.833667
7,0.009300,0.365571,0.848249,0.858268,0.853229
8,0.002600,0.347235,0.825279,0.874016,0.848948
9,0.003000,0.368532,0.848837,0.862205,0.855469
10,0.001400,0.374046,0.827715,0.870079,0.848369


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3571.65 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.77      0.83       315

   micro avg       0.89      0.77      0.83       315
   macro avg       0.89      0.77      0.83       315
weighted avg       0.89      0.77      0.83       315

Precision Score: 0.8905109489051095
Recall Score: 0.7746031746031746
F1 Score: 0.8285229202037352
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 4488.82 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3674.91 examples/s]
  trainer = Trainer(


['B-ASPECT' 'O']
{0: 4.755042290175667, 1: 0.5587538226299694}
Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.157616,0.807377,0.775591,0.791165
2,0.196500,0.144559,0.846473,0.80315,0.824242
3,0.091400,0.2104,0.826613,0.807087,0.816733
4,0.034300,0.264129,0.801444,0.874016,0.836158
5,0.020800,0.353754,0.805447,0.814961,0.810176
6,0.020800,0.408565,0.80315,0.80315,0.80315
7,0.008000,0.401566,0.789668,0.84252,0.815238
8,0.003900,0.411526,0.799257,0.846457,0.82218
9,0.001600,0.426865,0.801556,0.811024,0.806262
10,0.001300,0.42329,0.8,0.834646,0.816956


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3644.21 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.85      0.83      0.84       315

   micro avg       0.85      0.83      0.84       315
   macro avg       0.85      0.83      0.84       315
weighted avg       0.85      0.83      0.84       315

Precision Score: 0.8529411764705882
Recall Score: 0.8285714285714286
F1 Score: 0.8405797101449276
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [5]:
# Epoch sweep for the binary (O / B-ASPECT) ATE model on dbmdz/bert-base-german-cased.
# Both random seeds stay fixed at 42, so the runs differ only in the epoch count.
# GPU used for the recorded run: NVIDIA A30
for epoch in (5, 6, 7, 8, 10, 12):
    # Banner first, so each run's log output below can be told apart.
    print(f'training and results for DBMDZ Bert for {epoch} epochs:')
    ate_model(data, "dbmdz/bert-base-german-cased", epochs=epoch, rn1=42, rn2=42)
    # Blank separator line between consecutive runs.
    print()

training and results for DBMDZ Bert for 5 epochs:


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 6647.05 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6558.37 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140273,0.810484,0.791339,0.800797
2,0.200400,0.145776,0.854772,0.811024,0.832323
3,0.100800,0.215527,0.815385,0.834646,0.824903
4,0.037200,0.242071,0.824,0.811024,0.81746
5,0.021700,0.288855,0.819277,0.80315,0.811133


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5956.08 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.92      0.72      0.81       315

   micro avg       0.92      0.72      0.81       315
   macro avg       0.92      0.72      0.81       315
weighted avg       0.92      0.72      0.81       315

Precision Score: 0.9224489795918367
Recall Score: 0.7174603174603175
F1 Score: 0.807142857142857
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7764.26 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6711.21 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.144022,0.844156,0.767717,0.804124
2,0.201800,0.146437,0.84898,0.818898,0.833667
3,0.096600,0.207554,0.839216,0.84252,0.840864
4,0.040000,0.248792,0.81749,0.846457,0.831721
5,0.016600,0.309181,0.84127,0.834646,0.837945
6,0.016600,0.320726,0.826923,0.846457,0.836576


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5913.51 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.90      0.74      0.81       315

   micro avg       0.90      0.74      0.81       315
   macro avg       0.90      0.74      0.81       315
weighted avg       0.90      0.74      0.81       315

Precision Score: 0.9027237354085603
Recall Score: 0.7365079365079366
F1 Score: 0.8111888111888114
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7502.85 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6273.51 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.141232,0.811475,0.779528,0.795181
2,0.197800,0.148823,0.85124,0.811024,0.830645
3,0.097100,0.214233,0.852,0.838583,0.845238
4,0.035800,0.296147,0.787313,0.830709,0.808429
5,0.022100,0.325585,0.808429,0.830709,0.819417
6,0.022100,0.347618,0.8125,0.818898,0.815686
7,0.009000,0.364398,0.816406,0.822835,0.819608


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5900.87 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.77      0.83       315

   micro avg       0.89      0.77      0.83       315
   macro avg       0.89      0.77      0.83       315
weighted avg       0.89      0.77      0.83       315

Precision Score: 0.8937728937728938
Recall Score: 0.7746031746031746
F1 Score: 0.8299319727891157
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7679.40 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6288.69 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.154012,0.855204,0.744094,0.795789
2,0.201900,0.151102,0.846473,0.80315,0.824242
3,0.096800,0.203308,0.828685,0.818898,0.823762
4,0.042300,0.199001,0.828358,0.874016,0.850575
5,0.020600,0.319545,0.840816,0.811024,0.825651
6,0.020600,0.354978,0.870536,0.767717,0.8159
7,0.005300,0.366934,0.847107,0.807087,0.826613
8,0.003400,0.36751,0.842975,0.80315,0.822581


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5969.37 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.84      0.83      0.84       315

   micro avg       0.84      0.83      0.84       315
   macro avg       0.84      0.83      0.84       315
weighted avg       0.84      0.83      0.84       315

Precision Score: 0.842443729903537
Recall Score: 0.8317460317460318
F1 Score: 0.8370607028753992
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7636.28 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6295.04 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.140087,0.788679,0.822835,0.805395
2,0.201200,0.145959,0.84898,0.818898,0.833667
3,0.096200,0.20943,0.82397,0.866142,0.84453
4,0.038600,0.275153,0.818868,0.854331,0.836224
5,0.022200,0.325375,0.80597,0.850394,0.827586
6,0.022200,0.399062,0.846809,0.783465,0.813906
7,0.007100,0.393504,0.844,0.830709,0.837302
8,0.005400,0.39443,0.828897,0.858268,0.843327
9,0.001200,0.408186,0.834615,0.854331,0.844358
10,0.000600,0.411844,0.827586,0.850394,0.838835


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5825.42 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.78      0.83       315

   micro avg       0.89      0.78      0.83       315
   macro avg       0.89      0.78      0.83       315
weighted avg       0.89      0.78      0.83       315

Precision Score: 0.8884892086330936
Recall Score: 0.7841269841269841
F1 Score: 0.833052276559865
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Label

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


mapping of the data



Map: 100%|██████████| 808/808 [00:00<00:00, 7670.35 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6291.39 examples/s]
  trainer = Trainer(


['O' 'B-ASPECT']
{0: 0.5587538226299694, 1: 4.755042290175667}
Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.156757,0.807377,0.775591,0.791165
2,0.196500,0.149944,0.845188,0.795276,0.819473
3,0.090600,0.217388,0.844444,0.748031,0.793319
4,0.035200,0.278473,0.803774,0.838583,0.820809
5,0.017200,0.329946,0.814394,0.846457,0.830116
6,0.017200,0.386355,0.784906,0.818898,0.801541
7,0.008100,0.424874,0.770609,0.846457,0.806754
8,0.003600,0.431949,0.804511,0.84252,0.823077
9,0.001700,0.446138,0.812977,0.838583,0.825581
10,0.000600,0.467103,0.82996,0.807087,0.818363


mapping the test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5929.25 examples/s]


Unique predicted label IDs: {0, 1}
Expected label IDs: {0, 1}
Classification Report:
              precision    recall  f1-score   support

      ASPECT       0.89      0.78      0.83       315

   micro avg       0.89      0.78      0.83       315
   macro avg       0.89      0.78      0.83       315
weighted avg       0.89      0.78      0.83       315

Precision Score: 0.8916967509025271
Recall Score: 0.7841269841269841
F1 Score: 0.8344594594594594
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens     : ['Insbesondere', 'bei', 'Unikliniken', ',', 'mit', 'anderen', 'Krankheitsbildern', 'haben', 'sie', 'leider', 'ab', 'und', 'zu', 'Probleme', '.']
True Labels: ['O', 'O', 'B-ASPECT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labe

In [6]:
# Epoch sweep for the category-level ATE model (ate_cat_model) on deepset/gbert-base.
# Both random seeds stay fixed at 42, so the runs differ only in the epoch count.
# NOTE: the banner used to read "DBMDZ Bert" (copied from the previous cell) even
# though this cell trains deepset/gbert-base; it now names the model actually used.
# Stored output from before this correction still shows the old label until re-run.
for epoch in [5, 6, 7, 8, 10, 12]:
    print(f'training and results for deepset/gbert-base for {epoch} epochs:')
    ate_cat_model(data, "deepset/gbert-base", rn1=42, rn2=42, epochs=epoch)
    # Blank separator line between consecutive runs.
    print()
# GPU: Tesla V100-PCIE-32GB

training and results for DBMDZ Bert for 5 epochs:


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3828.99 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3368.73 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.225852,0.702326,0.594488,0.643923
2,0.329800,0.189901,0.742616,0.692913,0.716904
3,0.149200,0.215154,0.789474,0.767717,0.778443
4,0.055700,0.226334,0.759843,0.759843,0.759843
5,0.037600,0.231934,0.76378,0.76378,0.76378


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3176.30 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.85      0.95      0.90        55
    Krankenhaus       0.88      0.70      0.78       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.89      0.89      0.89        18
anderer Service       0.88      0.40      0.55        35
 mediz. Service       0.88      0.74      0.80        77

      micro avg       0.87      0.73      0.79       315
      macro avg       0.85      0.73      0.77       315
   weighted avg       0.87      0.73      0.78       315

Precision Score: 0.8679245283018868
Recall Score: 0.7301587301587301
F1 Score: 0.7931034482758621
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3990.33 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3416.63 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.233743,0.682028,0.582677,0.62845
2,0.338700,0.19122,0.818627,0.65748,0.729258
3,0.144800,0.213728,0.754032,0.73622,0.74502
4,0.055200,0.247177,0.759036,0.744094,0.751491
5,0.034200,0.264931,0.723077,0.740157,0.731518
6,0.034200,0.279192,0.729084,0.720472,0.724752


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3184.50 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.95      0.95        55
    Krankenhaus       0.90      0.65      0.76       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.77      0.57      0.66        35
 mediz. Service       0.89      0.73      0.80        77

      micro avg       0.89      0.73      0.80       315
      macro avg       0.87      0.76      0.80       315
   weighted avg       0.89      0.73      0.80       315

Precision Score: 0.8914728682170543
Recall Score: 0.7301587301587301
F1 Score: 0.8027923211169286
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3894.63 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3506.45 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.229213,0.694836,0.582677,0.633833
2,0.329500,0.189755,0.747748,0.653543,0.697479
3,0.139300,0.238579,0.809955,0.704724,0.753684
4,0.056400,0.232594,0.793103,0.724409,0.757202
5,0.033900,0.250524,0.759542,0.783465,0.771318
6,0.033900,0.267027,0.747212,0.791339,0.768642
7,0.012000,0.274806,0.768293,0.744094,0.756


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 2958.43 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.89      0.93      0.91        55
    Krankenhaus       0.90      0.80      0.85       117
       Personal       0.64      0.69      0.67        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.63      0.49      0.55        35
 mediz. Service       0.83      0.78      0.81        77

      micro avg       0.85      0.79      0.82       315
      macro avg       0.82      0.77      0.79       315
   weighted avg       0.85      0.79      0.82       315

Precision Score: 0.852233676975945
Recall Score: 0.7873015873015873
F1 Score: 0.8184818481848185
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3893.48 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3399.14 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.239388,0.709845,0.53937,0.612975
2,0.327300,0.19949,0.76652,0.685039,0.723493
3,0.145400,0.23527,0.762712,0.708661,0.734694
4,0.058300,0.247751,0.75,0.720472,0.73494
5,0.036100,0.27431,0.719557,0.767717,0.742857
6,0.036100,0.288713,0.769231,0.748031,0.758483
7,0.010700,0.302245,0.735632,0.755906,0.745631
8,0.006900,0.300851,0.744,0.732283,0.738095


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3114.79 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.93      0.90        55
    Krankenhaus       0.93      0.68      0.78       117
       Personal       0.90      0.69      0.78        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.85      0.49      0.62        35
 mediz. Service       0.74      0.77      0.75        77

      micro avg       0.85      0.74      0.79       315
      macro avg       0.86      0.75      0.79       315
   weighted avg       0.86      0.74      0.78       315

Precision Score: 0.8498168498168498
Recall Score: 0.7365079365079366
F1 Score: 0.7891156462585033
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3873.72 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3418.23 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226551,0.701357,0.610236,0.652632
2,0.316000,0.184481,0.743697,0.69685,0.719512
3,0.146700,0.205251,0.761194,0.80315,0.781609
4,0.057100,0.24832,0.804979,0.76378,0.783838
5,0.035600,0.263757,0.762774,0.822835,0.791667
6,0.035600,0.265842,0.803213,0.787402,0.795229
7,0.012200,0.288448,0.756654,0.783465,0.769826
8,0.006100,0.305892,0.805785,0.767717,0.78629
9,0.002600,0.295085,0.762452,0.783465,0.772816
10,0.001800,0.290955,0.790698,0.80315,0.796875


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3120.17 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.95      0.95        55
    Krankenhaus       0.90      0.71      0.79       117
       Personal       0.64      0.69      0.67        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.83      0.54      0.66        35
 mediz. Service       0.85      0.74      0.79        77

      micro avg       0.88      0.75      0.81       315
      macro avg       0.86      0.76      0.80       315
   weighted avg       0.88      0.75      0.81       315

Precision Score: 0.8843283582089553
Recall Score: 0.7523809523809524
F1 Score: 0.8130360205831904
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 3907.33 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3294.13 examples/s]
  trainer = Trainer(


['O' 'B-Arzt' 'B-mediz. Service' 'B-Personal' 'B-Pflegepersonal'
 'B-anderer Service' 'B-Krankenhaus']
{0: 0.1596439493228484, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 18.317042606516292, 4: 14.014381591562799, 5: 10.546176046176047, 6: 5.983217355710193}
Training deepset/gbert-base for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.212552,0.793103,0.633858,0.704595
2,0.324100,0.199718,0.798122,0.669291,0.728051
3,0.147800,0.208771,0.785124,0.748031,0.766129
4,0.063800,0.260315,0.826291,0.692913,0.753747
5,0.039000,0.279518,0.751938,0.76378,0.757812
6,0.039000,0.274944,0.782258,0.76378,0.772908
7,0.017500,0.331395,0.768,0.755906,0.761905
8,0.008000,0.366493,0.748092,0.771654,0.75969
9,0.003100,0.384498,0.767932,0.716535,0.741344
10,0.002300,0.356591,0.760148,0.811024,0.784762


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 2652.07 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      0.95      0.94        55
    Krankenhaus       0.88      0.70      0.78       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.75      0.51      0.61        35
 mediz. Service       0.73      0.75      0.74        77

      micro avg       0.83      0.75      0.79       315
      macro avg       0.82      0.76      0.78       315
   weighted avg       0.83      0.75      0.78       315

Precision Score: 0.8309859154929577
Recall Score: 0.7492063492063492
F1 Score: 0.7879799666110183
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [5]:
# Epoch sweep for deepset/gbert-base through ate_cat_model: one complete
# training run plus test evaluation per epoch budget. rn1 and rn2 stay fixed at
# 42 (the run log reports them as "random seeds 42, 42").
# GPU: NVIDIA GeForce RTX 2080 Ti
for epoch in (5, 6, 7, 8, 10, 12):
    # Header line so each run can be located in the long cell output.
    print(f'training and results for gBert for {epoch} epochs:')
    ate_cat_model(
        data,
        "deepset/gbert-base",
        rn1=42,
        rn2=42,
        epochs=epoch,
    )
    # Blank separator between consecutive runs.
    print()

training and results for gBert for 5 epochs:


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4318.33 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3759.90 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.246846,0.707692,0.543307,0.614699
2,0.327300,0.212871,0.724444,0.641732,0.680585
3,0.140400,0.210702,0.777778,0.716535,0.745902
4,0.057200,0.257154,0.786325,0.724409,0.754098
5,0.039900,0.25016,0.768627,0.771654,0.770138


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3609.62 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.93      0.90        55
    Krankenhaus       0.83      0.77      0.80       117
       Personal       0.73      0.62      0.67        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.73      0.46      0.56        35
 mediz. Service       0.83      0.78      0.81        77

      micro avg       0.83      0.77      0.80       315
      macro avg       0.81      0.75      0.77       315
   weighted avg       0.83      0.77      0.79       315

Precision Score: 0.8316151202749141
Recall Score: 0.7682539682539683
F1 Score: 0.7986798679867988
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4504.47 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3601.49 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.231296,0.669767,0.566929,0.614072
2,0.342800,0.193988,0.786802,0.610236,0.687361
3,0.153400,0.217835,0.764228,0.740157,0.752
4,0.061300,0.230028,0.780488,0.755906,0.768
5,0.038200,0.261165,0.75,0.791339,0.770115
6,0.038200,0.272857,0.752941,0.755906,0.75442


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3512.04 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.83      0.95      0.88        55
    Krankenhaus       0.90      0.75      0.82       117
       Personal       0.55      0.46      0.50        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.60      0.43      0.50        35
 mediz. Service       0.81      0.75      0.78        77

      micro avg       0.82      0.75      0.78       315
      macro avg       0.75      0.71      0.73       315
   weighted avg       0.81      0.75      0.78       315

Precision Score: 0.8166089965397924
Recall Score: 0.7492063492063492
F1 Score: 0.7814569536423841
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4462.18 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3681.68 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.23276,0.683486,0.586614,0.631356
2,0.329200,0.197348,0.740909,0.641732,0.687764
3,0.140900,0.252614,0.8,0.692913,0.742616
4,0.056700,0.269615,0.790179,0.69685,0.740586
5,0.034600,0.252002,0.767068,0.751969,0.759443
6,0.034600,0.280947,0.747036,0.744094,0.745562
7,0.011600,0.280761,0.768293,0.744094,0.756


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3628.72 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.92      0.84      0.88       117
       Personal       0.60      0.69      0.64        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.64      0.51      0.57        35
 mediz. Service       0.87      0.81      0.84        77

      micro avg       0.86      0.81      0.84       315
      macro avg       0.82      0.79      0.80       315
   weighted avg       0.86      0.81      0.83       315

Precision Score: 0.8619528619528619
Recall Score: 0.8126984126984127
F1 Score: 0.8366013071895425
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4475.93 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3662.13 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.239389,0.709845,0.53937,0.612975
2,0.327300,0.199491,0.76652,0.685039,0.723493
3,0.145400,0.23549,0.760684,0.700787,0.729508
4,0.058500,0.248969,0.75,0.720472,0.73494
5,0.036100,0.273916,0.728625,0.771654,0.749522
6,0.036100,0.286386,0.772,0.759843,0.765873
7,0.010300,0.304171,0.735632,0.755906,0.745631
8,0.006900,0.30404,0.749004,0.740157,0.744554


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3556.06 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.93      0.90        55
    Krankenhaus       0.93      0.67      0.78       117
       Personal       0.90      0.69      0.78        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.86      0.51      0.64        35
 mediz. Service       0.74      0.77      0.75        77

      micro avg       0.85      0.74      0.79       315
      macro avg       0.86      0.75      0.79       315
   weighted avg       0.86      0.74      0.78       315

Precision Score: 0.8498168498168498
Recall Score: 0.7365079365079366
F1 Score: 0.7891156462585033
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4433.42 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3616.00 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226547,0.701357,0.610236,0.652632
2,0.316000,0.184469,0.743697,0.69685,0.719512
3,0.146700,0.205209,0.7603,0.799213,0.779271
4,0.057000,0.248834,0.804979,0.76378,0.783838
5,0.035600,0.264401,0.76,0.822835,0.79017
6,0.035600,0.265459,0.803213,0.787402,0.795229
7,0.012100,0.287513,0.761364,0.791339,0.776062
8,0.006100,0.303159,0.805668,0.783465,0.794411
9,0.002700,0.297265,0.761538,0.779528,0.770428
10,0.001800,0.292881,0.78626,0.811024,0.79845


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3587.73 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.95      0.95        55
    Krankenhaus       0.90      0.71      0.79       117
       Personal       0.64      0.69      0.67        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.83      0.54      0.66        35
 mediz. Service       0.85      0.75      0.80        77

      micro avg       0.88      0.76      0.82       315
      macro avg       0.86      0.76      0.81       315
   weighted avg       0.88      0.76      0.81       315

Precision Score: 0.8847583643122676
Recall Score: 0.7555555555555555
F1 Score: 0.8150684931506849
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 4453.10 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3727.48 examples/s]
  trainer = Trainer(


['B-Arzt' 'B-mediz. Service' 'B-anderer Service' 'O' 'B-Pflegepersonal'
 'B-Personal' 'B-Krankenhaus']
{0: 6.8239962651727355, 1: 4.959959280624364, 2: 10.546176046176047, 3: 0.1596439493228484, 4: 14.014381591562799, 5: 18.317042606516292, 6: 5.983217355710193}
Training deepset/gbert-base for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.212973,0.793103,0.633858,0.704595
2,0.324000,0.199952,0.798122,0.669291,0.728051
3,0.147200,0.198577,0.787755,0.759843,0.773547
4,0.070400,0.261719,0.733906,0.673228,0.702259
5,0.039500,0.300803,0.753086,0.720472,0.736419
6,0.039500,0.311564,0.802632,0.720472,0.759336
7,0.013800,0.316394,0.770992,0.795276,0.782946
8,0.007500,0.327303,0.75,0.755906,0.752941
9,0.003200,0.363462,0.755102,0.728346,0.741483
10,0.002700,0.33089,0.776515,0.807087,0.791506


Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3555.20 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      0.96      0.95        55
    Krankenhaus       0.86      0.74      0.80       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.71      0.57      0.63        35
 mediz. Service       0.76      0.75      0.76        77

      micro avg       0.83      0.77      0.80       315
      macro avg       0.81      0.78      0.79       315
   weighted avg       0.83      0.77      0.80       315

Precision Score: 0.8299319727891157
Recall Score: 0.7746031746031746
F1 Score: 0.8013136288998358
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [6]:
# Same epoch sweep for deepset/gbert-base through ate_cat_model, this time with
# save=True: the recorded output of this cell reports the best model and its
# tokenizer being written under ./saved_models/ and ./saved_tokenizers/.
# rn1 and rn2 stay fixed at 42 (logged as "random seeds 42, 42").
# GPU: NVIDIA A30
for epoch in (5, 6, 7, 8, 10, 12):
    # Header line so each run can be located in the long cell output.
    print(f'training and results for gBert for {epoch} epochs:')
    ate_cat_model(
        data,
        "deepset/gbert-base",
        rn1=42,
        rn2=42,
        epochs=epoch,
        save=True,
    )
    # Blank separator between consecutive runs.
    print()

training and results for gBert for 5 epochs:


Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 5764.09 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6581.29 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}
Training deepset/gbert-base for 5 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.234593,0.74,0.582677,0.651982
2,0.311400,0.185264,0.763948,0.700787,0.731006
3,0.138500,0.197016,0.770213,0.712598,0.740286
4,0.057000,0.234102,0.753138,0.708661,0.730223
5,0.037000,0.238148,0.759494,0.708661,0.733198



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_5

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_5
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6029.36 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.96      0.92        55
    Krankenhaus       0.89      0.64      0.75       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       1.00      0.83      0.91        18
anderer Service       0.86      0.51      0.64        35
 mediz. Service       0.91      0.75      0.82        77

      micro avg       0.89      0.72      0.80       315
      macro avg       0.87      0.73      0.79       315
   weighted avg       0.89      0.72      0.79       315

Precision Score: 0.8871595330739299
Recall Score: 0.7238095238095238
F1 Score: 0.7972027972027971
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7269.30 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6481.21 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}
Training deepset/gbert-base for 6 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.242891,0.691943,0.574803,0.627957
2,0.339200,0.194512,0.78744,0.641732,0.707158
3,0.147200,0.238592,0.790179,0.69685,0.740586
4,0.055800,0.260866,0.768908,0.720472,0.743902
5,0.035600,0.279168,0.740157,0.740157,0.740157
6,0.035600,0.286898,0.744,0.732283,0.738095



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_6

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_6
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5026.54 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.94      0.89      0.92        55
    Krankenhaus       0.90      0.62      0.74       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.94      0.94      0.94        18
anderer Service       0.83      0.43      0.57        35
 mediz. Service       0.90      0.73      0.81        77

      micro avg       0.90      0.70      0.78       315
      macro avg       0.88      0.72      0.78       315
   weighted avg       0.90      0.70      0.78       315

Precision Score: 0.9012345679012346
Recall Score: 0.6952380952380952
F1 Score: 0.7849462365591398
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7561.77 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6416.22 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.232705,0.687204,0.570866,0.623656
2,0.329600,0.197159,0.751131,0.653543,0.698947
3,0.139900,0.2433,0.81106,0.692913,0.747346
4,0.056300,0.247694,0.757322,0.712598,0.73428
5,0.032800,0.260235,0.752896,0.767717,0.760234
6,0.032800,0.267127,0.746154,0.76378,0.754864
7,0.010600,0.277248,0.770161,0.751969,0.760956



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_7
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5383.88 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.94      0.80      0.87       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.57      0.37      0.45        35
 mediz. Service       0.85      0.78      0.81        77

      micro avg       0.87      0.78      0.82       315
      macro avg       0.83      0.76      0.79       315
   weighted avg       0.86      0.78      0.81       315

Precision Score: 0.8657243816254417
Recall Score: 0.7777777777777778
F1 Score: 0.8193979933110369
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7234.94 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6367.42 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}
Training deepset/gbert-base for 8 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.239388,0.709845,0.53937,0.612975
2,0.327300,0.19949,0.76652,0.685039,0.723493
3,0.145400,0.235501,0.759494,0.708661,0.733198
4,0.058300,0.247387,0.75,0.720472,0.73494
5,0.036500,0.273745,0.718978,0.775591,0.746212
6,0.036500,0.281704,0.774194,0.755906,0.76494
7,0.011000,0.296285,0.723485,0.751969,0.737452
8,0.006900,0.295873,0.742063,0.73622,0.73913



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_8

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_8
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5962.97 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.91      0.89        55
    Krankenhaus       0.92      0.67      0.77       117
       Personal       0.90      0.69      0.78        13
 Pflegepersonal       0.85      0.94      0.89        18
anderer Service       0.89      0.49      0.63        35
 mediz. Service       0.74      0.77      0.75        77

      micro avg       0.85      0.73      0.78       315
      macro avg       0.86      0.74      0.79       315
   weighted avg       0.86      0.73      0.78       315

Precision Score: 0.8487084870848709
Recall Score: 0.7301587301587301
F1 Score: 0.7849829351535836
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7385.63 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 5776.25 examples/s]


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}


  trainer = Trainer(


Training deepset/gbert-base for 10 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.226547,0.701357,0.610236,0.652632
2,0.316000,0.184468,0.743697,0.69685,0.719512
3,0.146700,0.205205,0.7603,0.799213,0.779271
4,0.057000,0.248863,0.804979,0.76378,0.783838
5,0.035600,0.264433,0.76,0.822835,0.79017
6,0.035600,0.265452,0.803213,0.787402,0.795229
7,0.012100,0.287487,0.761364,0.791339,0.776062
8,0.006100,0.302971,0.805668,0.783465,0.794411
9,0.002700,0.297659,0.764479,0.779528,0.77193
10,0.001800,0.292972,0.78626,0.811024,0.79845



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_10

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_10
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6008.78 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.95      0.95        55
    Krankenhaus       0.90      0.72      0.80       117
       Personal       0.64      0.69      0.67        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.83      0.54      0.66        35
 mediz. Service       0.85      0.75      0.80        77

      micro avg       0.89      0.76      0.82       315
      macro avg       0.86      0.77      0.81       315
   weighted avg       0.88      0.76      0.81       315

Precision Score: 0.8851851851851852
Recall Score: 0.7587301587301587
F1 Score: 0.8170940170940171
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7536.98 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6369.82 examples/s]
  trainer = Trainer(


['B-Krankenhaus' 'B-Personal' 'B-mediz. Service' 'B-Pflegepersonal'
 'B-Arzt' 'B-anderer Service' 'O']
{0: 5.983217355710193, 1: 18.317042606516292, 2: 4.959959280624364, 3: 14.014381591562799, 4: 6.8239962651727355, 5: 10.546176046176047, 6: 0.1596439493228484}
Training deepset/gbert-base for 12 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.212159,0.793103,0.633858,0.704595
2,0.324000,0.200212,0.802817,0.673228,0.732334
3,0.147300,0.207513,0.792531,0.751969,0.771717
4,0.063300,0.249735,0.827273,0.716535,0.767932
5,0.037100,0.268996,0.729008,0.751969,0.74031
6,0.037100,0.275405,0.797571,0.775591,0.786427
7,0.017000,0.302324,0.769841,0.76378,0.766798
8,0.007500,0.305909,0.755474,0.814961,0.784091
9,0.004700,0.317125,0.732342,0.775591,0.753346
10,0.001300,0.313503,0.774908,0.826772,0.8



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_12

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_12
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 5884.48 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.95      0.95      0.95        55
    Krankenhaus       0.90      0.70      0.79       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       0.81      0.94      0.87        18
anderer Service       0.78      0.51      0.62        35
 mediz. Service       0.74      0.83      0.78        77

      micro avg       0.84      0.77      0.80       315
      macro avg       0.82      0.77      0.79       315
   weighted avg       0.84      0.77      0.80       315

Precision Score: 0.8373702422145328
Recall Score: 0.7682539682539683
F1 Score: 0.8013245033112584
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [7]:
# v1: single 7-epoch fine-tuning run of deepset/gbert-base with both random
# seeds fixed to 42; the best model and its tokenizer are saved.
# NOTE(review): the logged save path for this call is
# ./saved_models/ate_cat_deepset_gbert-base_42_42_7, which is the same path the
# 7-epoch run of the preceding epoch sweep reports -- this run presumably
# overwrites that checkpoint; confirm which model is actually on disk.
ate_cat_model(
    data,
    "deepset/gbert-base",
    rn1=42,
    rn2=42,
    epochs=7,
    save=True,
)

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 5087.50 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4241.46 examples/s]
  trainer = Trainer(


['B-Personal' 'O' 'B-Arzt' 'B-Krankenhaus' 'B-mediz. Service'
 'B-Pflegepersonal' 'B-anderer Service']
{0: 18.317042606516292, 1: 0.1596439493228484, 2: 6.8239962651727355, 3: 5.983217355710193, 4: 4.959959280624364, 5: 14.014381591562799, 6: 10.546176046176047}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.23644,0.702326,0.594488,0.643923
2,0.316500,0.20887,0.790698,0.669291,0.724947
3,0.139500,0.213957,0.783898,0.728346,0.755102
4,0.055400,0.239321,0.742188,0.748031,0.745098
5,0.032200,0.283362,0.744275,0.767717,0.755814
6,0.032200,0.27922,0.768627,0.771654,0.770138
7,0.011300,0.297911,0.780083,0.740157,0.759596



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_7
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3922.93 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.87      0.95      0.90        55
    Krankenhaus       0.91      0.73      0.81       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.76      0.54      0.63        35
 mediz. Service       0.83      0.70      0.76        77

      micro avg       0.86      0.75      0.80       315
      macro avg       0.84      0.76      0.80       315
   weighted avg       0.86      0.75      0.80       315

Precision Score: 0.8644688644688645
Recall Score: 0.7492063492063492
F1 Score: 0.8027210884353742
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [5]:
# v2: repeat of the 7-epoch deepset/gbert-base run with unchanged arguments
# (both seeds 42, save enabled).
# NOTE(review): although the seeds are identical, the label order printed by
# this run starts with 'O', whereas the v1 output starts with 'B-Personal',
# and the reported test F1 differs as well. The label-to-id assignment inside
# ate_cat_model appears not to be deterministic across kernel sessions
# (possibly built from an unordered set) -- verify before comparing runs.
ate_cat_model(
    data,
    "deepset/gbert-base",
    rn1=42,
    rn2=42,
    epochs=7,
    save=True,
)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 2457.18 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 3790.96 examples/s]
  trainer = Trainer(


['O' 'B-Krankenhaus' 'B-Personal' 'B-Pflegepersonal' 'B-mediz. Service'
 'B-Arzt' 'B-anderer Service']
{0: 0.1596439493228484, 1: 5.983217355710193, 2: 18.317042606516292, 3: 14.014381591562799, 4: 4.959959280624364, 5: 6.8239962651727355, 6: 10.546176046176047}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.224845,0.727273,0.629921,0.675105
2,0.320800,0.215142,0.766816,0.673228,0.716981
3,0.145000,0.240616,0.76834,0.783465,0.775828
4,0.058600,0.298305,0.764192,0.688976,0.724638
5,0.033100,0.30455,0.732394,0.818898,0.773234
6,0.033100,0.298057,0.755474,0.814961,0.784091
7,0.012300,0.310792,0.754941,0.751969,0.753452



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_7
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3455.84 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.88      0.91      0.89        55
    Krankenhaus       0.91      0.73      0.81       117
       Personal       0.69      0.69      0.69        13
 Pflegepersonal       0.89      0.94      0.92        18
anderer Service       0.81      0.63      0.71        35
 mediz. Service       0.76      0.78      0.77        77

      micro avg       0.84      0.77      0.81       315
      macro avg       0.83      0.78      0.80       315
   weighted avg       0.85      0.77      0.80       315

Precision Score: 0.84375
Recall Score: 0.7714285714285715
F1 Score: 0.8059701492537314
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O', 'O', 'O'

In [5]:
# v3: third execution of the same configuration (deepset/gbert-base, 7 epochs,
# rn1 = rn2 = 42, save enabled).
# NOTE(review): this cell carries the execution count In [5], the same count
# as the neighbouring repeat cells, so each repeat was presumably executed in
# a freshly restarted kernel; the notebook does not reproduce these outputs
# with a single Restart & Run All -- confirm the intended workflow.
ate_cat_model(
    data,
    "deepset/gbert-base",
    rn1=42,
    rn2=42,
    epochs=7,
    save=True,
)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 7324.76 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 6698.15 examples/s]
  trainer = Trainer(


['B-Personal' 'B-Arzt' 'B-mediz. Service' 'O' 'B-Krankenhaus'
 'B-Pflegepersonal' 'B-anderer Service']
{0: 18.317042606516292, 1: 6.8239962651727355, 2: 4.959959280624364, 3: 0.1596439493228484, 4: 5.983217355710193, 5: 14.014381591562799, 6: 10.546176046176047}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.220122,0.761468,0.653543,0.70339
2,0.334400,0.203986,0.747863,0.688976,0.717213
3,0.145100,0.242743,0.769547,0.73622,0.752515
4,0.061600,0.275232,0.800905,0.69685,0.745263
5,0.035900,0.290093,0.741573,0.779528,0.760077
6,0.035900,0.272546,0.792683,0.767717,0.78
7,0.014700,0.277809,0.768924,0.759843,0.764356



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_7
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 6173.43 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.91      0.96      0.94        55
    Krankenhaus       0.94      0.70      0.80       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       1.00      0.94      0.97        18
anderer Service       0.88      0.43      0.58        35
 mediz. Service       0.86      0.74      0.80        77

      micro avg       0.91      0.74      0.81       315
      macro avg       0.89      0.75      0.80       315
   weighted avg       0.91      0.74      0.81       315

Precision Score: 0.9066147859922179
Recall Score: 0.7396825396825397
F1 Score: 0.8146853146853148
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O

In [5]:
# v4: fourth execution of the same configuration (deepset/gbert-base, 7 epochs,
# rn1 = rn2 = 42, save enabled).
# NOTE(review): the output reports the save location
# ./saved_models/ate_cat_deepset_gbert-base_42_42_7, a path that does not
# encode the run version, so the artefact left on disk presumably belongs to
# whichever repeat was executed last -- confirm, or save each repeat under a
# distinct name if all of them are needed.
ate_cat_model(
    data,
    "deepset/gbert-base",
    rn1=42,
    rn2=42,
    epochs=7,
    save=True,
)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Mapping the data


Map: 100%|██████████| 808/808 [00:00<00:00, 2590.64 examples/s]
Map: 100%|██████████| 101/101 [00:00<00:00, 4461.04 examples/s]
  trainer = Trainer(


['B-anderer Service' 'B-Krankenhaus' 'B-Arzt' 'B-mediz. Service'
 'B-Pflegepersonal' 'O' 'B-Personal']
{0: 10.546176046176047, 1: 5.983217355710193, 2: 6.8239962651727355, 3: 4.959959280624364, 4: 14.014381591562799, 5: 0.1596439493228484, 6: 18.317042606516292}
Training deepset/gbert-base for 7 epochs with random seeds 42, 42



Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.244711,0.721951,0.582677,0.64488
2,0.328600,0.19473,0.764706,0.716535,0.739837
3,0.142800,0.222282,0.741935,0.724409,0.733068
4,0.056000,0.225319,0.802372,0.799213,0.800789
5,0.033500,0.262696,0.739623,0.771654,0.755299
6,0.033500,0.279393,0.759542,0.783465,0.771318
7,0.011700,0.301292,0.74031,0.751969,0.746094



Best Model saved at: ./saved_models/ate_cat_deepset_gbert-base_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/ate_cat_deepset_gbert-base_42_42_7
Evaluating on test data



Map: 100%|██████████| 102/102 [00:00<00:00, 3829.66 examples/s]


Unique predicted label IDs: {0, 2, 3, 4, 5, 6, 7}
Expected label IDs: {0, 1, 2, 3, 4, 5, 6, 7}
Classification Report:
                 precision    recall  f1-score   support

           Arzt       0.93      0.91      0.92        55
    Krankenhaus       0.92      0.77      0.84       117
       Personal       0.75      0.69      0.72        13
 Pflegepersonal       1.00      0.83      0.91        18
anderer Service       0.60      0.43      0.50        35
 mediz. Service       0.81      0.81      0.81        77

      micro avg       0.86      0.77      0.81       315
      macro avg       0.83      0.74      0.78       315
   weighted avg       0.85      0.77      0.81       315

Precision Score: 0.8576512455516014
Recall Score: 0.765079365079365
F1 Score: 0.8087248322147651
Tokens     : ['Nun', 'sind', '3', 'Jahre', 'seit', 'der', 'Operation', 'vergangen', 'und', 'es', 'mir', 'gut', '.']
True Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Pred Labels: ['O'