## Aspect-Based Sentiment Analysis (ABSA): Training and Fine-Tuning of Large Language Models on German Hospital Reviews

In [1]:
import pandas as pd
import numpy as np
import torch

# need the sys package to load modules from another directory:
import sys
sys.path.append('../')
from functions.absa_model_train import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report the CUDA runtime that torch sees before any training is started.
print("Is CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)

# Only query the device name when a GPU is actually usable; otherwise print
# the placeholder text instead.
if torch.cuda.is_available():
    print("GPU device name:", torch.cuda.get_device_name(0))
else:
    print("GPU device name:", "No GPU")

Is CUDA available: True
CUDA version: 12.6
GPU device name: NVIDIA A30


In [3]:
# Read the two ABSA label files into separate DataFrames:
#   data_ano - the "_ano" variant of the labels (presumably the anonymised
#              reviews - TODO confirm)
#   data     - the labels file without the "_ano" suffix
data_ano = pd.read_csv(
    filepath_or_buffer="./data/hospitalABSA/patient_review_labels_absa_ano.csv"
)
data = pd.read_csv(
    filepath_or_buffer="./data/hospitalABSA/patient_review_labels_absa.csv"
)

In [4]:
# Hugging Face checkpoint identifiers of the candidate models, one per line.
models = [
    "google-bert/bert-base-german-cased",
    "dbmdz/bert-base-german-cased",
    "dbmdz/bert-base-german-uncased",
    "FacebookAI/xlm-roberta-base",
    "TUM/GottBERT_base_best",
    "TUM/GottBERT_filtered_base_best",
    "TUM/GottBERT_base_last",
    "distilbert/distilbert-base-german-cased",
    "GerMedBERT/medbert-512",
    "deepset/gbert-base",
]
# Currently left out of the list: "aari1995/German_Sentiment"

### Train the ABSA model with a new training/validation/test split

Train for 5, 6, 7, 8, 10, 12, and 20 epochs.

Performance of DBMDZ BERT and of the best model, German Sentiment BERT:

DBMDZ BERT:

In [5]:
# Epoch sweep for DBMDZ BERT: call absa_model once per epoch budget while
# keeping both random seeds (rn1, rn2) fixed at 42, so the epoch count is the
# only argument that changes between runs.
# The output recorded below this cell was produced on: Tesla V100-PCIE-32GB
epoch_budgets = (5, 6, 7, 8, 10, 12, 20)
for epoch in epoch_budgets:
    print(f"training and results for DBMDZ Bert for {epoch} epochs:")
    absa_model(data, "dbmdz/bert-base-german-cased", rn1=42, rn2=42, epochs=epoch)
    print()  # blank line separating consecutive runs in the output

training and results for DBMDZ Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 1891.67 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1982.96 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1919.84 examples/s]


Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8775,0.823585,0.821705,0.834985,0.821705,0.819118,"{0: 54, 1: 19, 2: 56}"
2,0.5226,0.698223,0.821705,0.828427,0.821705,0.823069,"{0: 48, 1: 26, 2: 55}"
3,0.4583,0.923424,0.829457,0.838143,0.829457,0.829963,"{0: 50, 1: 22, 2: 57}"
4,0.2598,0.739634,0.844961,0.851961,0.844961,0.847079,"{0: 47, 1: 26, 2: 56}"
5,0.2097,0.715512,0.852713,0.857065,0.852713,0.85429,"{0: 45, 1: 27, 2: 57}"


Evaluation results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.2322924137115479, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8236116824352119, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7946308724832215, 'eval_class_distribution': {0: 44, 1: 44, 2: 65}, 'eval_runtime': 2.0986, 'eval_samples_per_second': 72.905, 'eval_steps_per_second': 36.691, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.89      0.79        36
     Neutral       0.63      0.79      0.70        33
     Positiv       0.94      0.75      0.83        84

    accuracy                           0.79       153
   macro avg       0.76      0.81      0.78       153
weighted avg       0.82      0.79      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 41, 2: 67}
Negativ Precision Score: 0.7111111111111111
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7901234567901234

Neutral Precision Score: 0.6341463414634146
Neutral Recall Score: 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2186.33 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2160.29 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2258.55 examples/s]


Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8628,0.594286,0.852713,0.86487,0.852713,0.851777,"{0: 53, 1: 21, 2: 55}"
2,0.5718,0.667186,0.852713,0.853387,0.852713,0.851459,"{0: 46, 1: 23, 2: 60}"
3,0.4849,1.062155,0.813953,0.819575,0.813953,0.810662,"{0: 49, 1: 19, 2: 61}"
4,0.2677,0.938772,0.813953,0.816432,0.813953,0.813751,"{0: 47, 1: 24, 2: 58}"
5,0.2461,1.102084,0.790698,0.786002,0.790698,0.787067,"{0: 40, 1: 24, 2: 65}"
6,0.1817,1.120284,0.813953,0.813146,0.813953,0.809584,"{0: 45, 1: 20, 2: 64}"


Evaluation results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 0.9100037813186646, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.8034671979308657, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.775077559259586, 'eval_class_distribution': {0: 51, 1: 36, 2: 66}, 'eval_runtime': 2.0961, 'eval_samples_per_second': 72.992, 'eval_steps_per_second': 36.734, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.94      0.77        36
     Neutral       0.77      0.73      0.75        33
     Positiv       0.93      0.77      0.84        84

    accuracy                           0.80       153
   macro avg       0.79      0.82      0.79       153
weighted avg       0.83      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 52, 1: 31, 2: 70}
Negativ Precision Score: 0.6538461538461539
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.7727272727272727

Neutral Precision Score: 0.7741935483870968
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2099.29 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1926.87 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1967.74 examples/s]


Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9323,0.983021,0.782946,0.80353,0.782946,0.781586,"{0: 57, 1: 19, 2: 53}"
2,0.6214,0.876099,0.782946,0.785036,0.782946,0.783746,"{0: 44, 1: 27, 2: 58}"
3,0.5055,1.122199,0.806202,0.812309,0.806202,0.805519,"{0: 50, 1: 22, 2: 57}"
4,0.3355,1.001165,0.79845,0.81319,0.79845,0.802197,"{0: 39, 1: 35, 2: 55}"
5,0.2598,1.011833,0.821705,0.819909,0.821705,0.819558,"{0: 43, 1: 23, 2: 63}"
6,0.1652,1.000776,0.852713,0.871313,0.852713,0.854823,"{0: 34, 1: 37, 2: 58}"
7,0.1052,1.036825,0.837209,0.839404,0.837209,0.837177,"{0: 38, 1: 30, 2: 61}"


Evaluation results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.352640151977539, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.8236643990181995, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.8039852109512676, 'eval_class_distribution': {0: 34, 1: 46, 2: 73}, 'eval_runtime': 2.087, 'eval_samples_per_second': 73.311, 'eval_steps_per_second': 36.895, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.84      0.72      0.78        36
     Neutral       0.54      0.85      0.66        33
     Positiv       0.91      0.76      0.83        84

    accuracy                           0.77       153
   macro avg       0.76      0.78      0.76       153
weighted avg       0.82      0.77      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 31, 1: 52, 2: 70}
Negativ Precision Score: 0.8387096774193549
Negativ Recall Score: 0.7222222222222222
Negativ F1 Score: 0.7761194029850746

Neutral Precision Score: 0.5384615384615384
Neutral Recall Score: 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2111.18 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1946.44 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1946.96 examples/s]


Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9435,0.974116,0.75969,0.814685,0.75969,0.755048,"{0: 67, 1: 14, 2: 48}"
2,0.5976,0.926697,0.806202,0.805427,0.806202,0.805525,"{0: 43, 1: 25, 2: 61}"
3,0.5134,0.985255,0.821705,0.821201,0.821705,0.81936,"{0: 46, 1: 22, 2: 61}"
4,0.3574,0.971022,0.813953,0.817937,0.813953,0.812766,"{0: 49, 1: 22, 2: 58}"
5,0.3452,0.847421,0.829457,0.833153,0.829457,0.830438,"{0: 46, 1: 26, 2: 57}"
6,0.2568,0.986991,0.844961,0.854681,0.844961,0.847538,"{0: 40, 1: 33, 2: 56}"
7,0.186,0.994476,0.829457,0.833043,0.829457,0.830386,"{0: 46, 1: 26, 2: 57}"
8,0.1296,1.073395,0.837209,0.839252,0.837209,0.837742,"{0: 45, 1: 26, 2: 58}"


Evaluation results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1393948793411255, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8375052294680265, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8174305891025281, 'eval_class_distribution': {0: 34, 1: 46, 2: 73}, 'eval_runtime': 2.1013, 'eval_samples_per_second': 72.813, 'eval_steps_per_second': 36.644, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.91      0.81      0.85        36
     Neutral       0.55      0.85      0.67        33
     Positiv       0.93      0.77      0.84        84

    accuracy                           0.80       153
   macro avg       0.79      0.81      0.79       153
weighted avg       0.84      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 32, 1: 51, 2: 70}
Negativ Precision Score: 0.90625
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8529411764705882

Neutral Precision Score: 0.5490196078431373
Neutral Recall Score: 0.848484848

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2101.58 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1960.61 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1980.77 examples/s]


Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8635,0.890572,0.782946,0.823679,0.782946,0.783298,"{0: 62, 1: 17, 2: 50}"
2,0.5847,0.634543,0.837209,0.836887,0.837209,0.83633,"{0: 43, 1: 24, 2: 62}"
3,0.5203,0.990032,0.837209,0.847914,0.837209,0.834562,"{0: 48, 1: 18, 2: 63}"
4,0.3209,0.961376,0.813953,0.8128,0.813953,0.812597,"{0: 42, 1: 24, 2: 63}"
5,0.261,0.921905,0.837209,0.835239,0.837209,0.835885,"{0: 42, 1: 25, 2: 62}"
6,0.2304,0.878696,0.852713,0.85955,0.852713,0.854463,"{0: 47, 1: 27, 2: 55}"
7,0.1771,0.982737,0.860465,0.859755,0.860465,0.859239,"{0: 45, 1: 24, 2: 60}"
8,0.097,1.107647,0.852713,0.853794,0.852713,0.853163,"{0: 41, 1: 28, 2: 60}"
9,0.0131,1.056028,0.852713,0.860357,0.852713,0.855489,"{0: 41, 1: 31, 2: 57}"
10,0.0044,1.092083,0.860465,0.863673,0.860465,0.861746,"{0: 42, 1: 29, 2: 58}"


Evaluation results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.939442753791809, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7797868624634968, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.7630753353973168, 'eval_class_distribution': {0: 39, 1: 43, 2: 71}, 'eval_runtime': 2.1053, 'eval_samples_per_second': 72.673, 'eval_steps_per_second': 36.574, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.81      0.77        36
     Neutral       0.60      0.79      0.68        33
     Positiv       0.87      0.74      0.80        84

    accuracy                           0.76       153
   macro avg       0.74      0.78      0.75       153
weighted avg       0.78      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 43, 2: 71}
Negativ Precision Score: 0.7435897435897436
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7733333333333333

Neutral Precision Score: 0.6046511627906976
Neutral Recall Score: 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2088.82 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1907.53 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1935.72 examples/s]


Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9362,0.9848,0.782946,0.855766,0.782946,0.773352,"{0: 67, 1: 10, 2: 52}"
2,0.6389,0.790838,0.790698,0.803284,0.790698,0.793488,"{0: 49, 1: 28, 2: 52}"
3,0.5574,0.841592,0.852713,0.860543,0.852713,0.853048,"{0: 50, 1: 23, 2: 56}"
4,0.3571,0.868256,0.813953,0.823449,0.813953,0.814294,"{0: 51, 1: 22, 2: 56}"
5,0.31,0.874951,0.844961,0.85222,0.844961,0.847202,"{0: 44, 1: 30, 2: 55}"
6,0.2375,0.957673,0.844961,0.859221,0.844961,0.84917,"{0: 44, 1: 32, 2: 53}"
7,0.1788,1.045677,0.852713,0.855747,0.852713,0.852683,"{0: 47, 1: 24, 2: 58}"
8,0.104,1.025267,0.852713,0.855119,0.852713,0.853586,"{0: 42, 1: 29, 2: 58}"
9,0.0213,1.102681,0.860465,0.86478,0.860465,0.862101,"{0: 43, 1: 29, 2: 57}"
10,0.0226,1.098436,0.868217,0.87089,0.868217,0.869221,"{0: 42, 1: 29, 2: 58}"


Evaluation results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.8613201379776, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.796696871149207, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.786331173604651, 'eval_class_distribution': {0: 43, 1: 37, 2: 73}, 'eval_runtime': 2.0841, 'eval_samples_per_second': 73.412, 'eval_steps_per_second': 36.946, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.66      0.86      0.75        36
     Neutral       0.66      0.70      0.68        33
     Positiv       0.85      0.71      0.77        84

    accuracy                           0.75       153
   macro avg       0.72      0.76      0.73       153
weighted avg       0.76      0.75      0.75       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 47, 1: 35, 2: 71}
Negativ Precision Score: 0.6595744680851063
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7469879518072289

Neutral Precision Score: 0.6571428571428571
Neutral Recall Score: 0.69

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2106.97 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1963.95 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1971.55 examples/s]


Training results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8693,0.902262,0.790698,0.850511,0.790698,0.792097,"{0: 66, 1: 16, 2: 47}"
2,0.5618,0.889389,0.806202,0.804079,0.806202,0.804389,"{0: 42, 1: 24, 2: 63}"
3,0.5087,1.067636,0.829457,0.829222,0.829457,0.828823,"{0: 39, 1: 28, 2: 62}"
4,0.2932,1.044339,0.821705,0.823492,0.821705,0.822333,"{0: 41, 1: 29, 2: 59}"
5,0.3175,0.764195,0.852713,0.850881,0.852713,0.851104,"{0: 43, 1: 24, 2: 62}"
6,0.2461,1.074237,0.829457,0.852762,0.829457,0.831733,"{0: 33, 1: 39, 2: 57}"
7,0.1702,1.371791,0.790698,0.811466,0.790698,0.792962,"{0: 55, 1: 23, 2: 51}"
8,0.1815,1.225169,0.829457,0.831725,0.829457,0.829173,"{0: 45, 1: 23, 2: 61}"
9,0.0847,1.217201,0.837209,0.856437,0.837209,0.839337,"{0: 54, 1: 24, 2: 51}"
10,0.0406,1.230207,0.852713,0.852112,0.852713,0.852325,"{0: 42, 1: 26, 2: 61}"


Evaluation results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 2.3642523288726807, 'eval_accuracy': 0.7516339869281046, 'eval_precision': 0.7854735205118837, 'eval_recall': 0.7516339869281046, 'eval_f1': 0.7564240310920836, 'eval_class_distribution': {0: 42, 1: 46, 2: 65}, 'eval_runtime': 2.0867, 'eval_samples_per_second': 73.321, 'eval_steps_per_second': 36.9, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.70      0.89      0.78        36
     Neutral       0.64      0.82      0.72        33
     Positiv       0.91      0.70      0.79        84

    accuracy                           0.77       153
   macro avg       0.75      0.80      0.76       153
weighted avg       0.80      0.77      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 42, 2: 65}
Negativ Precision Score: 0.6956521739130435
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7804878048780488

Neutral Precision Score: 0.6428571428571429
Neutral Recall Score: 0

In [5]:
# DBMDZ BERT epoch sweep with both seeds (rn1, rn2) held at 42; absa_model is
# invoked once for every epoch count in the list.
# The output recorded below this cell was produced on: NVIDIA A30
dbmdz_checkpoint = "dbmdz/bert-base-german-cased"
for epoch in [5, 6, 7, 8, 10, 12, 20]:
    banner = f'training and results for DBMDZ Bert for {epoch} epochs:'
    print(banner)
    absa_model(data, dbmdz_checkpoint, rn1=42, rn2=42, epochs=epoch)
    # Empty line so the next run's banner is visually separated.
    print()

training and results for DBMDZ Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3562.93 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3884.12 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3959.36 examples/s]


Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9036,0.778352,0.790698,0.831414,0.790698,0.792857,"{0: 61, 1: 18, 2: 50}"
2,0.5651,0.818578,0.813953,0.813419,0.813953,0.811981,"{0: 41, 1: 23, 2: 65}"
3,0.4782,1.042544,0.837209,0.838417,0.837209,0.835719,"{0: 44, 1: 22, 2: 63}"
4,0.2545,1.001236,0.829457,0.828705,0.829457,0.828876,"{0: 41, 1: 26, 2: 62}"
5,0.2323,1.030233,0.821705,0.822724,0.821705,0.822133,"{0: 42, 1: 28, 2: 59}"


Evaluation results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.1650968790054321, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8107912559201494, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8098780571201213, 'eval_class_distribution': {0: 40, 1: 31, 2: 82}, 'eval_runtime': 2.3602, 'eval_samples_per_second': 64.826, 'eval_steps_per_second': 32.625, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.89      0.81        36
     Neutral       0.72      0.70      0.71        33
     Positiv       0.88      0.82      0.85        84

    accuracy                           0.81       153
   macro avg       0.78      0.80      0.79       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 32, 2: 78}
Negativ Precision Score: 0.7441860465116279
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.810126582278481

Neutral Precision Score: 0.71875
Neutral Recall Score: 0.6969696969

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4068.28 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3896.20 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3872.81 examples/s]


Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8546,0.635352,0.837209,0.854386,0.837209,0.833845,"{0: 53, 1: 17, 2: 59}"
2,0.5834,0.727823,0.837209,0.841625,0.837209,0.832996,"{0: 42, 1: 19, 2: 68}"
3,0.4948,0.812102,0.837209,0.838044,0.837209,0.830533,"{0: 47, 1: 18, 2: 64}"
4,0.3001,0.691616,0.852713,0.851183,0.852713,0.851479,"{0: 40, 1: 26, 2: 63}"
5,0.2264,0.870079,0.844961,0.842247,0.844961,0.842344,"{0: 43, 1: 23, 2: 63}"
6,0.1832,0.87351,0.829457,0.825575,0.829457,0.826311,"{0: 43, 1: 23, 2: 63}"


Evaluation results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.1648941040039062, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8195113600995952, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8120520120713143, 'eval_class_distribution': {0: 42, 1: 36, 2: 75}, 'eval_runtime': 2.3337, 'eval_samples_per_second': 65.561, 'eval_steps_per_second': 32.995, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.86      0.78        36
     Neutral       0.68      0.70      0.69        33
     Positiv       0.88      0.80      0.84        84

    accuracy                           0.79       153
   macro avg       0.76      0.79      0.77       153
weighted avg       0.80      0.79      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 34, 2: 76}
Negativ Precision Score: 0.7209302325581395
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7848101265822784

Neutral Precision Score: 0.6764705882352942
Neutral Recall Score: 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4084.40 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3880.08 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3905.26 examples/s]


Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8976,0.97234,0.767442,0.807671,0.767442,0.769215,"{0: 62, 1: 20, 2: 47}"
2,0.5717,0.593936,0.852713,0.855675,0.852713,0.851629,"{0: 47, 1: 22, 2: 60}"
3,0.5104,0.834662,0.852713,0.859883,0.852713,0.851635,"{0: 50, 1: 21, 2: 58}"
4,0.2985,0.876528,0.821705,0.824993,0.821705,0.822488,"{0: 46, 1: 26, 2: 57}"
5,0.2658,0.93846,0.829457,0.830237,0.829457,0.829625,"{0: 44, 1: 26, 2: 59}"
6,0.2105,1.058202,0.829457,0.849955,0.829457,0.835505,"{0: 41, 1: 35, 2: 53}"
7,0.1692,1.023799,0.829457,0.847585,0.829457,0.834858,"{0: 42, 1: 34, 2: 53}"


Evaluation results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.2077969312667847, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.7781525594416708, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.7699208802303276, 'eval_class_distribution': {0: 47, 1: 28, 2: 78}, 'eval_runtime': 2.3653, 'eval_samples_per_second': 64.685, 'eval_steps_per_second': 32.554, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.92      0.80        36
     Neutral       0.72      0.64      0.68        33
     Positiv       0.88      0.82      0.85        84

    accuracy                           0.80       153
   macro avg       0.78      0.79      0.78       153
weighted avg       0.81      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 29, 2: 78}
Negativ Precision Score: 0.717391304347826
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8048780487804879

Neutral Precision Score: 0.7241379310344828
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4055.90 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3915.83 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3985.72 examples/s]


Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9319,0.966613,0.775194,0.839325,0.775194,0.773871,"{0: 67, 1: 14, 2: 48}"
2,0.6032,1.087493,0.744186,0.74047,0.744186,0.740091,"{0: 37, 1: 25, 2: 67}"
3,0.5406,0.975943,0.813953,0.819287,0.813953,0.814349,"{0: 48, 1: 23, 2: 58}"
4,0.3102,0.860637,0.813953,0.830998,0.813953,0.818067,"{0: 49, 1: 29, 2: 51}"
5,0.3445,0.937159,0.829457,0.843196,0.829457,0.832202,"{0: 50, 1: 27, 2: 52}"
6,0.244,0.865802,0.852713,0.863559,0.852713,0.855679,"{0: 41, 1: 33, 2: 55}"
7,0.1618,0.909843,0.852713,0.857119,0.852713,0.853843,"{0: 46, 1: 27, 2: 56}"
8,0.0696,0.971765,0.852713,0.856015,0.852713,0.852998,"{0: 47, 1: 25, 2: 57}"


Evaluation results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.306235671043396, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8126481361775479, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7965811965811966, 'eval_class_distribution': {0: 39, 1: 42, 2: 72}, 'eval_runtime': 2.2902, 'eval_samples_per_second': 66.806, 'eval_steps_per_second': 33.621, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.76      0.86      0.81        36
     Neutral       0.60      0.79      0.68        33
     Positiv       0.93      0.76      0.84        84

    accuracy                           0.79       153
   macro avg       0.76      0.80      0.78       153
weighted avg       0.82      0.79      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 43, 2: 69}
Negativ Precision Score: 0.7560975609756098
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8051948051948052

Neutral Precision Score: 0.6046511627906976
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4018.45 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3909.77 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3909.12 examples/s]


Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8833,1.014879,0.775194,0.827673,0.775194,0.776978,"{0: 64, 1: 16, 2: 49}"
2,0.5636,0.675081,0.837209,0.83631,0.837209,0.836675,"{0: 42, 1: 26, 2: 61}"
3,0.515,0.938221,0.837209,0.858515,0.837209,0.838281,"{0: 55, 1: 20, 2: 54}"
4,0.2815,0.872746,0.860465,0.85989,0.860465,0.859904,"{0: 40, 1: 27, 2: 62}"
5,0.2645,0.964401,0.829457,0.82746,0.829457,0.82745,"{0: 41, 1: 24, 2: 64}"
6,0.2104,1.042262,0.844961,0.850884,0.844961,0.846525,"{0: 38, 1: 31, 2: 60}"
7,0.1732,0.809677,0.875969,0.877643,0.875969,0.876589,"{0: 43, 1: 28, 2: 58}"
8,0.1016,0.967563,0.844961,0.844921,0.844961,0.844196,"{0: 44, 1: 24, 2: 61}"
9,0.042,1.047411,0.868217,0.874535,0.868217,0.869942,"{0: 46, 1: 28, 2: 55}"
10,0.0161,1.042477,0.860465,0.863313,0.860465,0.861409,"{0: 44, 1: 28, 2: 57}"


Evaluation results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.5276323556900024, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8052272799427928, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.793962209473637, 'eval_class_distribution': {0: 41, 1: 39, 2: 73}, 'eval_runtime': 2.3516, 'eval_samples_per_second': 65.062, 'eval_steps_per_second': 32.744, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.80      0.89      0.84        36
     Neutral       0.65      0.79      0.71        33
     Positiv       0.90      0.79      0.84        84

    accuracy                           0.81       153
   macro avg       0.78      0.82      0.80       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 40, 2: 73}
Negativ Precision Score: 0.8
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8421052631578947

Neutral Precision Score: 0.65
Neutral Recall Score: 0.7878787878787878
Neutral F1

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4084.16 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3848.42 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3938.41 examples/s]


Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9623,0.888917,0.751938,0.794722,0.751938,0.745918,"{0: 63, 1: 13, 2: 53}"
2,0.6434,0.914787,0.782946,0.788805,0.782946,0.784966,"{0: 43, 1: 30, 2: 56}"
3,0.5927,1.112766,0.806202,0.806738,0.806202,0.804479,"{0: 44, 1: 22, 2: 63}"
4,0.2981,0.814169,0.829457,0.842111,0.829457,0.830347,"{0: 52, 1: 25, 2: 52}"
5,0.3566,1.084527,0.806202,0.80847,0.806202,0.800338,"{0: 46, 1: 18, 2: 65}"
6,0.2676,1.316612,0.782946,0.822771,0.782946,0.790897,"{0: 38, 1: 42, 2: 49}"
7,0.2238,1.204969,0.806202,0.81623,0.806202,0.805988,"{0: 52, 1: 22, 2: 55}"
8,0.1848,1.267827,0.813953,0.82687,0.813953,0.817562,"{0: 44, 1: 32, 2: 53}"
9,0.0582,1.354712,0.829457,0.840002,0.829457,0.832159,"{0: 46, 1: 30, 2: 53}"
10,0.0272,1.373917,0.837209,0.846952,0.837209,0.839451,"{0: 47, 1: 29, 2: 53}"


Evaluation results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.7890936136245728, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8093365253077976, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7934078182244728, 'eval_class_distribution': {0: 43, 1: 40, 2: 70}, 'eval_runtime': 2.3293, 'eval_samples_per_second': 65.685, 'eval_steps_per_second': 33.057, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.86      0.74        36
     Neutral       0.67      0.79      0.72        33
     Positiv       0.89      0.70      0.79        84

    accuracy                           0.76       153
   macro avg       0.74      0.78      0.75       153
weighted avg       0.79      0.76      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 48, 1: 39, 2: 66}
Negativ Precision Score: 0.6458333333333334
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7380952380952381

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score:

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3940.85 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3608.71 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3788.98 examples/s]


Training results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8603,0.757781,0.79845,0.823357,0.79845,0.796864,"{0: 58, 1: 18, 2: 53}"
2,0.553,0.96099,0.79845,0.797229,0.79845,0.797506,"{0: 42, 1: 25, 2: 62}"
3,0.5391,1.19297,0.782946,0.787853,0.782946,0.778391,"{0: 47, 1: 18, 2: 64}"
4,0.3083,0.940864,0.806202,0.808154,0.806202,0.805832,"{0: 45, 1: 23, 2: 61}"
5,0.2837,1.060678,0.813953,0.821991,0.813953,0.810088,"{0: 49, 1: 18, 2: 62}"
6,0.2364,1.066856,0.813953,0.820377,0.813953,0.815171,"{0: 48, 1: 26, 2: 55}"
7,0.1828,1.214766,0.829457,0.828295,0.829457,0.828042,"{0: 45, 1: 24, 2: 60}"
8,0.1453,0.952656,0.852713,0.864365,0.852713,0.856193,"{0: 43, 1: 32, 2: 54}"
9,0.0791,1.342412,0.79845,0.820764,0.79845,0.804238,"{0: 38, 1: 37, 2: 54}"
10,0.035,1.510587,0.837209,0.844835,0.837209,0.839445,"{0: 43, 1: 31, 2: 55}"


Evaluation results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 2.3628265857696533, 'eval_accuracy': 0.7647058823529411, 'eval_precision': 0.7896599021166495, 'eval_recall': 0.7647058823529411, 'eval_f1': 0.7677723721067374, 'eval_class_distribution': {0: 48, 1: 37, 2: 68}, 'eval_runtime': 2.3246, 'eval_samples_per_second': 65.817, 'eval_steps_per_second': 33.123, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.94      0.77        36
     Neutral       0.66      0.64      0.65        33
     Positiv       0.93      0.76      0.84        84

    accuracy                           0.78       153
   macro avg       0.75      0.78      0.75       153
weighted avg       0.80      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 52, 1: 32, 2: 69}
Negativ Precision Score: 0.6538461538461539
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.7727272727272727

Neutral Precision Score: 0.65625
Neutral Recall Score: 0.63636363

In [5]:
# Epoch sweep for the DBMDZ cased German BERT checkpoint: absa_model is called
# once per epoch budget, with rn1 and rn2 both fixed at 42.
dbmdz_checkpoint = "dbmdz/bert-base-german-cased"
dbmdz_epoch_grid = (5, 6, 7, 8, 10, 12, 20)
for epoch in dbmdz_epoch_grid:
    # Header line so the printed results of each run can be told apart.
    print(f'training and results for DBMDZ Bert for {epoch} epochs:')
    absa_model(data, dbmdz_checkpoint, rn1=42, rn2=42, epochs=epoch)
    print()  # blank separator line between consecutive runs
# GPU: NVIDIA GeForce RTX 2080 Ti

training and results for DBMDZ Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2048.36 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2150.15 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2137.66 examples/s]


Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,1.0279,0.797409,0.527132,0.736008,0.527132,0.463515,"{0: 1, 1: 63, 2: 65}"
2,0.8365,0.763438,0.806202,0.812771,0.806202,0.805434,"{0: 49, 1: 21, 2: 59}"
3,0.6887,1.076902,0.751938,0.782393,0.751938,0.749415,"{0: 61, 1: 17, 2: 51}"
4,0.4155,1.020229,0.79845,0.807628,0.79845,0.799494,"{0: 49, 1: 28, 2: 52}"
5,0.3619,1.032804,0.806202,0.81698,0.806202,0.808413,"{0: 43, 1: 33, 2: 53}"


Evaluation results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.2022868394851685, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.825676000612062, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.8031415924841531, 'eval_class_distribution': {0: 39, 1: 45, 2: 69}, 'eval_runtime': 2.4474, 'eval_samples_per_second': 62.515, 'eval_steps_per_second': 31.462, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.81      0.77        36
     Neutral       0.57      0.79      0.66        33
     Positiv       0.93      0.75      0.83        84

    accuracy                           0.77       153
   macro avg       0.75      0.78      0.75       153
weighted avg       0.81      0.77      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 46, 2: 68}
Negativ Precision Score: 0.7435897435897436
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7733333333333333

Neutral Precision Score: 0.5652173913043478
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2141.97 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1992.74 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2012.84 examples/s]


Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8523,0.894074,0.782946,0.807729,0.782946,0.780471,"{0: 58, 1: 17, 2: 54}"
2,0.5819,0.67374,0.860465,0.866387,0.860465,0.859211,"{0: 34, 1: 31, 2: 64}"
3,0.483,0.816776,0.852713,0.870152,0.852713,0.849479,"{0: 53, 1: 17, 2: 59}"
4,0.3237,0.732794,0.868217,0.867982,0.868217,0.865988,"{0: 44, 1: 22, 2: 63}"
5,0.2346,0.759032,0.875969,0.876038,0.875969,0.875904,"{0: 43, 1: 26, 2: 60}"
6,0.1066,0.804046,0.868217,0.868725,0.868217,0.867873,"{0: 45, 1: 25, 2: 59}"


Evaluation results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.4877865314483643, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.8130293851958126, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.8001833698428126, 'eval_class_distribution': {0: 40, 1: 41, 2: 72}, 'eval_runtime': 2.4002, 'eval_samples_per_second': 63.745, 'eval_steps_per_second': 32.081, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.86      0.82        36
     Neutral       0.60      0.79      0.68        33
     Positiv       0.91      0.76      0.83        84

    accuracy                           0.79       153
   macro avg       0.76      0.80      0.78       153
weighted avg       0.81      0.79      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 43, 2: 70}
Negativ Precision Score: 0.775
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8157894736842105

Neutral Precision Score: 0.6046511627906976
Neutral Recall Score: 0.78787878787

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2338.88 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2144.20 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2165.62 examples/s]


Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9035,0.894085,0.767442,0.798863,0.767442,0.762026,"{0: 59, 1: 14, 2: 56}"
2,0.574,0.81382,0.806202,0.812537,0.806202,0.806828,"{0: 37, 1: 33, 2: 59}"
3,0.5207,1.017331,0.821705,0.836483,0.821705,0.819235,"{0: 53, 1: 18, 2: 58}"
4,0.2863,0.903589,0.829457,0.839133,0.829457,0.828291,"{0: 51, 1: 20, 2: 58}"
5,0.3057,0.931583,0.844961,0.855982,0.844961,0.8442,"{0: 51, 1: 20, 2: 58}"
6,0.2108,0.917511,0.844961,0.849123,0.844961,0.846566,"{0: 44, 1: 28, 2: 57}"
7,0.1225,0.914326,0.844961,0.84581,0.844961,0.843528,"{0: 47, 1: 23, 2: 59}"


Evaluation results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.4353011846542358, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8103871885442826, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7874475493591642, 'eval_class_distribution': {0: 43, 1: 43, 2: 67}, 'eval_runtime': 2.4388, 'eval_samples_per_second': 62.735, 'eval_steps_per_second': 31.573, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.86      0.82        36
     Neutral       0.57      0.88      0.69        33
     Positiv       0.94      0.69      0.79        84

    accuracy                           0.77       153
   macro avg       0.76      0.81      0.77       153
weighted avg       0.82      0.77      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 51, 2: 62}
Negativ Precision Score: 0.775
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8157894736842105

Neutral Precision Score: 0.5686274509803921
Neutral Recall Score: 0.87878787878

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2255.97 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2119.81 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2131.08 examples/s]


Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.87,0.830425,0.790698,0.833203,0.790698,0.781053,"{0: 62, 1: 12, 2: 55}"
2,0.62,0.724596,0.837209,0.83996,0.837209,0.837449,"{0: 46, 1: 24, 2: 59}"
3,0.4875,0.942663,0.813953,0.820855,0.813953,0.812992,"{0: 50, 1: 21, 2: 58}"
4,0.2757,0.978996,0.813953,0.826129,0.813953,0.815003,"{0: 52, 1: 23, 2: 54}"
5,0.2528,0.914009,0.852713,0.857864,0.852713,0.853246,"{0: 48, 1: 24, 2: 57}"
6,0.1871,1.015353,0.821705,0.834804,0.821705,0.822159,"{0: 33, 1: 35, 2: 61}"
7,0.1233,0.949347,0.852713,0.85359,0.852713,0.851817,"{0: 44, 1: 23, 2: 62}"
8,0.0722,0.957005,0.868217,0.870239,0.868217,0.867583,"{0: 46, 1: 23, 2: 60}"


Evaluation results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.3486802577972412, 'eval_accuracy': 0.8235294117647058, 'eval_precision': 0.8264042459088898, 'eval_recall': 0.8235294117647058, 'eval_f1': 0.8244064386776851, 'eval_class_distribution': {0: 38, 1: 35, 2: 80}, 'eval_runtime': 2.404, 'eval_samples_per_second': 63.644, 'eval_steps_per_second': 32.03, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.81      0.81      0.81        36
     Neutral       0.73      0.82      0.77        33
     Positiv       0.86      0.82      0.84        84

    accuracy                           0.82       153
   macro avg       0.80      0.82      0.81       153
weighted avg       0.82      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 37, 2: 80}
Negativ Precision Score: 0.8055555555555556
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8055555555555556

Neutral Precision Score: 0.7297297297297297
Neutral Recall Score: 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2179.11 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2069.19 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2082.45 examples/s]


Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,1.0294,0.777805,0.689922,0.68988,0.689922,0.682433,"{0: 50, 1: 17, 2: 62}"
2,0.7801,0.962645,0.775194,0.785988,0.775194,0.769714,"{0: 29, 1: 29, 2: 71}"
3,0.6854,1.380004,0.75969,0.765568,0.75969,0.758527,"{0: 45, 1: 20, 2: 64}"
4,0.4271,1.088892,0.806202,0.810779,0.806202,0.80766,"{0: 45, 1: 28, 2: 56}"
5,0.3054,1.286147,0.767442,0.767053,0.767442,0.764576,"{0: 36, 1: 26, 2: 67}"
6,0.2149,1.222044,0.837209,0.839374,0.837209,0.838023,"{0: 41, 1: 29, 2: 59}"
7,0.1274,1.379163,0.79845,0.801088,0.79845,0.798055,"{0: 37, 1: 31, 2: 61}"
8,0.0723,1.507279,0.782946,0.789477,0.782946,0.78254,"{0: 50, 1: 22, 2: 57}"
9,0.0185,1.505304,0.806202,0.811836,0.806202,0.807995,"{0: 41, 1: 31, 2: 57}"
10,0.0093,1.474956,0.821705,0.828657,0.821705,0.823629,"{0: 40, 1: 32, 2: 57}"


Evaluation results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.6584608554840088, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.7888076153767539, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7813976788294946, 'eval_class_distribution': {0: 34, 1: 40, 2: 79}, 'eval_runtime': 2.386, 'eval_samples_per_second': 64.124, 'eval_steps_per_second': 32.271, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.64      0.72        36
     Neutral       0.57      0.64      0.60        33
     Positiv       0.78      0.82      0.80        84

    accuracy                           0.74       153
   macro avg       0.72      0.70      0.71       153
weighted avg       0.75      0.74      0.74       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 28, 1: 37, 2: 88}
Negativ Precision Score: 0.8214285714285714
Negativ Recall Score: 0.6388888888888888
Negativ F1 Score: 0.71875

Neutral Precision Score: 0.5675675675675675
Neutral Recall Score: 0.636363636

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2254.41 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2076.64 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2114.66 examples/s]


Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.99,0.959278,0.767442,0.811902,0.767442,0.766511,"{0: 64, 1: 17, 2: 48}"
2,0.6546,0.703413,0.79845,0.807774,0.79845,0.797899,"{0: 52, 1: 22, 2: 55}"
3,0.5942,0.844083,0.829457,0.841003,0.829457,0.83009,"{0: 52, 1: 23, 2: 54}"
4,0.3291,0.727475,0.852713,0.858804,0.852713,0.853074,"{0: 49, 1: 24, 2: 56}"
5,0.3138,0.889308,0.860465,0.864213,0.860465,0.859374,"{0: 48, 1: 22, 2: 59}"
6,0.255,1.175668,0.813953,0.822991,0.813953,0.810403,"{0: 53, 1: 19, 2: 57}"
7,0.1406,1.36152,0.806202,0.835534,0.806202,0.807326,"{0: 58, 1: 19, 2: 52}"
8,0.0959,1.362457,0.821705,0.832609,0.821705,0.821175,"{0: 52, 1: 21, 2: 56}"
9,0.0533,1.166212,0.860465,0.865208,0.860465,0.860739,"{0: 48, 1: 24, 2: 57}"
10,0.0251,1.241231,0.844961,0.853032,0.844961,0.845382,"{0: 50, 1: 23, 2: 56}"


Evaluation results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 2.1658685207366943, 'eval_accuracy': 0.7647058823529411, 'eval_precision': 0.7879465447839513, 'eval_recall': 0.7647058823529411, 'eval_f1': 0.7687012259949794, 'eval_class_distribution': {0: 41, 1: 43, 2: 69}, 'eval_runtime': 2.3559, 'eval_samples_per_second': 64.942, 'eval_steps_per_second': 32.683, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.81      0.76        36
     Neutral       0.56      0.73      0.63        33
     Positiv       0.89      0.74      0.81        84

    accuracy                           0.75       153
   macro avg       0.72      0.76      0.73       153
weighted avg       0.78      0.75      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 43, 2: 70}
Negativ Precision Score: 0.725
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7631578947368421

Neutral Precision Score: 0.5581395348837209
Neutral Recall Score: 0.7272727272

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2221.59 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2151.10 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2151.86 examples/s]


Training results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.855,0.900918,0.775194,0.825655,0.775194,0.763045,"{0: 65, 1: 11, 2: 53}"
2,0.6191,0.904751,0.806202,0.808376,0.806202,0.803525,"{0: 34, 1: 30, 2: 65}"
3,0.5334,1.084814,0.790698,0.81243,0.790698,0.772799,"{0: 55, 1: 11, 2: 63}"
4,0.3037,0.784985,0.837209,0.855427,0.837209,0.838953,"{0: 54, 1: 23, 2: 52}"
5,0.2601,1.204425,0.821705,0.82827,0.821705,0.824028,"{0: 43, 1: 30, 2: 56}"
6,0.1627,0.989953,0.844961,0.848547,0.844961,0.84367,"{0: 35, 1: 30, 2: 64}"
7,0.1207,1.100035,0.844961,0.857685,0.844961,0.84375,"{0: 53, 1: 20, 2: 56}"
8,0.1043,1.439318,0.813953,0.814016,0.813953,0.812205,"{0: 47, 1: 23, 2: 59}"
9,0.0307,0.966829,0.875969,0.878263,0.875969,0.876631,"{0: 44, 1: 28, 2: 57}"
10,0.0224,1.150619,0.852713,0.858361,0.852713,0.850239,"{0: 49, 1: 20, 2: 60}"


Evaluation results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 2.1362428665161133, 'eval_accuracy': 0.7320261437908496, 'eval_precision': 0.754393659442453, 'eval_recall': 0.7320261437908496, 'eval_f1': 0.736079077632229, 'eval_class_distribution': {0: 43, 1: 41, 2: 69}, 'eval_runtime': 2.4203, 'eval_samples_per_second': 63.216, 'eval_steps_per_second': 31.815, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.75      0.83      0.79        36
     Neutral       0.59      0.73      0.65        33
     Positiv       0.88      0.75      0.81        84

    accuracy                           0.76       153
   macro avg       0.74      0.77      0.75       153
weighted avg       0.78      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 41, 2: 72}
Negativ Precision Score: 0.75
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7894736842105263

Neutral Precision Score: 0.5853658536585366
Neutral Recall Score: 0.7272727272727

Best model: German Sentiment Bert

In [7]:
# Short epoch sweep (5 to 8 epochs) for the German Sentiment BERT checkpoint:
# absa_model is called once per epoch budget, with rn1 and rn2 both fixed at 42.
german_sentiment_checkpoint = "aari1995/German_Sentiment"
short_epoch_grid = (5, 6, 7, 8)
for epoch in short_epoch_grid:
    # Header line so the printed results of each run can be told apart.
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(data, german_sentiment_checkpoint, rn1=42, rn2=42, epochs=epoch)
    print()  # blank separator line between consecutive runs
# GPU: Tesla V100-PCIE-32GB

training and results for German Sentiment Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 2112.98 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1964.34 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1921.75 examples/s]


Training results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8809,0.688051,0.875969,0.879576,0.875969,0.875312,"{0: 48, 1: 23, 2: 58}"
2,0.55,0.635333,0.860465,0.865495,0.860465,0.861826,"{0: 39, 1: 31, 2: 59}"
3,0.4297,0.908124,0.852713,0.863738,0.852713,0.856287,"{0: 45, 1: 30, 2: 54}"
4,0.2409,0.824012,0.875969,0.87696,0.875969,0.875282,"{0: 46, 1: 24, 2: 59}"
5,0.2111,0.964227,0.860465,0.863238,0.860465,0.86137,"{0: 44, 1: 28, 2: 57}"


Evaluation results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.9809753894805908, 'eval_accuracy': 0.8300653594771242, 'eval_precision': 0.8281968677625521, 'eval_recall': 0.8300653594771242, 'eval_f1': 0.828152955398613, 'eval_class_distribution': {0: 39, 1: 29, 2: 85}, 'eval_runtime': 5.8332, 'eval_samples_per_second': 26.229, 'eval_steps_per_second': 13.2, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.92      0.87        36
     Neutral       0.76      0.67      0.71        33
     Positiv       0.87      0.87      0.87        84

    accuracy                           0.84       153
   macro avg       0.82      0.82      0.82       153
weighted avg       0.83      0.84      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 29, 2: 84}
Negativ Precision Score: 0.825
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.868421052631579

Neutral Precision Score: 0.7586206896551724
Neutral Recall Score: 0.666666666666666

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2079.61 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1912.32 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1960.85 examples/s]


Training results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8563,0.737111,0.844961,0.860879,0.844961,0.840824,"{0: 54, 1: 17, 2: 58}"
2,0.5236,0.637032,0.868217,0.869634,0.868217,0.868555,"{0: 40, 1: 29, 2: 60}"
3,0.4743,0.605733,0.906977,0.910442,0.906977,0.906194,"{0: 46, 1: 22, 2: 61}"
4,0.2143,0.566951,0.875969,0.880657,0.875969,0.877412,"{0: 45, 1: 28, 2: 56}"
5,0.1255,0.753115,0.891473,0.89739,0.891473,0.893341,"{0: 40, 1: 31, 2: 58}"
6,0.0461,0.805806,0.891473,0.894508,0.891473,0.892493,"{0: 44, 1: 28, 2: 57}"


Evaluation results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



{'eval_loss': 0.8249726891517639, 'eval_accuracy': 0.8758169934640523, 'eval_precision': 0.8747129482423601, 'eval_recall': 0.8758169934640523, 'eval_f1': 0.8751861264351191, 'eval_class_distribution': {0: 37, 1: 32, 2: 84}, 'eval_runtime': 5.8267, 'eval_samples_per_second': 26.259, 'eval_steps_per_second': 13.215, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.87      0.92      0.89        36
     Neutral       0.74      0.70      0.72        33
     Positiv       0.92      0.92      0.92        84

    accuracy                           0.87       153
   macro avg       0.84      0.84      0.84       153
weighted avg       0.87      0.87      0.87       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 31, 2: 84}
Negativ Precision Score: 0.868421052631579
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8918918918918919

Neutral Precision Score: 0.7419354838709677
Neutral Recall Score: 0

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2066.45 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1912.82 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 1945.22 examples/s]


Training results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8406,0.677277,0.875969,0.881229,0.875969,0.874742,"{0: 48, 1: 21, 2: 60}"
2,0.5148,0.561571,0.852713,0.853139,0.852713,0.852835,"{0: 41, 1: 28, 2: 60}"
3,0.5142,0.705701,0.875969,0.881024,0.875969,0.87198,"{0: 49, 1: 19, 2: 61}"
4,0.2957,0.641458,0.891473,0.89366,0.891473,0.890966,"{0: 47, 1: 24, 2: 58}"
5,0.2029,0.870397,0.868217,0.870965,0.868217,0.86932,"{0: 41, 1: 29, 2: 59}"
6,0.134,0.751749,0.883721,0.891589,0.883721,0.886558,"{0: 41, 1: 31, 2: 57}"
7,0.0695,0.721296,0.899225,0.901084,0.899225,0.899628,"{0: 45, 1: 26, 2: 58}"


Evaluation results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.2591543197631836, 'eval_accuracy': 0.8300653594771242, 'eval_precision': 0.8355957767722473, 'eval_recall': 0.8300653594771242, 'eval_f1': 0.8315628387694315, 'eval_class_distribution': {0: 39, 1: 36, 2: 78}, 'eval_runtime': 5.8325, 'eval_samples_per_second': 26.232, 'eval_steps_per_second': 13.202, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.87      0.92      0.89        36
     Neutral       0.70      0.85      0.77        33
     Positiv       0.95      0.85      0.89        84

    accuracy                           0.86       153
   macro avg       0.84      0.87      0.85       153
weighted avg       0.88      0.86      0.87       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 40, 2: 75}
Negativ Precision Score: 0.868421052631579
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8918918918918919

Neutral Precision Score: 0.7
Neutral Recall Score: 0.84848484848484

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2082.14 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 1942.34 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2008.04 examples/s]


Training results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8982,0.744844,0.860465,0.861388,0.860465,0.857753,"{0: 46, 1: 21, 2: 62}"
2,0.5533,0.500796,0.891473,0.900517,0.891473,0.893963,"{0: 45, 1: 30, 2: 54}"
3,0.4948,0.765039,0.868217,0.878776,0.868217,0.865375,"{0: 52, 1: 19, 2: 58}"
4,0.2549,0.649222,0.875969,0.887438,0.875969,0.879569,"{0: 44, 1: 31, 2: 54}"
5,0.2389,0.759473,0.875969,0.882004,0.875969,0.876996,"{0: 48, 1: 25, 2: 56}"
6,0.1628,0.756356,0.883721,0.882957,0.883721,0.88302,"{0: 43, 1: 25, 2: 61}"
7,0.1415,0.770984,0.883721,0.891336,0.883721,0.884564,"{0: 49, 1: 26, 2: 54}"
8,0.0586,0.778513,0.891473,0.896541,0.891473,0.892253,"{0: 47, 1: 27, 2: 55}"


Evaluation results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



{'eval_loss': 0.6822752356529236, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8523992347521759, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8411792070558765, 'eval_class_distribution': {0: 33, 1: 42, 2: 78}, 'eval_runtime': 5.7832, 'eval_samples_per_second': 26.456, 'eval_steps_per_second': 13.314, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.91      0.86      0.89        36
     Neutral       0.66      0.82      0.73        33
     Positiv       0.91      0.85      0.88        84

    accuracy                           0.84       153
   macro avg       0.83      0.84      0.83       153
weighted avg       0.86      0.84      0.85       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 41, 2: 78}
Negativ Precision Score: 0.9117647058823529
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8857142857142857

Neutral Precision Score: 0.6585365853658537
Neutral Recall Score: 

In [5]:
# Long epoch sweep (10, 12 and 20 epochs) for the German Sentiment BERT
# checkpoint: absa_model is called once per epoch budget, with rn1 and rn2
# both fixed at 42.
german_sentiment_checkpoint = "aari1995/German_Sentiment"
long_epoch_grid = (10, 12, 20)
for epoch in long_epoch_grid:
    # Header line so the printed results of each run can be told apart.
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(data, german_sentiment_checkpoint, rn1=42, rn2=42, epochs=epoch)
    print()  # blank separator line between consecutive runs
# GPU: Tesla V100-PCIE-32GB, 10, 12 epochs saved (21.05.2025)

training and results for German Sentiment Bert for 10 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 1528.58 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2067.98 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2081.26 examples/s]


Training results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8523,0.70509,0.875969,0.878495,0.875969,0.874823,"{0: 48, 1: 23, 2: 58}"
2,0.528,0.614368,0.883721,0.883492,0.883721,0.883509,"{0: 43, 1: 26, 2: 60}"
3,0.4638,1.01926,0.844961,0.860479,0.844961,0.834611,"{0: 52, 1: 14, 2: 63}"
4,0.2759,0.626463,0.891473,0.893885,0.891473,0.891735,"{0: 46, 1: 26, 2: 57}"
5,0.176,0.834789,0.899225,0.899161,0.899225,0.897316,"{0: 41, 1: 23, 2: 65}"
6,0.1102,0.659578,0.922481,0.92359,0.922481,0.922393,"{0: 45, 1: 25, 2: 59}"
7,0.0759,0.667169,0.906977,0.90967,0.906977,0.907683,"{0: 45, 1: 27, 2: 57}"
8,0.0292,0.703687,0.906977,0.907636,0.906977,0.907057,"{0: 44, 1: 26, 2: 59}"
9,0.0017,0.704382,0.914729,0.914982,0.914729,0.914437,"{0: 44, 1: 25, 2: 60}"
10,0.0008,0.711589,0.914729,0.914982,0.914729,0.914437,"{0: 44, 1: 25, 2: 60}"


Evaluation results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.0469062328338623, 'eval_accuracy': 0.8627450980392157, 'eval_precision': 0.8671141729965259, 'eval_recall': 0.8627450980392157, 'eval_f1': 0.8642881509920799, 'eval_class_distribution': {0: 37, 1: 36, 2: 80}, 'eval_runtime': 6.2679, 'eval_samples_per_second': 24.41, 'eval_steps_per_second': 12.285, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.89      0.88        36
     Neutral       0.69      0.76      0.72        33
     Positiv       0.91      0.87      0.89        84

    accuracy                           0.85       153
   macro avg       0.82      0.84      0.83       153
weighted avg       0.85      0.85      0.85       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 37, 1: 36, 2: 80}
Negativ Precision Score: 0.8648648648648649
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8767123287671232

Neutral Precision Score: 0.6944444444444444
Neutral Recall Score: 

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2287.81 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2118.45 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2186.91 examples/s]


Training results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8587,0.668938,0.883721,0.884843,0.883721,0.883099,"{0: 46, 1: 24, 2: 59}"
2,0.5252,0.560682,0.875969,0.876837,0.875969,0.876334,"{0: 43, 1: 27, 2: 59}"
3,0.5044,1.000584,0.860465,0.861812,0.860465,0.857978,"{0: 46, 1: 21, 2: 62}"
4,0.3079,0.742252,0.875969,0.87893,0.875969,0.874885,"{0: 47, 1: 22, 2: 60}"
5,0.2485,0.819943,0.891473,0.891577,0.891473,0.891274,"{0: 40, 1: 28, 2: 61}"
6,0.1941,0.813111,0.883721,0.893488,0.883721,0.885089,"{0: 50, 1: 24, 2: 55}"
7,0.1245,0.82467,0.899225,0.907392,0.899225,0.900514,"{0: 49, 1: 24, 2: 56}"
8,0.05,0.696972,0.914729,0.915519,0.914729,0.914874,"{0: 44, 1: 26, 2: 59}"
9,0.0214,0.762072,0.906977,0.907982,0.906977,0.906858,"{0: 45, 1: 25, 2: 59}"
10,0.0018,0.810492,0.899225,0.901448,0.899225,0.899124,"{0: 46, 1: 24, 2: 59}"


Evaluation results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.2087804079055786, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8498477215699044, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8497042980919709, 'eval_class_distribution': {0: 37, 1: 33, 2: 83}, 'eval_runtime': 5.7842, 'eval_samples_per_second': 26.451, 'eval_steps_per_second': 13.312, 'epoch': 11.0}
              precision    recall  f1-score   support

     Negativ       0.89      0.86      0.87        36
     Neutral       0.71      0.82      0.76        33
     Positiv       0.93      0.88      0.90        84

    accuracy                           0.86       153
   macro avg       0.84      0.85      0.85       153
weighted avg       0.87      0.86      0.86       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 38, 2: 80}
Negativ Precision Score: 0.8857142857142857
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8732394366197183

Neutral Precision Score: 0.7105263157894737
Neutral Recall Score:

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2198.16 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2058.09 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2029.68 examples/s]


Training results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8728,0.813533,0.852713,0.865438,0.852713,0.84983,"{0: 52, 1: 18, 2: 59}"
2,0.5518,0.845349,0.829457,0.831668,0.829457,0.826141,"{0: 35, 1: 25, 2: 69}"
3,0.5204,0.793244,0.868217,0.871262,0.868217,0.866021,"{0: 48, 1: 21, 2: 60}"
4,0.2919,0.7591,0.852713,0.857641,0.852713,0.854298,"{0: 45, 1: 28, 2: 56}"
5,0.2679,0.561113,0.906977,0.915164,0.906977,0.904989,"{0: 50, 1: 20, 2: 59}"
6,0.1769,0.855669,0.891473,0.892189,0.891473,0.891734,"{0: 41, 1: 28, 2: 60}"
7,0.0901,0.853548,0.906977,0.908656,0.906977,0.906851,"{0: 45, 1: 24, 2: 60}"
8,0.1096,1.121755,0.860465,0.861878,0.860465,0.861088,"{0: 42, 1: 28, 2: 59}"
9,0.0385,1.548141,0.844961,0.858809,0.844961,0.848915,"{0: 38, 1: 34, 2: 57}"
10,0.0124,0.983451,0.899225,0.900654,0.899225,0.898439,"{0: 45, 1: 23, 2: 61}"


Evaluation results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3620851039886475, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8397894675368287, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8371428957512576, 'eval_class_distribution': {0: 40, 1: 34, 2: 79}, 'eval_runtime': 5.7601, 'eval_samples_per_second': 26.562, 'eval_steps_per_second': 13.368, 'epoch': 16.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.92      0.87        36
     Neutral       0.72      0.70      0.71        33
     Positiv       0.89      0.86      0.87        84

    accuracy                           0.84       153
   macro avg       0.81      0.82      0.82       153
weighted avg       0.84      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 32, 2: 81}
Negativ Precision Score: 0.825
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.868421052631579

Neutral Precision Score: 0.71875
Neutral Recall Score: 0.696969696969697
Neutra

In [5]:
# Short epoch sweep for aari1995/German_Sentiment: one absa_model() run per
# epoch budget, with rn1 and rn2 (reported as the random seeds in the output)
# both fixed at 42.
# Recorded on GPU: NVIDIA GeForce RTX 2080 Ti
# NOTE(review): the stored output of this cell opens with a "DBMDZ Bert"
# banner that does not match the print below; it appears to predate the
# current banner text. Re-run the cell to refresh it.
for epoch in (5, 6, 7, 8):
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(
        data,
        "aari1995/German_Sentiment",
        rn1=42,
        rn2=42,
        epochs=epoch,
    )
    print()  # blank separator line between consecutive runs

training and results for DBMDZ Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 1172.01 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2185.81 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2237.93 examples/s]


Training results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8823,0.721462,0.875969,0.888727,0.875969,0.874627,"{0: 51, 1: 19, 2: 59}"
2,0.5643,0.48372,0.868217,0.870841,0.868217,0.868716,"{0: 39, 1: 30, 2: 60}"
3,0.5064,0.586305,0.875969,0.876837,0.875969,0.876334,"{0: 43, 1: 27, 2: 59}"
4,0.2086,0.593073,0.883721,0.886732,0.883721,0.88365,"{0: 47, 1: 24, 2: 58}"
5,0.1784,0.725956,0.860465,0.863998,0.860465,0.861749,"{0: 44, 1: 28, 2: 57}"


Evaluation results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.8373995423316956, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8689605028614317, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8543234685178419, 'eval_class_distribution': {0: 33, 1: 44, 2: 76}, 'eval_runtime': 5.5374, 'eval_samples_per_second': 27.63, 'eval_steps_per_second': 13.906, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.91      0.81      0.85        36
     Neutral       0.60      0.91      0.72        33
     Positiv       0.96      0.81      0.88        84

    accuracy                           0.83       153
   macro avg       0.82      0.84      0.82       153
weighted avg       0.87      0.83      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 32, 1: 50, 2: 71}
Negativ Precision Score: 0.90625
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8529411764705882

Neutral Precision Score: 0.6
Neutral Recall Score: 0.9090909090909091
Neutral 

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2408.93 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2225.73 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2265.01 examples/s]


Training results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8745,0.648956,0.868217,0.885253,0.868217,0.867078,"{0: 54, 1: 19, 2: 56}"
2,0.5151,0.484807,0.883721,0.883979,0.883721,0.88295,"{0: 45, 1: 24, 2: 60}"
3,0.4706,0.64732,0.883721,0.884501,0.883721,0.882684,"{0: 43, 1: 23, 2: 63}"
4,0.2648,0.582936,0.899225,0.901448,0.899225,0.899124,"{0: 46, 1: 24, 2: 59}"
5,0.1992,0.779451,0.883721,0.883721,0.883721,0.883721,"{0: 42, 1: 27, 2: 60}"
6,0.1066,0.748869,0.891473,0.891542,0.891473,0.891407,"{0: 43, 1: 26, 2: 60}"


Evaluation results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



{'eval_loss': 0.9918785095214844, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8354838447006709, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8359001850692479, 'eval_class_distribution': {0: 35, 1: 32, 2: 86}, 'eval_runtime': 5.5212, 'eval_samples_per_second': 27.711, 'eval_steps_per_second': 13.946, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.85      0.81      0.83        36
     Neutral       0.68      0.64      0.66        33
     Positiv       0.86      0.90      0.88        84

    accuracy                           0.82       153
   macro avg       0.80      0.78      0.79       153
weighted avg       0.82      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 31, 2: 88}
Negativ Precision Score: 0.8529411764705882
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8285714285714286

Neutral Precision Score: 0.6774193548387096
Neutral Recall Score: 

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2384.26 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2246.09 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2241.53 examples/s]


Training results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8686,0.61173,0.891473,0.891541,0.891473,0.890727,"{0: 44, 1: 24, 2: 61}"
2,0.5555,0.569717,0.860465,0.864491,0.860465,0.861585,"{0: 46, 1: 26, 2: 57}"
3,0.4825,0.803119,0.844961,0.864683,0.844961,0.84873,"{0: 51, 1: 28, 2: 50}"
4,0.2388,0.702527,0.868217,0.87728,0.868217,0.867032,"{0: 50, 1: 20, 2: 59}"
5,0.2337,0.757945,0.852713,0.857793,0.852713,0.854196,"{0: 46, 1: 27, 2: 56}"
6,0.1407,1.141204,0.829457,0.839399,0.829457,0.832374,"{0: 44, 1: 31, 2: 54}"
7,0.0719,1.019477,0.852713,0.858992,0.852713,0.854525,"{0: 45, 1: 29, 2: 55}"


Evaluation results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.7267366647720337, 'eval_accuracy': 0.8627450980392157, 'eval_precision': 0.8606281412216186, 'eval_recall': 0.8627450980392157, 'eval_f1': 0.86056310735906, 'eval_class_distribution': {0: 35, 1: 29, 2: 89}, 'eval_runtime': 5.5687, 'eval_samples_per_second': 27.475, 'eval_steps_per_second': 13.827, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.88      0.83      0.86        36
     Neutral       0.79      0.70      0.74        33
     Positiv       0.87      0.93      0.90        84

    accuracy                           0.86       153
   macro avg       0.85      0.82      0.83       153
weighted avg       0.85      0.86      0.85       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 29, 2: 90}
Negativ Precision Score: 0.8823529411764706
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8571428571428571

Neutral Precision Score: 0.7931034482758621
Neutral Recall Score: 0.

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2395.86 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2247.03 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2259.58 examples/s]


Training results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8911,0.716446,0.868217,0.869417,0.868217,0.866342,"{0: 47, 1: 22, 2: 60}"
2,0.5442,0.536765,0.860465,0.86162,0.860465,0.86081,"{0: 44, 1: 26, 2: 59}"
3,0.534,0.861129,0.860465,0.871327,0.860465,0.858525,"{0: 51, 1: 19, 2: 59}"
4,0.2818,0.729262,0.868217,0.87336,0.868217,0.866647,"{0: 49, 1: 21, 2: 59}"
5,0.2519,0.827291,0.875969,0.882724,0.875969,0.874091,"{0: 49, 1: 20, 2: 60}"
6,0.1676,0.703367,0.891473,0.896471,0.891473,0.891826,"{0: 48, 1: 24, 2: 57}"
7,0.0964,0.811817,0.883721,0.887125,0.883721,0.884469,"{0: 46, 1: 25, 2: 58}"
8,0.0603,0.810468,0.883721,0.887125,0.883721,0.884469,"{0: 46, 1: 25, 2: 58}"


Evaluation results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



{'eval_loss': 0.9524356722831726, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.844600929467074, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8435228108625825, 'eval_class_distribution': {0: 38, 1: 34, 2: 81}, 'eval_runtime': 5.5209, 'eval_samples_per_second': 27.713, 'eval_steps_per_second': 13.947, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.87      0.92      0.89        36
     Neutral       0.71      0.76      0.74        33
     Positiv       0.93      0.88      0.90        84

    accuracy                           0.86       153
   macro avg       0.84      0.85      0.84       153
weighted avg       0.87      0.86      0.86       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 35, 2: 80}
Negativ Precision Score: 0.868421052631579
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8918918918918919

Neutral Precision Score: 0.7142857142857143
Neutral Recall Score: 0.

In [None]:
# Long epoch sweep for aari1995/German_Sentiment: one absa_model() run per
# epoch budget, with rn1 and rn2 (reported as the random seeds in the output)
# both fixed at 42.
# Recorded on GPU: NVIDIA GeForce RTX 2080 Ti
# NOTE(review): the stored output of this cell opens with a "DBMDZ Bert"
# banner that does not match the print below, and it ends after the 20-epoch
# training table without an evaluation section. Re-run the cell to refresh it.
for epoch in (10, 12, 20):
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(
        data,
        "aari1995/German_Sentiment",
        rn1=42,
        rn2=42,
        epochs=epoch,
    )
    print()  # blank separator line between consecutive runs

training and results for DBMDZ Bert for 10 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 1544.44 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2139.16 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2175.34 examples/s]


Training results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8395,0.791142,0.860465,0.865477,0.860465,0.85883,"{0: 49, 1: 21, 2: 59}"
2,0.5341,0.854222,0.837209,0.844961,0.837209,0.838044,"{0: 36, 1: 33, 2: 60}"
3,0.5082,0.742958,0.868217,0.879837,0.868217,0.866869,"{0: 50, 1: 19, 2: 60}"
4,0.2749,0.765735,0.852713,0.857564,0.852713,0.854547,"{0: 41, 1: 30, 2: 58}"
5,0.1764,0.815526,0.875969,0.879838,0.875969,0.877259,"{0: 40, 1: 30, 2: 59}"
6,0.1184,0.767304,0.899225,0.900804,0.899225,0.899732,"{0: 44, 1: 27, 2: 58}"
7,0.0595,0.658802,0.930233,0.931396,0.930233,0.929347,"{0: 44, 1: 23, 2: 62}"
8,0.0288,0.727754,0.906977,0.90835,0.906977,0.907127,"{0: 45, 1: 26, 2: 58}"
9,0.0032,0.718983,0.922481,0.922194,0.922481,0.921549,"{0: 43, 1: 24, 2: 62}"
10,0.0011,0.717965,0.922481,0.922194,0.922481,0.921549,"{0: 43, 1: 24, 2: 62}"


Evaluation results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.3482502698898315, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.854113139508677, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8546529723000311, 'eval_class_distribution': {0: 36, 1: 30, 2: 87}, 'eval_runtime': 6.226, 'eval_samples_per_second': 24.575, 'eval_steps_per_second': 12.368, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.89      0.89      0.89        36
     Neutral       0.77      0.70      0.73        33
     Positiv       0.87      0.90      0.89        84

    accuracy                           0.86       153
   macro avg       0.84      0.83      0.84       153
weighted avg       0.85      0.86      0.85       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 30, 2: 87}
Negativ Precision Score: 0.8888888888888888
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8888888888888888

Neutral Precision Score: 0.7666666666666667
Neutral Recall Score: 0

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2400.48 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2276.37 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2256.02 examples/s]


Training results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8497,0.632475,0.875969,0.875969,0.875969,0.875969,"{0: 42, 1: 27, 2: 60}"
2,0.5716,0.470746,0.868217,0.868853,0.868217,0.8683,"{0: 44, 1: 26, 2: 59}"
3,0.4932,0.62057,0.875969,0.885332,0.875969,0.875285,"{0: 51, 1: 21, 2: 57}"
4,0.2626,0.663084,0.891473,0.892821,0.891473,0.890486,"{0: 46, 1: 23, 2: 60}"
5,0.2514,1.012527,0.860465,0.861878,0.860465,0.861088,"{0: 42, 1: 28, 2: 59}"
6,0.1733,0.967994,0.860465,0.872171,0.860465,0.864105,"{0: 39, 1: 33, 2: 57}"
7,0.1356,0.689719,0.899225,0.900623,0.899225,0.899837,"{0: 42, 1: 28, 2: 59}"
8,0.0731,0.741803,0.899225,0.90336,0.899225,0.900654,"{0: 41, 1: 30, 2: 58}"
9,0.0147,0.996164,0.891473,0.891473,0.891473,0.891473,"{0: 42, 1: 27, 2: 60}"
10,0.0063,1.108757,0.883721,0.884437,0.883721,0.883982,"{0: 41, 1: 28, 2: 60}"


Evaluation results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.1062209606170654, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.8640330576294835, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8591271587212916, 'eval_class_distribution': {0: 37, 1: 37, 2: 79}, 'eval_runtime': 6.5334, 'eval_samples_per_second': 23.418, 'eval_steps_per_second': 11.786, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.89      0.89      0.89        36
     Neutral       0.67      0.79      0.72        33
     Positiv       0.94      0.87      0.90        84

    accuracy                           0.86       153
   macro avg       0.83      0.85      0.84       153
weighted avg       0.87      0.86      0.86       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 39, 2: 78}
Negativ Precision Score: 0.8888888888888888
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8888888888888888

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score:

Map: 100%|██████████| 1111/1111 [00:00<00:00, 2325.14 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2105.80 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2156.86 examples/s]


Training results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8642,0.6025,0.883721,0.886762,0.883721,0.882243,"{0: 48, 1: 22, 2: 59}"
2,0.5475,0.718769,0.875969,0.88268,0.875969,0.87705,"{0: 37, 1: 32, 2: 60}"
3,0.5522,0.570527,0.891473,0.897188,0.891473,0.890652,"{0: 47, 1: 21, 2: 61}"
4,0.3046,0.612081,0.868217,0.872502,0.868217,0.869262,"{0: 46, 1: 27, 2: 56}"
5,0.2334,0.659868,0.875969,0.874395,0.875969,0.873891,"{0: 44, 1: 23, 2: 62}"
6,0.1562,0.614664,0.891473,0.900194,0.891473,0.89335,"{0: 48, 1: 27, 2: 54}"
7,0.0801,0.769258,0.906977,0.911498,0.906977,0.906686,"{0: 48, 1: 23, 2: 58}"
8,0.1055,0.897929,0.868217,0.876876,0.868217,0.870811,"{0: 41, 1: 32, 2: 56}"
9,0.0168,0.818693,0.914729,0.929083,0.914729,0.915143,"{0: 52, 1: 21, 2: 56}"
10,0.0019,0.762654,0.922481,0.924614,0.922481,0.921983,"{0: 45, 1: 23, 2: 61}"


In [5]:
# Stand-alone 20-epoch run of aari1995/German_Sentiment, with rn1 and rn2
# (reported as the random seeds in the output) both fixed at 42.
# Recorded on GPU: NVIDIA GeForce RTX 2080 Ti
absa_model(
    data,
    "aari1995/German_Sentiment",
    epochs=20,
    rn1=42,
    rn2=42,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
tensor([1.0957, 1.3467, 0.7436])tral, positive): 


Map: 100%|██████████| 1111/1111 [00:00<00:00, 1579.81 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2193.47 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 2217.16 examples/s]


Training results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8642,0.6025,0.883721,0.886762,0.883721,0.882243,"{0: 48, 1: 22, 2: 59}"
2,0.5475,0.718769,0.875969,0.88268,0.875969,0.87705,"{0: 37, 1: 32, 2: 60}"
3,0.5522,0.570527,0.891473,0.897188,0.891473,0.890652,"{0: 47, 1: 21, 2: 61}"
4,0.3046,0.612081,0.868217,0.872502,0.868217,0.869262,"{0: 46, 1: 27, 2: 56}"
5,0.2334,0.659868,0.875969,0.874395,0.875969,0.873891,"{0: 44, 1: 23, 2: 62}"
6,0.1562,0.614664,0.891473,0.900194,0.891473,0.89335,"{0: 48, 1: 27, 2: 54}"
7,0.0801,0.769258,0.906977,0.911498,0.906977,0.906686,"{0: 48, 1: 23, 2: 58}"
8,0.1055,0.897929,0.868217,0.876876,0.868217,0.870811,"{0: 41, 1: 32, 2: 56}"
9,0.0168,0.818693,0.914729,0.929083,0.914729,0.915143,"{0: 52, 1: 21, 2: 56}"
10,0.0019,0.762654,0.922481,0.924614,0.922481,0.921983,"{0: 45, 1: 23, 2: 61}"


Evaluation results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3664731979370117, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.8596078431372548, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8474435812060673, 'eval_class_distribution': {0: 36, 1: 42, 2: 75}, 'eval_runtime': 6.4489, 'eval_samples_per_second': 23.725, 'eval_steps_per_second': 11.94, 'epoch': 15.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.83      0.85        36
     Neutral       0.67      0.85      0.75        33
     Positiv       0.93      0.85      0.89        84

    accuracy                           0.84       153
   macro avg       0.82      0.84      0.83       153
weighted avg       0.86      0.84      0.85       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 42, 2: 76}
Negativ Precision Score: 0.8571428571428571
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8450704225352113

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score: 

In [7]:
# Train aari1995/German_Sentiment for 5 and 7 epochs with save=True. Per the
# recorded output, the best model and its tokenizer are written to
# ./saved_models/ and ./saved_tokenizers/ under
# absa_aari1995_German_Sentiment_42_42_<epochs>.
# rn1 and rn2 (reported as the random seeds in the output) are both 42.
# Recorded on GPU: NVIDIA A30
for epoch in (5, 7):
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(
        data,
        "aari1995/German_Sentiment",
        rn1=42,
        rn2=42,
        epochs=epoch,
        save=True,
    )
    print()  # blank separator line between consecutive runs

training and results for German Sentiment Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 3818.94 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3804.02 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3725.59 examples/s]


Training results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8824,0.97153,0.821705,0.873684,0.821705,0.802826,"{0: 62, 1: 9, 2: 58}"
2,0.5375,0.553547,0.868217,0.870688,0.868217,0.8683,"{0: 46, 1: 24, 2: 59}"
3,0.452,0.770169,0.891473,0.891541,0.891473,0.890727,"{0: 44, 1: 24, 2: 61}"
4,0.2103,0.805933,0.883721,0.884319,0.883721,0.882582,"{0: 45, 1: 23, 2: 61}"
5,0.2026,0.936767,0.868217,0.871997,0.868217,0.869577,"{0: 43, 1: 29, 2: 57}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_5

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_5
Evaluation results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.9783487915992737, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.85623111989894, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8561623054644532, 'eval_class_distribution': {0: 35, 1: 33, 2: 85}, 'eval_runtime': 7.317, 'eval_samples_per_second': 20.91, 'eval_steps_per_second': 10.523, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.88      0.83      0.86        36
     Neutral       0.75      0.82      0.78        33
     Positiv       0.92      0.90      0.91        84

    accuracy                           0.87       153
   macro avg       0.85      0.85      0.85       153
weighted avg       0.87      0.87      0.87       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 36, 2: 83}
Negativ Precision Score: 0.8823529411764706
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8571428571428571

Neutral Precision Score: 0.75
Neutral Recall Score: 0.8181818181818182

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4005.98 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3929.05 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3907.31 examples/s]


Training results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9038,0.656261,0.883721,0.884589,0.883721,0.884086,"{0: 43, 1: 27, 2: 59}"
2,0.5465,0.574743,0.868217,0.885456,0.868217,0.870272,"{0: 53, 1: 22, 2: 54}"
3,0.5327,0.616116,0.883721,0.884141,0.883721,0.883155,"{0: 44, 1: 24, 2: 61}"
4,0.3257,0.765709,0.875969,0.885395,0.875969,0.873925,"{0: 49, 1: 19, 2: 61}"
5,0.2288,0.692132,0.899225,0.901921,0.899225,0.898858,"{0: 46, 1: 23, 2: 60}"
6,0.1859,0.842588,0.868217,0.872111,0.868217,0.869267,"{0: 45, 1: 28, 2: 56}"
7,0.0937,0.867013,0.883721,0.885831,0.883721,0.883819,"{0: 46, 1: 25, 2: 58}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_7
Evaluation results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.8971391916275024, 'eval_accuracy': 0.8627450980392157, 'eval_precision': 0.8630566438582937, 'eval_recall': 0.8627450980392157, 'eval_f1': 0.8624587035926511, 'eval_class_distribution': {0: 39, 1: 32, 2: 82}, 'eval_runtime': 7.3091, 'eval_samples_per_second': 20.933, 'eval_steps_per_second': 10.535, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.85      0.92      0.88        36
     Neutral       0.78      0.76      0.77        33
     Positiv       0.93      0.90      0.92        84

    accuracy                           0.88       153
   macro avg       0.85      0.86      0.85       153
weighted avg       0.88      0.88      0.88       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 32, 2: 82}
Negativ Precision Score: 0.8461538461538461
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.88

Neutral Precision Score: 0.78125
Neutral Recall Score: 0.7575757575757576
Neutra

In [6]:
# Train aari1995/German_Sentiment for 5, 6, 7 and 8 epochs with save=True. Per
# the recorded output, the best model and its tokenizer are written to
# ./saved_models/ and ./saved_tokenizers/ under
# absa_aari1995_German_Sentiment_42_42_<epochs>.
# rn1 and rn2 (reported as the random seeds in the output) are both 42.
# Recorded on GPU: NVIDIA A30
# NOTE(review): the stored output of this cell opens with a "DBMDZ Bert"
# banner that does not match the print below; it appears to predate the
# current banner text. Re-run the cell to refresh it.
for epoch in (5, 6, 7, 8):
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(
        data,
        "aari1995/German_Sentiment",
        rn1=42,
        rn2=42,
        epochs=epoch,
        save=True,
    )
    print()  # blank separator line between consecutive runs

training and results for DBMDZ Bert for 5 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 4060.47 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3651.01 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3812.03 examples/s]


Training results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8824,0.97153,0.821705,0.873684,0.821705,0.802826,"{0: 62, 1: 9, 2: 58}"
2,0.5375,0.553547,0.868217,0.870688,0.868217,0.8683,"{0: 46, 1: 24, 2: 59}"
3,0.452,0.770169,0.891473,0.891541,0.891473,0.890727,"{0: 44, 1: 24, 2: 61}"
4,0.2103,0.805933,0.883721,0.884319,0.883721,0.882582,"{0: 45, 1: 23, 2: 61}"
5,0.2026,0.936767,0.868217,0.871997,0.868217,0.869577,"{0: 43, 1: 29, 2: 57}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_5

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_5
Evaluation results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.9783487915992737, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.85623111989894, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8561623054644532, 'eval_class_distribution': {0: 35, 1: 33, 2: 85}, 'eval_runtime': 7.1772, 'eval_samples_per_second': 21.317, 'eval_steps_per_second': 10.728, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.88      0.83      0.86        36
     Neutral       0.75      0.82      0.78        33
     Positiv       0.92      0.90      0.91        84

    accuracy                           0.87       153
   macro avg       0.85      0.85      0.85       153
weighted avg       0.87      0.87      0.87       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 36, 2: 83}
Negativ Precision Score: 0.8823529411764706
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8571428571428571

Neutral Precision Score: 0.75
Neutral Recall Score: 0.81818181818181

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4274.94 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3949.61 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4026.28 examples/s]


Training results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9119,0.623253,0.883721,0.908965,0.883721,0.882207,"{0: 55, 1: 17, 2: 57}"
2,0.5104,0.740861,0.837209,0.848215,0.837209,0.839176,"{0: 36, 1: 34, 2: 59}"
3,0.4436,0.715509,0.868217,0.870328,0.868217,0.865894,"{0: 47, 1: 21, 2: 61}"
4,0.244,0.627744,0.906977,0.906579,0.906977,0.906446,"{0: 43, 1: 25, 2: 61}"
5,0.1854,0.68801,0.906977,0.908279,0.906977,0.906126,"{0: 45, 1: 23, 2: 61}"
6,0.0935,0.642033,0.922481,0.923643,0.922481,0.922088,"{0: 45, 1: 24, 2: 60}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_6

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_6
Evaluation results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



{'eval_loss': 0.9785120487213135, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.8614384165095456, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8579112116996729, 'eval_class_distribution': {0: 38, 1: 36, 2: 79}, 'eval_runtime': 7.2178, 'eval_samples_per_second': 21.198, 'eval_steps_per_second': 10.668, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.83      0.94      0.88        36
     Neutral       0.64      0.64      0.64        33
     Positiv       0.90      0.85      0.87        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.83      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 33, 2: 79}
Negativ Precision Score: 0.8292682926829268
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.8831168831168831

Neutral Precision Score: 0.6363636363636364
Neutral Recall Score: 

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4206.91 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3987.74 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4020.55 examples/s]


Training results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9038,0.656261,0.883721,0.884589,0.883721,0.884086,"{0: 43, 1: 27, 2: 59}"
2,0.5465,0.574743,0.868217,0.885456,0.868217,0.870272,"{0: 53, 1: 22, 2: 54}"
3,0.5327,0.616116,0.883721,0.884141,0.883721,0.883155,"{0: 44, 1: 24, 2: 61}"
4,0.3257,0.765709,0.875969,0.885395,0.875969,0.873925,"{0: 49, 1: 19, 2: 61}"
5,0.2288,0.692132,0.899225,0.901921,0.899225,0.898858,"{0: 46, 1: 23, 2: 60}"
6,0.1859,0.842588,0.868217,0.872111,0.868217,0.869267,"{0: 45, 1: 28, 2: 56}"
7,0.0937,0.867013,0.883721,0.885831,0.883721,0.883819,"{0: 46, 1: 25, 2: 58}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_7

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_7
Evaluation results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.8971391916275024, 'eval_accuracy': 0.8627450980392157, 'eval_precision': 0.8630566438582937, 'eval_recall': 0.8627450980392157, 'eval_f1': 0.8624587035926511, 'eval_class_distribution': {0: 39, 1: 32, 2: 82}, 'eval_runtime': 7.208, 'eval_samples_per_second': 21.226, 'eval_steps_per_second': 10.683, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.85      0.92      0.88        36
     Neutral       0.78      0.76      0.77        33
     Positiv       0.93      0.90      0.92        84

    accuracy                           0.88       153
   macro avg       0.85      0.86      0.85       153
weighted avg       0.88      0.88      0.88       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 32, 2: 82}
Negativ Precision Score: 0.8461538461538461
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.88

Neutral Precision Score: 0.78125
Neutral Recall Score: 0.7575757575757576
Neutral

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4158.96 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4054.84 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3993.93 examples/s]


Training results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8976,0.819189,0.837209,0.868564,0.837209,0.823483,"{0: 60, 1: 12, 2: 57}"
2,0.5459,0.549485,0.891473,0.889679,0.891473,0.889848,"{0: 43, 1: 24, 2: 62}"
3,0.4719,0.827644,0.868217,0.868833,0.868217,0.86638,"{0: 45, 1: 22, 2: 62}"
4,0.2458,0.690967,0.891473,0.893065,0.891473,0.891744,"{0: 45, 1: 26, 2: 58}"
5,0.2218,0.732186,0.899225,0.899166,0.899225,0.898415,"{0: 44, 1: 24, 2: 61}"
6,0.0898,0.896576,0.875969,0.876773,0.875969,0.876278,"{0: 41, 1: 28, 2: 60}"
7,0.0453,0.849441,0.899225,0.901475,0.899225,0.89794,"{0: 46, 1: 22, 2: 61}"
8,0.025,0.877132,0.891473,0.895371,0.891473,0.890744,"{0: 47, 1: 22, 2: 60}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_8

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_8
Evaluation results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1299880743026733, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8480319973744039, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8485100417318726, 'eval_class_distribution': {0: 35, 1: 31, 2: 87}, 'eval_runtime': 7.226, 'eval_samples_per_second': 21.173, 'eval_steps_per_second': 10.656, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.83      0.85        36
     Neutral       0.72      0.70      0.71        33
     Positiv       0.86      0.88      0.87        84

    accuracy                           0.83       153
   macro avg       0.81      0.80      0.81       153
weighted avg       0.83      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 32, 2: 86}
Negativ Precision Score: 0.8571428571428571
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8450704225352113

Neutral Precision Score: 0.71875
Neutral Recall Score: 0.6969696969

In [6]:
# Fine-tune the German Sentiment BERT checkpoint once per epoch budget, always with
# the same pair of random seeds (rn1=42, rn2=42) so the runs differ only in epochs.
# NOTE(review): save=True presumably makes absa_model persist the best model and its
# tokenizer (the cell output reports the save paths) - confirm against absa_model_train.
for epoch in (10, 12, 20):
    print(f'training and results for German Sentiment Bert for {epoch} epochs:')
    absa_model(
        data,
        "aari1995/German_Sentiment",
        rn1=42,
        rn2=42,
        epochs=epoch,
        save=True,
    )
    print()  # blank line between consecutive runs
# GPU: NVIDIA A30

training and results for German Sentiment Bert for 10 epochs:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 4003.22 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3574.45 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3683.54 examples/s]


Training results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8687,0.963993,0.829457,0.883016,0.829457,0.818445,"{0: 63, 1: 11, 2: 55}"
2,0.5082,0.935839,0.829457,0.849099,0.829457,0.83093,"{0: 32, 1: 37, 2: 60}"
3,0.5221,0.983847,0.837209,0.840277,0.837209,0.832491,"{0: 50, 1: 19, 2: 60}"
4,0.2651,0.948324,0.844961,0.850181,0.844961,0.846534,"{0: 40, 1: 31, 2: 58}"
5,0.2229,0.756907,0.883721,0.886772,0.883721,0.882852,"{0: 46, 1: 22, 2: 61}"
6,0.1861,0.911469,0.844961,0.8537,0.844961,0.847947,"{0: 44, 1: 30, 2: 55}"
7,0.1104,0.816184,0.868217,0.87861,0.868217,0.870515,"{0: 49, 1: 26, 2: 54}"
8,0.0865,0.929037,0.868217,0.877087,0.868217,0.870558,"{0: 48, 1: 26, 2: 55}"
9,0.0122,0.855249,0.899225,0.910749,0.899225,0.900007,"{0: 51, 1: 22, 2: 56}"
10,0.0036,0.903134,0.875969,0.884216,0.875969,0.878393,"{0: 47, 1: 27, 2: 55}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_10

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_10
Evaluation results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.1450538635253906, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.8597936536611651, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8575058384023574, 'eval_class_distribution': {0: 34, 1: 36, 2: 83}, 'eval_runtime': 7.149, 'eval_samples_per_second': 21.402, 'eval_steps_per_second': 10.771, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.87      0.92      0.89        36
     Neutral       0.72      0.79      0.75        33
     Positiv       0.91      0.86      0.88        84

    accuracy                           0.86       153
   macro avg       0.83      0.85      0.84       153
weighted avg       0.86      0.86      0.86       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 36, 2: 79}
Negativ Precision Score: 0.868421052631579
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8918918918918919

Neutral Precision Score: 0.7222222222222222
Neutral Recall Score: 0

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4033.49 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3879.08 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3880.14 examples/s]


Training results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8989,0.918514,0.837209,0.862563,0.837209,0.829621,"{0: 56, 1: 14, 2: 59}"
2,0.516,0.599222,0.852713,0.849629,0.852713,0.849907,"{0: 43, 1: 23, 2: 63}"
3,0.4661,0.60164,0.875969,0.874638,0.875969,0.874997,"{0: 43, 1: 25, 2: 61}"
4,0.2661,0.573434,0.906977,0.913837,0.906977,0.90835,"{0: 48, 1: 25, 2: 56}"
5,0.1979,0.679561,0.899225,0.9121,0.899225,0.899771,"{0: 52, 1: 22, 2: 55}"
6,0.1193,0.704647,0.899225,0.904631,0.899225,0.899777,"{0: 48, 1: 24, 2: 57}"
7,0.0948,0.755952,0.914729,0.926194,0.914729,0.914681,"{0: 51, 1: 21, 2: 57}"
8,0.0619,0.963016,0.891473,0.911882,0.891473,0.886217,"{0: 54, 1: 16, 2: 59}"
9,0.0001,0.644698,0.922481,0.929311,0.922481,0.922133,"{0: 49, 1: 22, 2: 58}"
10,0.0089,0.756722,0.914729,0.919517,0.914729,0.914569,"{0: 48, 1: 23, 2: 58}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_12

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_12
Evaluation results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.2433797121047974, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8541262658909718, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8500083794201441, 'eval_class_distribution': {0: 42, 1: 33, 2: 78}, 'eval_runtime': 7.2263, 'eval_samples_per_second': 21.173, 'eval_steps_per_second': 10.655, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.77      0.94      0.85        36
     Neutral       0.70      0.70      0.70        33
     Positiv       0.93      0.85      0.89        84

    accuracy                           0.84       153
   macro avg       0.80      0.83      0.81       153
weighted avg       0.85      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 44, 1: 33, 2: 76}
Negativ Precision Score: 0.7727272727272727
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.85

Neutral Precision Score: 0.696969696969697
Neutral Recall Score: 0.696969696969

Map: 100%|██████████| 1111/1111 [00:00<00:00, 4016.76 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3714.27 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3914.51 examples/s]


Training results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8808,1.006432,0.821705,0.884794,0.821705,0.809369,"{0: 65, 1: 10, 2: 54}"
2,0.5463,0.70961,0.860465,0.860465,0.860465,0.860465,"{0: 42, 1: 27, 2: 60}"
3,0.5244,0.786635,0.875969,0.896885,0.875969,0.871794,"{0: 53, 1: 16, 2: 60}"
4,0.2522,0.85272,0.868217,0.877711,0.868217,0.865526,"{0: 51, 1: 19, 2: 59}"
5,0.3405,0.835548,0.875969,0.889401,0.875969,0.873973,"{0: 49, 1: 18, 2: 62}"
6,0.1771,0.738415,0.844961,0.875502,0.844961,0.852286,"{0: 37, 1: 39, 2: 53}"
7,0.1289,0.886112,0.891473,0.903614,0.891473,0.893073,"{0: 51, 1: 24, 2: 54}"
8,0.0758,0.966485,0.852713,0.859588,0.852713,0.855141,"{0: 40, 1: 31, 2: 58}"
9,0.034,1.430055,0.837209,0.853782,0.837209,0.841637,"{0: 37, 1: 35, 2: 57}"
10,0.0399,1.096365,0.883721,0.904813,0.883721,0.880644,"{0: 55, 1: 17, 2: 57}"



Best Model saved at: ./saved_models/absa_aari1995_German_Sentiment_42_42_20

Tokenizer for best Model saved at: ./saved_tokenizers/absa_aari1995_German_Sentiment_42_42_20
Evaluation results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3768686056137085, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.8470219666814094, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8445081084122354, 'eval_class_distribution': {0: 38, 1: 35, 2: 80}, 'eval_runtime': 7.1628, 'eval_samples_per_second': 21.36, 'eval_steps_per_second': 10.75, 'epoch': 19.0}
              precision    recall  f1-score   support

     Negativ       0.80      0.89      0.84        36
     Neutral       0.66      0.70      0.68        33
     Positiv       0.91      0.85      0.88        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.83      0.82      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 35, 2: 78}
Negativ Precision Score: 0.8
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8421052631578947

Neutral Precision Score: 0.6571428571428571
Neutral Recall Score: 0.69696969696969

#### All models: training for 5, 6, 7, 8, 10, 12, and 20 epochs

In [5]:
# Baseline comparison: fine-tune every checkpoint listed in `models` for 5 epochs,
# using identical random seeds (rn1=42, rn2=42) for each checkpoint.
for model in models:
    print(f'training and results for {model}:')
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=5,
    )
    print()  # blank line between consecutive models

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2390.47 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3813.27 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3710.98 examples/s]


Training results for google-bert/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8296,0.987348,0.751938,0.781377,0.751938,0.754374,"{0: 56, 1: 28, 2: 45}"
2,0.4975,0.774625,0.821705,0.834225,0.821705,0.824619,"{0: 47, 1: 30, 2: 52}"
3,0.4684,1.091386,0.790698,0.794574,0.790698,0.788501,"{0: 45, 1: 20, 2: 64}"
4,0.2287,0.996374,0.813953,0.816359,0.813953,0.81484,"{0: 42, 1: 29, 2: 58}"
5,0.1862,1.054386,0.806202,0.812882,0.806202,0.80834,"{0: 42, 1: 31, 2: 56}"


Evaluation results for google-bert/bert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.162582516670227, 'eval_accuracy': 0.7450980392156863, 'eval_precision': 0.7918716074000222, 'eval_recall': 0.7450980392156863, 'eval_f1': 0.746741949965157, 'eval_class_distribution': {0: 54, 1: 40, 2: 59}, 'eval_runtime': 2.3363, 'eval_samples_per_second': 65.488, 'eval_steps_per_second': 32.958, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.60      0.81      0.69        36
     Neutral       0.58      0.79      0.67        33
     Positiv       0.92      0.65      0.76        84

    accuracy                           0.72       153
   macro avg       0.70      0.75      0.71       153
weighted avg       0.77      0.72      0.73       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 48, 1: 45, 2: 60}
Negativ Precision Score: 0.6041666666666666
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.6904761904761905

Neutral Precision Score: 0.5777777777777777
Neutral Recall Score: 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4072.73 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3950.97 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3963.02 examples/s]


Training results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8579,0.607639,0.844961,0.858825,0.844961,0.843355,"{0: 54, 1: 20, 2: 55}"
2,0.5551,0.743898,0.837209,0.83879,0.837209,0.836884,"{0: 38, 1: 27, 2: 64}"
3,0.492,0.903522,0.837209,0.835397,0.837209,0.833369,"{0: 44, 1: 21, 2: 64}"
4,0.2605,0.880561,0.837209,0.838488,0.837209,0.837269,"{0: 45, 1: 25, 2: 59}"
5,0.2015,0.872499,0.837209,0.838359,0.837209,0.837702,"{0: 42, 1: 28, 2: 59}"


Evaluation results for dbmdz/bert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.997826099395752, 'eval_accuracy': 0.7647058823529411, 'eval_precision': 0.7945879187120263, 'eval_recall': 0.7647058823529411, 'eval_f1': 0.7671448865777926, 'eval_class_distribution': {0: 53, 1: 33, 2: 67}, 'eval_runtime': 2.3417, 'eval_samples_per_second': 65.337, 'eval_steps_per_second': 32.882, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.64      0.97      0.77        36
     Neutral       0.79      0.67      0.72        33
     Positiv       0.93      0.77      0.84        84

    accuracy                           0.80       153
   macro avg       0.78      0.80      0.78       153
weighted avg       0.83      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 55, 1: 28, 2: 70}
Negativ Precision Score: 0.6363636363636364
Negativ Recall Score: 0.9722222222222222
Negativ F1 Score: 0.7692307692307693

Neutral Precision Score: 0.7857142857142857
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3821.78 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3695.57 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3710.61 examples/s]


Training results for dbmdz/bert-base-german-uncased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8492,0.683184,0.844961,0.848891,0.844961,0.844586,"{0: 48, 1: 23, 2: 58}"
2,0.5615,0.58389,0.829457,0.840033,0.829457,0.832542,"{0: 40, 1: 33, 2: 56}"
3,0.4485,0.777486,0.852713,0.855932,0.852713,0.853683,"{0: 40, 1: 30, 2: 59}"
4,0.2257,0.67998,0.860465,0.863138,0.860465,0.861469,"{0: 42, 1: 29, 2: 58}"
5,0.2182,0.7898,0.844961,0.854681,0.844961,0.847538,"{0: 40, 1: 33, 2: 56}"


Evaluation results for dbmdz/bert-base-german-uncased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.0990517139434814, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8299243206054351, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8203795299412042, 'eval_class_distribution': {0: 38, 1: 40, 2: 75}, 'eval_runtime': 2.3581, 'eval_samples_per_second': 64.883, 'eval_steps_per_second': 32.653, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.89      0.85        36
     Neutral       0.68      0.82      0.74        33
     Positiv       0.92      0.81      0.86        84

    accuracy                           0.83       153
   macro avg       0.80      0.84      0.82       153
weighted avg       0.84      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 40, 2: 74}
Negativ Precision Score: 0.8205128205128205
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8533333333333334

Neutral Precision Score: 0.675
Neutral Recall Score: 0.81818181818

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4702.33 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4125.58 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4198.80 examples/s]


Training results for FacebookAI/xlm-roberta-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8995,0.633655,0.79845,0.846253,0.79845,0.801034,"{0: 63, 1: 18, 2: 48}"
2,0.7669,0.958312,0.790698,0.790303,0.790698,0.787535,"{0: 40, 1: 22, 2: 67}"
3,0.7834,1.03833,0.829457,0.839276,0.829457,0.830989,"{0: 50, 1: 25, 2: 54}"
4,0.48,1.15733,0.806202,0.810956,0.806202,0.808082,"{0: 43, 1: 29, 2: 57}"
5,0.4995,1.159907,0.806202,0.810956,0.806202,0.808082,"{0: 43, 1: 29, 2: 57}"


Evaluation results for FacebookAI/xlm-roberta-base with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.1958260536193848, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.8035030726757454, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.7955541330407017, 'eval_class_distribution': {0: 47, 1: 29, 2: 77}, 'eval_runtime': 2.2587, 'eval_samples_per_second': 67.739, 'eval_steps_per_second': 34.091, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.70      0.89      0.78        36
     Neutral       0.63      0.58      0.60        33
     Positiv       0.88      0.81      0.84        84

    accuracy                           0.78       153
   macro avg       0.74      0.76      0.74       153
weighted avg       0.79      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 30, 2: 77}
Negativ Precision Score: 0.6956521739130435
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7804878048780488

Neutral Precision Score: 0.6333333333333333
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3606.38 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4509.25 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4552.98 examples/s]


Training results for TUM/GottBERT_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8872,0.654201,0.860465,0.867849,0.860465,0.860479,"{0: 50, 1: 23, 2: 56}"
2,0.5388,0.510353,0.868217,0.867489,0.868217,0.867765,"{0: 42, 1: 26, 2: 61}"
3,0.4743,0.825377,0.844961,0.852485,0.844961,0.842293,"{0: 49, 1: 19, 2: 61}"
4,0.2875,0.759579,0.844961,0.847627,0.844961,0.842292,"{0: 49, 1: 21, 2: 59}"
5,0.1987,0.784528,0.837209,0.84186,0.837209,0.838667,"{0: 45, 1: 28, 2: 56}"


Evaluation results for TUM/GottBERT_base_best with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.8579771518707275, 'eval_accuracy': 0.8235294117647058, 'eval_precision': 0.8285151667504609, 'eval_recall': 0.8235294117647058, 'eval_f1': 0.8247620851883427, 'eval_class_distribution': {0: 39, 1: 36, 2: 78}, 'eval_runtime': 2.2787, 'eval_samples_per_second': 67.143, 'eval_steps_per_second': 33.791, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.76      0.86      0.81        36
     Neutral       0.66      0.70      0.68        33
     Positiv       0.88      0.81      0.84        84

    accuracy                           0.80       153
   macro avg       0.77      0.79      0.78       153
weighted avg       0.80      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 35, 2: 77}
Negativ Precision Score: 0.7560975609756098
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8051948051948052

Neutral Precision Score: 0.6571428571428571
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4938.26 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4490.21 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4421.90 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8376,0.72531,0.844961,0.856285,0.844961,0.846382,"{0: 51, 1: 24, 2: 54}"
2,0.5302,0.590942,0.883721,0.884948,0.883721,0.883406,"{0: 45, 1: 24, 2: 60}"
3,0.4733,0.643328,0.883721,0.887434,0.883721,0.883974,"{0: 47, 1: 24, 2: 58}"
4,0.3074,0.692748,0.883721,0.885469,0.883721,0.883973,"{0: 45, 1: 25, 2: 59}"
5,0.2824,0.721027,0.891473,0.894203,0.891473,0.891869,"{0: 46, 1: 25, 2: 58}"


Evaluation results for TUM/GottBERT_filtered_base_best with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.2548445463180542, 'eval_accuracy': 0.803921568627451, 'eval_precision': 0.8144595274007039, 'eval_recall': 0.803921568627451, 'eval_f1': 0.8063563119168004, 'eval_class_distribution': {0: 39, 1: 39, 2: 75}, 'eval_runtime': 2.2681, 'eval_samples_per_second': 67.457, 'eval_steps_per_second': 33.949, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.89      0.85        36
     Neutral       0.66      0.82      0.73        33
     Positiv       0.90      0.79      0.84        84

    accuracy                           0.82       153
   macro avg       0.79      0.83      0.81       153
weighted avg       0.83      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 41, 2: 73}
Negativ Precision Score: 0.8205128205128205
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8533333333333334

Neutral Precision Score: 0.6585365853658537
Neutral Recall Score: 0.

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9112,1.300491,0.728682,0.812171,0.728682,0.701999,"{0: 75, 1: 6, 2: 48}"
2,0.577,0.600182,0.868217,0.868397,0.868217,0.867761,"{0: 39, 1: 28, 2: 62}"
3,0.4619,0.64395,0.860465,0.858948,0.860465,0.858245,"{0: 42, 1: 23, 2: 64}"
4,0.3103,0.655676,0.860465,0.865701,0.860465,0.861973,"{0: 41, 1: 31, 2: 57}"
5,0.2481,0.619495,0.875969,0.891052,0.875969,0.878868,"{0: 46, 1: 32, 2: 51}"


Evaluation results for TUM/GottBERT_base_last with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.1922639608383179, 'eval_accuracy': 0.803921568627451, 'eval_precision': 0.828454172366621, 'eval_recall': 0.803921568627451, 'eval_f1': 0.80917534477906, 'eval_class_distribution': {0: 40, 1: 43, 2: 70}, 'eval_runtime': 2.2634, 'eval_samples_per_second': 67.596, 'eval_steps_per_second': 34.019, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.73      0.92      0.81        36
     Neutral       0.66      0.76      0.70        33
     Positiv       0.93      0.77      0.84        84

    accuracy                           0.80       153
   macro avg       0.77      0.82      0.79       153
weighted avg       0.82      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 38, 2: 70}
Negativ Precision Score: 0.7333333333333333
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8148148148148148

Neutral Precision Score: 0.6578947368421053
Neutral Recall Score: 0.757

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4786.27 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4567.65 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4419.04 examples/s]


Training results for distilbert/distilbert-base-german-cased with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8227,0.785686,0.790698,0.824226,0.790698,0.78805,"{0: 62, 1: 18, 2: 49}"
2,0.5838,0.760451,0.813953,0.832567,0.813953,0.815965,"{0: 54, 1: 24, 2: 51}"
3,0.5247,0.789023,0.844961,0.865685,0.844961,0.846614,"{0: 55, 1: 23, 2: 51}"
4,0.2884,0.823489,0.852713,0.870588,0.852713,0.854412,"{0: 53, 1: 26, 2: 50}"
5,0.295,0.829586,0.852713,0.870588,0.852713,0.854412,"{0: 53, 1: 26, 2: 50}"


Evaluation results for distilbert/distilbert-base-german-cased with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.183458685874939, 'eval_accuracy': 0.7647058823529411, 'eval_precision': 0.7875974685029482, 'eval_recall': 0.7647058823529411, 'eval_f1': 0.7681253712604027, 'eval_class_distribution': {0: 47, 1: 37, 2: 69}, 'eval_runtime': 1.2124, 'eval_samples_per_second': 126.192, 'eval_steps_per_second': 63.508, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.70      0.89      0.78        36
     Neutral       0.67      0.67      0.67        33
     Positiv       0.89      0.79      0.84        84

    accuracy                           0.78       153
   macro avg       0.75      0.78      0.76       153
weighted avg       0.80      0.78      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 33, 2: 74}
Negativ Precision Score: 0.6956521739130435
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7804878048780488

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score: 

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9215,0.730344,0.782946,0.801155,0.782946,0.783977,"{0: 55, 1: 22, 2: 52}"
2,0.5741,0.756621,0.806202,0.815845,0.806202,0.807423,"{0: 50, 1: 26, 2: 53}"
3,0.5157,1.075526,0.790698,0.798453,0.790698,0.782654,"{0: 43, 1: 16, 2: 70}"
4,0.2696,0.816916,0.821705,0.829546,0.821705,0.823608,"{0: 47, 1: 28, 2: 54}"
5,0.2591,0.81506,0.79845,0.806847,0.79845,0.801172,"{0: 43, 1: 31, 2: 55}"


Evaluation results for GerMedBERT/medbert-512 with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.0450433492660522, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.8112036512158607, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.7983848696225423, 'eval_class_distribution': {0: 44, 1: 38, 2: 71}, 'eval_runtime': 2.3457, 'eval_samples_per_second': 65.226, 'eval_steps_per_second': 32.826, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.89      0.81        36
     Neutral       0.72      0.79      0.75        33
     Positiv       0.89      0.79      0.84        84

    accuracy                           0.81       153
   macro avg       0.79      0.82      0.80       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 36, 2: 74}
Negativ Precision Score: 0.7441860465116279
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.810126582278481

Neutral Precision Score: 0.7222222222222222
Neutral Recall Score: 0

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8154,0.737219,0.860465,0.860171,0.860465,0.859571,"{0: 44, 1: 24, 2: 61}"
2,0.4918,0.724495,0.837209,0.844581,0.837209,0.83806,"{0: 49, 1: 23, 2: 57}"
3,0.4252,0.811122,0.860465,0.860171,0.860465,0.859571,"{0: 44, 1: 24, 2: 61}"
4,0.2195,0.880146,0.852713,0.854815,0.852713,0.853498,"{0: 44, 1: 27, 2: 58}"
5,0.1645,0.874246,0.860465,0.861382,0.860465,0.860856,"{0: 43, 1: 27, 2: 59}"


Evaluation results for deepset/gbert-base with 5 epochs and random seeds: 42, 42



{'eval_loss': 1.165304183959961, 'eval_accuracy': 0.8235294117647058, 'eval_precision': 0.8342706313294549, 'eval_recall': 0.8235294117647058, 'eval_f1': 0.826791010385189, 'eval_class_distribution': {0: 35, 1: 40, 2: 78}, 'eval_runtime': 2.3404, 'eval_samples_per_second': 65.373, 'eval_steps_per_second': 32.9, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.86      0.86        36
     Neutral       0.64      0.85      0.73        33
     Positiv       0.90      0.79      0.84        84

    accuracy                           0.82       153
   macro avg       0.80      0.83      0.81       153
weighted avg       0.84      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 44, 2: 73}
Negativ Precision Score: 0.8611111111111112
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8611111111111112

Neutral Precision Score: 0.6363636363636364
Neutral Recall Score: 0.84

In [6]:
# Fine-tune the sentiment checkpoint for 5 epochs. It is run in its own cell
# because it is commented out of the `models` list defined near the top.
# rn1/rn2 are echoed as "random seeds" in the output; presumably they control
# the data splits -- TODO confirm in functions.absa_model_train.
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=5,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 3941.40 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3668.61 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3685.36 examples/s]


Training results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8824,0.97153,0.821705,0.873684,0.821705,0.802826,"{0: 62, 1: 9, 2: 58}"
2,0.5375,0.553547,0.868217,0.870688,0.868217,0.8683,"{0: 46, 1: 24, 2: 59}"
3,0.452,0.770169,0.891473,0.891541,0.891473,0.890727,"{0: 44, 1: 24, 2: 61}"
4,0.2103,0.805933,0.883721,0.884319,0.883721,0.882582,"{0: 45, 1: 23, 2: 61}"
5,0.2026,0.936767,0.868217,0.871997,0.868217,0.869577,"{0: 43, 1: 29, 2: 57}"


Evaluation results for aari1995/German_Sentiment with 5 epochs and random seeds: 42, 42



{'eval_loss': 0.9783487915992737, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.85623111989894, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8561623054644532, 'eval_class_distribution': {0: 35, 1: 33, 2: 85}, 'eval_runtime': 7.1764, 'eval_samples_per_second': 21.32, 'eval_steps_per_second': 10.73, 'epoch': 5.0}
              precision    recall  f1-score   support

     Negativ       0.88      0.83      0.86        36
     Neutral       0.75      0.82      0.78        33
     Positiv       0.92      0.90      0.91        84

    accuracy                           0.87       153
   macro avg       0.85      0.85      0.85       153
weighted avg       0.87      0.87      0.87       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 36, 2: 83}
Negativ Precision Score: 0.8823529411764706
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8571428571428571

Neutral Precision Score: 0.75
Neutral Recall Score: 0.8181818181818182

In [6]:
# Fine-tune and evaluate every checkpoint listed in `models` for 6 epochs,
# with both seeds (rn1, rn2) held at 42.
for model in models:
    print(f"training and results for {model}:")
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=6,
    )
    print()  # blank line between the logs of consecutive checkpoints

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4019.04 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3754.61 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3973.55 examples/s]


Training results for google-bert/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8204,0.872701,0.767442,0.815032,0.767442,0.770378,"{0: 59, 1: 29, 2: 41}"
2,0.5644,1.06251,0.782946,0.792583,0.782946,0.785126,"{0: 48, 1: 28, 2: 53}"
3,0.4538,0.909535,0.79845,0.801485,0.79845,0.798551,"{0: 47, 1: 24, 2: 58}"
4,0.2656,1.008557,0.790698,0.796415,0.790698,0.792757,"{0: 44, 1: 29, 2: 56}"
5,0.2046,1.123555,0.79845,0.804113,0.79845,0.800255,"{0: 41, 1: 31, 2: 57}"
6,0.1268,1.211345,0.79845,0.802796,0.79845,0.800131,"{0: 43, 1: 29, 2: 57}"


Evaluation results for google-bert/bert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.4581663608551025, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.7889928698752229, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7811489098319495, 'eval_class_distribution': {0: 36, 1: 40, 2: 77}, 'eval_runtime': 2.3615, 'eval_samples_per_second': 64.79, 'eval_steps_per_second': 32.607, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.78      0.76        36
     Neutral       0.64      0.76      0.69        33
     Positiv       0.87      0.79      0.82        84

    accuracy                           0.78       153
   macro avg       0.75      0.77      0.76       153
weighted avg       0.79      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 39, 2: 76}
Negativ Precision Score: 0.7368421052631579
Negativ Recall Score: 0.7777777777777778
Negativ F1 Score: 0.7567567567567568

Neutral Precision Score: 0.6410256410256411
Neutral Recall Score: 0

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4108.65 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3911.47 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3813.71 examples/s]


Training results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8984,0.973359,0.751938,0.803143,0.751938,0.753058,"{0: 65, 1: 17, 2: 47}"
2,0.5993,0.923055,0.782946,0.795455,0.782946,0.786003,"{0: 47, 1: 30, 2: 52}"
3,0.5108,1.02908,0.79845,0.808,0.79845,0.799121,"{0: 51, 1: 23, 2: 55}"
4,0.3009,1.030535,0.79845,0.808141,0.79845,0.799109,"{0: 51, 1: 25, 2: 53}"
5,0.2569,0.873975,0.837209,0.837543,0.837209,0.836848,"{0: 39, 1: 28, 2: 62}"
6,0.1749,0.92876,0.821705,0.82121,0.821705,0.821392,"{0: 41, 1: 27, 2: 61}"


Evaluation results for dbmdz/bert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.2861956357955933, 'eval_accuracy': 0.7516339869281046, 'eval_precision': 0.771012172985136, 'eval_recall': 0.7516339869281046, 'eval_f1': 0.7528386518005895, 'eval_class_distribution': {0: 49, 1: 35, 2: 69}, 'eval_runtime': 2.3479, 'eval_samples_per_second': 65.164, 'eval_steps_per_second': 32.795, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.64      0.89      0.74        36
     Neutral       0.68      0.79      0.73        33
     Positiv       0.91      0.70      0.79        84

    accuracy                           0.76       153
   macro avg       0.74      0.79      0.76       153
weighted avg       0.80      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 50, 1: 38, 2: 65}
Negativ Precision Score: 0.64
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7441860465116279

Neutral Precision Score: 0.6842105263157895
Neutral Recall Score: 0.7878787878787

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3996.23 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3855.94 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3867.02 examples/s]


Training results for dbmdz/bert-base-german-uncased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8432,0.867722,0.775194,0.818401,0.775194,0.776189,"{0: 63, 1: 19, 2: 47}"
2,0.5072,0.73067,0.806202,0.807288,0.806202,0.80548,"{0: 44, 1: 23, 2: 62}"
3,0.527,0.769214,0.852713,0.854343,0.852713,0.852602,"{0: 46, 1: 25, 2: 58}"
4,0.2774,0.78422,0.852713,0.854735,0.852713,0.852079,"{0: 46, 1: 23, 2: 60}"
5,0.2561,0.759085,0.829457,0.828607,0.829457,0.828685,"{0: 42, 1: 25, 2: 62}"
6,0.1582,0.728507,0.860465,0.861078,0.860465,0.860522,"{0: 40, 1: 28, 2: 61}"


Evaluation results for dbmdz/bert-base-german-uncased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.3692032098770142, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7765781922525108, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.762696878732334, 'eval_class_distribution': {0: 41, 1: 40, 2: 72}, 'eval_runtime': 2.3325, 'eval_samples_per_second': 65.595, 'eval_steps_per_second': 33.012, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.89      0.83        36
     Neutral       0.61      0.82      0.70        33
     Positiv       0.94      0.76      0.84        84

    accuracy                           0.80       153
   macro avg       0.78      0.82      0.79       153
weighted avg       0.83      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 44, 2: 68}
Negativ Precision Score: 0.7804878048780488
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8311688311688312

Neutral Precision Score: 0.6136363636363636
Neutral Recall Score: 0

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4732.95 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4076.01 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4303.67 examples/s]


Training results for FacebookAI/xlm-roberta-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8739,0.552215,0.844961,0.84877,0.844961,0.843999,"{0: 49, 1: 23, 2: 57}"
2,0.7372,0.797765,0.821705,0.820183,0.821705,0.820159,"{0: 42, 1: 24, 2: 63}"
3,0.7377,0.773387,0.852713,0.856728,0.852713,0.851034,"{0: 48, 1: 21, 2: 60}"
4,0.4794,0.86789,0.844961,0.850083,0.844961,0.844866,"{0: 49, 1: 24, 2: 56}"
5,0.4557,0.9142,0.829457,0.832768,0.829457,0.82978,"{0: 47, 1: 25, 2: 57}"
6,0.3646,0.90034,0.837209,0.841932,0.837209,0.837653,"{0: 48, 1: 25, 2: 56}"


Evaluation results for FacebookAI/xlm-roberta-base with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.1698037385940552, 'eval_accuracy': 0.8235294117647058, 'eval_precision': 0.824281805745554, 'eval_recall': 0.8235294117647058, 'eval_f1': 0.8216931313021247, 'eval_class_distribution': {0: 43, 1: 30, 2: 80}, 'eval_runtime': 2.2676, 'eval_samples_per_second': 67.473, 'eval_steps_per_second': 33.957, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.81      0.94      0.87        36
     Neutral       0.69      0.67      0.68        33
     Positiv       0.90      0.85      0.87        84

    accuracy                           0.83       153
   macro avg       0.80      0.82      0.81       153
weighted avg       0.83      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 32, 2: 79}
Negativ Precision Score: 0.8095238095238095
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.8717948717948718

Neutral Precision Score: 0.6875
Neutral Recall Score: 0.66666666666

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4959.86 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4666.69 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4651.72 examples/s]


Training results for TUM/GottBERT_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8897,0.784115,0.813953,0.851153,0.813953,0.808713,"{0: 60, 1: 14, 2: 55}"
2,0.534,0.685484,0.860465,0.867948,0.860465,0.857423,"{0: 32, 1: 29, 2: 68}"
3,0.5193,0.664688,0.875969,0.878821,0.875969,0.875406,"{0: 47, 1: 23, 2: 59}"
4,0.3053,0.602223,0.883721,0.891543,0.883721,0.884078,"{0: 50, 1: 24, 2: 55}"
5,0.2669,0.559235,0.899225,0.900215,0.899225,0.899631,"{0: 42, 1: 28, 2: 59}"
6,0.2085,0.548349,0.875969,0.878873,0.875969,0.876931,"{0: 44, 1: 28, 2: 57}"


Evaluation results for TUM/GottBERT_base_best with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.1487834453582764, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.8580392156862745, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8467036625971144, 'eval_class_distribution': {0: 36, 1: 42, 2: 75}, 'eval_runtime': 2.2878, 'eval_samples_per_second': 66.878, 'eval_steps_per_second': 33.657, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.84      0.89      0.86        36
     Neutral       0.67      0.79      0.72        33
     Positiv       0.92      0.83      0.88        84

    accuracy                           0.84       153
   macro avg       0.81      0.84      0.82       153
weighted avg       0.85      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 39, 2: 76}
Negativ Precision Score: 0.8421052631578947
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8648648648648649

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4827.61 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4321.12 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4261.54 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.857,0.896841,0.775194,0.802215,0.775194,0.776722,"{0: 58, 1: 22, 2: 49}"
2,0.5292,0.846917,0.860465,0.861577,0.860465,0.858477,"{0: 38, 1: 24, 2: 67}"
3,0.4871,0.587638,0.875969,0.88245,0.875969,0.875628,"{0: 49, 1: 22, 2: 58}"
4,0.2992,0.639291,0.883721,0.884332,0.883721,0.883407,"{0: 45, 1: 25, 2: 59}"
5,0.3038,0.608101,0.899225,0.898668,0.899225,0.898855,"{0: 42, 1: 26, 2: 61}"
6,0.1992,0.632043,0.883721,0.883986,0.883721,0.883606,"{0: 44, 1: 26, 2: 59}"


Evaluation results for TUM/GottBERT_filtered_base_best with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.2891265153884888, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8136641700611742, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8110396752207357, 'eval_class_distribution': {0: 40, 1: 34, 2: 79}, 'eval_runtime': 2.2837, 'eval_samples_per_second': 66.996, 'eval_steps_per_second': 33.717, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.89      0.83        36
     Neutral       0.66      0.64      0.65        33
     Positiv       0.86      0.82      0.84        84

    accuracy                           0.80       153
   macro avg       0.77      0.78      0.77       153
weighted avg       0.80      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 32, 2: 80}
Negativ Precision Score: 0.7804878048780488
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8311688311688312

Neutral Precision Score: 0.65625
Neutral Recall Score: 0.636363636

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8874,0.931562,0.806202,0.830926,0.806202,0.800897,"{0: 57, 1: 15, 2: 57}"
2,0.5876,0.83382,0.852713,0.853088,0.852713,0.852658,"{0: 40, 1: 28, 2: 61}"
3,0.4702,0.826412,0.852713,0.870975,0.852713,0.847282,"{0: 54, 1: 16, 2: 59}"
4,0.2987,0.767191,0.868217,0.871894,0.868217,0.868089,"{0: 47, 1: 23, 2: 59}"
5,0.2497,0.768104,0.844961,0.845417,0.844961,0.842991,"{0: 46, 1: 22, 2: 61}"
6,0.2133,0.758869,0.860465,0.863426,0.860465,0.859381,"{0: 47, 1: 22, 2: 60}"


Evaluation results for TUM/GottBERT_base_last with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.1324610710144043, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8301974214343272, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.81928726927745, 'eval_class_distribution': {0: 40, 1: 40, 2: 73}, 'eval_runtime': 2.199, 'eval_samples_per_second': 69.578, 'eval_steps_per_second': 35.016, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.89      0.79        36
     Neutral       0.61      0.67      0.64        33
     Positiv       0.90      0.77      0.83        84

    accuracy                           0.78       153
   macro avg       0.74      0.78      0.75       153
weighted avg       0.79      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 36, 2: 72}
Negativ Precision Score: 0.7111111111111111
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7901234567901234

Neutral Precision Score: 0.6111111111111112
Neutral Recall Score: 0.6

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 5119.61 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4942.05 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4896.71 examples/s]


Training results for distilbert/distilbert-base-german-cased with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.7988,0.735539,0.790698,0.824226,0.790698,0.78805,"{0: 62, 1: 18, 2: 49}"
2,0.5613,0.695231,0.837209,0.859105,0.837209,0.836639,"{0: 56, 1: 19, 2: 54}"
3,0.505,0.677091,0.860465,0.876804,0.860465,0.86201,"{0: 52, 1: 27, 2: 50}"
4,0.2799,0.68771,0.860465,0.876804,0.860465,0.86201,"{0: 52, 1: 27, 2: 50}"
5,0.251,0.737269,0.875969,0.892047,0.875969,0.877154,"{0: 53, 1: 25, 2: 51}"
6,0.1775,0.810352,0.837209,0.858393,0.837209,0.839439,"{0: 52, 1: 29, 2: 48}"


Evaluation results for distilbert/distilbert-base-german-cased with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.2296180725097656, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7745098039215687, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.7599655246714071, 'eval_class_distribution': {0: 48, 1: 33, 2: 72}, 'eval_runtime': 1.2058, 'eval_samples_per_second': 126.886, 'eval_steps_per_second': 63.858, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.83      0.77        36
     Neutral       0.67      0.67      0.67        33
     Positiv       0.86      0.80      0.83        84

    accuracy                           0.78       153
   macro avg       0.75      0.77      0.75       153
weighted avg       0.78      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 33, 2: 78}
Negativ Precision Score: 0.7142857142857143
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7692307692307693

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score:

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8824,0.710477,0.767442,0.781672,0.767442,0.762594,"{0: 51, 1: 16, 2: 62}"
2,0.5445,1.052178,0.75969,0.758426,0.75969,0.755334,"{0: 36, 1: 24, 2: 69}"
3,0.499,0.989582,0.806202,0.812616,0.806202,0.803087,"{0: 39, 1: 20, 2: 70}"
4,0.2148,0.941466,0.813953,0.813576,0.813953,0.811673,"{0: 46, 1: 22, 2: 61}"
5,0.2331,1.158886,0.806202,0.805212,0.806202,0.805507,"{0: 41, 1: 26, 2: 62}"
6,0.1026,1.152936,0.813953,0.815823,0.813953,0.814685,"{0: 43, 1: 28, 2: 58}"


Evaluation results for GerMedBERT/medbert-512 with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.4708349704742432, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8020066889632107, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7854564483745569, 'eval_class_distribution': {0: 45, 1: 39, 2: 69}, 'eval_runtime': 2.3133, 'eval_samples_per_second': 66.139, 'eval_steps_per_second': 33.285, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.83      0.73        36
     Neutral       0.66      0.82      0.73        33
     Positiv       0.88      0.69      0.77        84

    accuracy                           0.75       153
   macro avg       0.73      0.78      0.74       153
weighted avg       0.78      0.75      0.75       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 41, 2: 66}
Negativ Precision Score: 0.6521739130434783
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7317073170731707

Neutral Precision Score: 0.6585365853658537
Neutral Recall Score: 

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8235,0.771304,0.837209,0.844654,0.837209,0.837041,"{0: 50, 1: 22, 2: 57}"
2,0.5481,0.605326,0.860465,0.862715,0.860465,0.860653,"{0: 46, 1: 25, 2: 58}"
3,0.441,0.742509,0.868217,0.870869,0.868217,0.867459,"{0: 48, 1: 24, 2: 57}"
4,0.1913,0.61062,0.906977,0.90967,0.906977,0.907683,"{0: 45, 1: 27, 2: 57}"
5,0.1594,0.971854,0.844961,0.849143,0.844961,0.845038,"{0: 48, 1: 24, 2: 57}"
6,0.0549,0.828755,0.868217,0.873176,0.868217,0.869614,"{0: 46, 1: 27, 2: 56}"


Evaluation results for deepset/gbert-base with 6 epochs and random seeds: 42, 42



{'eval_loss': 1.2386518716812134, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8307317934864563, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8205443519934269, 'eval_class_distribution': {0: 37, 1: 41, 2: 75}, 'eval_runtime': 2.3528, 'eval_samples_per_second': 65.03, 'eval_steps_per_second': 32.728, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.88      0.83      0.86        36
     Neutral       0.52      0.82      0.64        33
     Positiv       0.88      0.70      0.78        84

    accuracy                           0.76       153
   macro avg       0.76      0.78      0.76       153
weighted avg       0.80      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 34, 1: 52, 2: 67}
Negativ Precision Score: 0.8823529411764706
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8571428571428571

Neutral Precision Score: 0.5192307692307693
Neutral Recall Score: 0

In [7]:
# Fine-tune the sentiment checkpoint for 6 epochs. It is run in its own cell
# because it is commented out of the `models` list defined near the top.
# NOTE(review): in the recorded output of this cell the trainer's eval dict
# (eval_accuracy ~0.856) and the classification report printed right after it
# (accuracy 0.82) disagree, although both cover 153 test samples -- confirm
# inside absa_model that both are computed from the same predictions.
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=6,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 4081.77 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3944.60 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3886.86 examples/s]


Training results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9119,0.623253,0.883721,0.908965,0.883721,0.882207,"{0: 55, 1: 17, 2: 57}"
2,0.5104,0.740861,0.837209,0.848215,0.837209,0.839176,"{0: 36, 1: 34, 2: 59}"
3,0.4436,0.715509,0.868217,0.870328,0.868217,0.865894,"{0: 47, 1: 21, 2: 61}"
4,0.244,0.627744,0.906977,0.906579,0.906977,0.906446,"{0: 43, 1: 25, 2: 61}"
5,0.1854,0.68801,0.906977,0.908279,0.906977,0.906126,"{0: 45, 1: 23, 2: 61}"
6,0.0935,0.642033,0.922481,0.923643,0.922481,0.922088,"{0: 45, 1: 24, 2: 60}"


Evaluation results for aari1995/German_Sentiment with 6 epochs and random seeds: 42, 42



{'eval_loss': 0.9785120487213135, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.8614384165095456, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8579112116996729, 'eval_class_distribution': {0: 38, 1: 36, 2: 79}, 'eval_runtime': 7.2417, 'eval_samples_per_second': 21.128, 'eval_steps_per_second': 10.633, 'epoch': 6.0}
              precision    recall  f1-score   support

     Negativ       0.83      0.94      0.88        36
     Neutral       0.64      0.64      0.64        33
     Positiv       0.90      0.85      0.87        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.83      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 33, 2: 79}
Negativ Precision Score: 0.8292682926829268
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.8831168831168831

Neutral Precision Score: 0.6363636363636364
Neutral Recall Score: 

In [8]:
# 7-epoch benchmark: fine-tune and evaluate every checkpoint listed in `models`.
# Both seed arguments stay at 42 for every checkpoint, so the per-model reports
# printed below come from runs that differ only in the checkpoint.
for model in models:
    # Header line so each model's report can be located in the long cell output.
    print(f'training and results for {model}:')
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=7,
    )
    # Blank separator line between consecutive model reports.
    print()

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3882.96 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3655.08 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3714.57 examples/s]


Training results for google-bert/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8145,0.697868,0.79845,0.809823,0.79845,0.79917,"{0: 52, 1: 24, 2: 53}"
2,0.5348,0.726392,0.790698,0.80226,0.790698,0.792402,"{0: 51, 1: 23, 2: 55}"
3,0.4649,0.946711,0.806202,0.80801,0.806202,0.806233,"{0: 45, 1: 24, 2: 60}"
4,0.2509,0.837324,0.821705,0.822997,0.821705,0.82147,"{0: 45, 1: 24, 2: 60}"
5,0.2276,1.011722,0.806202,0.807971,0.806202,0.805601,"{0: 42, 1: 23, 2: 64}"
6,0.1641,1.01327,0.806202,0.807804,0.806202,0.806802,"{0: 43, 1: 28, 2: 58}"
7,0.1076,1.045586,0.813953,0.81953,0.813953,0.815163,"{0: 43, 1: 31, 2: 55}"


Evaluation results for google-bert/bert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.1440593004226685, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.7792698308820033, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.7730751061953108, 'eval_class_distribution': {0: 41, 1: 36, 2: 76}, 'eval_runtime': 2.3503, 'eval_samples_per_second': 65.097, 'eval_steps_per_second': 32.761, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.75      0.83      0.79        36
     Neutral       0.62      0.61      0.62        33
     Positiv       0.85      0.82      0.84        84

    accuracy                           0.78       153
   macro avg       0.74      0.75      0.75       153
weighted avg       0.78      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 32, 2: 81}
Negativ Precision Score: 0.75
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7894736842105263

Neutral Precision Score: 0.625
Neutral Recall Score: 0.6060606060606061
Neutral 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4073.52 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3954.61 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3856.08 examples/s]


Training results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9322,1.065827,0.75969,0.818431,0.75969,0.755759,"{0: 66, 1: 13, 2: 50}"
2,0.5933,1.049472,0.744186,0.741648,0.744186,0.739688,"{0: 39, 1: 22, 2: 68}"
3,0.5219,1.06833,0.821705,0.829688,0.821705,0.821732,"{0: 50, 1: 22, 2: 57}"
4,0.3322,0.881616,0.806202,0.823985,0.806202,0.809993,"{0: 51, 1: 27, 2: 51}"
5,0.3412,1.016977,0.821705,0.827939,0.821705,0.823577,"{0: 42, 1: 31, 2: 56}"
6,0.218,1.035653,0.844961,0.876655,0.844961,0.850565,"{0: 39, 1: 40, 2: 50}"
7,0.1725,1.03513,0.837209,0.8623,0.837209,0.842079,"{0: 40, 1: 38, 2: 51}"


Evaluation results for dbmdz/bert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.4055098295211792, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8106868478385506, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7901270254211431, 'eval_class_distribution': {0: 38, 1: 45, 2: 70}, 'eval_runtime': 2.3292, 'eval_samples_per_second': 65.686, 'eval_steps_per_second': 33.058, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.76      0.86      0.81        36
     Neutral       0.57      0.79      0.66        33
     Positiv       0.94      0.74      0.83        84

    accuracy                           0.78       153
   macro avg       0.75      0.80      0.76       153
weighted avg       0.82      0.78      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 46, 2: 66}
Negativ Precision Score: 0.7560975609756098
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8051948051948052

Neutral Precision Score: 0.5652173913043478
Neutral Recall Score: 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3896.67 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3781.69 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3702.71 examples/s]


Training results for dbmdz/bert-base-german-uncased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.84,0.790999,0.821705,0.843522,0.821705,0.820305,"{0: 57, 1: 19, 2: 53}"
2,0.5828,1.017033,0.744186,0.764707,0.744186,0.747105,"{0: 37, 1: 39, 2: 53}"
3,0.497,0.689613,0.852713,0.851761,0.852713,0.851771,"{0: 40, 1: 26, 2: 63}"
4,0.3013,0.757878,0.829457,0.839343,0.829457,0.832214,"{0: 39, 1: 33, 2: 57}"
5,0.2912,0.818664,0.829457,0.827855,0.829457,0.827817,"{0: 39, 1: 26, 2: 64}"
6,0.1781,0.936172,0.829457,0.839243,0.829457,0.832737,"{0: 41, 1: 32, 2: 56}"
7,0.1158,0.887976,0.860465,0.865547,0.860465,0.862148,"{0: 44, 1: 29, 2: 56}"


Evaluation results for dbmdz/bert-base-german-uncased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.603320837020874, 'eval_accuracy': 0.7516339869281046, 'eval_precision': 0.785293336955741, 'eval_recall': 0.7516339869281046, 'eval_f1': 0.7564902099869795, 'eval_class_distribution': {0: 46, 1: 42, 2: 65}, 'eval_runtime': 2.3507, 'eval_samples_per_second': 65.086, 'eval_steps_per_second': 32.756, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.86      0.78        36
     Neutral       0.64      0.91      0.75        33
     Positiv       0.94      0.70      0.80        84

    accuracy                           0.78       153
   macro avg       0.77      0.82      0.78       153
weighted avg       0.82      0.78      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 47, 2: 63}
Negativ Precision Score: 0.7209302325581395
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7848101265822784

Neutral Precision Score: 0.6382978723404256
Neutral Recall Score: 0.

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4624.21 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4050.40 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4136.95 examples/s]


Training results for FacebookAI/xlm-roberta-base with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9216,0.708911,0.806202,0.812835,0.806202,0.805108,"{0: 50, 1: 21, 2: 58}"
2,0.7061,0.852544,0.837209,0.837209,0.837209,0.837209,"{0: 42, 1: 27, 2: 60}"
3,0.7415,1.164746,0.813953,0.819257,0.813953,0.814667,"{0: 48, 1: 24, 2: 57}"
4,0.5173,0.945621,0.852713,0.860207,0.852713,0.854514,"{0: 45, 1: 30, 2: 54}"
5,0.4757,0.966865,0.837209,0.837803,0.837209,0.837277,"{0: 44, 1: 26, 2: 59}"
6,0.3776,0.872053,0.860465,0.873088,0.860465,0.862747,"{0: 39, 1: 35, 2: 55}"
7,0.3379,0.859931,0.868217,0.878529,0.868217,0.870122,"{0: 40, 1: 34, 2: 55}"


Evaluation results for FacebookAI/xlm-roberta-base with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.2322006225585938, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8249450602391779, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8192717086834734, 'eval_class_distribution': {0: 39, 1: 37, 2: 77}, 'eval_runtime': 2.2811, 'eval_samples_per_second': 67.073, 'eval_steps_per_second': 33.756, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.89      0.85        36
     Neutral       0.60      0.79      0.68        33
     Positiv       0.93      0.79      0.85        84

    accuracy                           0.81       153
   macro avg       0.78      0.82      0.80       153
weighted avg       0.83      0.81      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 43, 2: 71}
Negativ Precision Score: 0.8205128205128205
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8533333333333334

Neutral Precision Score: 0.6046511627906976
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4817.47 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4396.97 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4361.94 examples/s]


Training results for TUM/GottBERT_base_best with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8641,0.691972,0.79845,0.851249,0.79845,0.788623,"{0: 62, 1: 11, 2: 56}"
2,0.5724,0.598921,0.860465,0.864491,0.860465,0.861585,"{0: 46, 1: 26, 2: 57}"
3,0.4763,0.626405,0.860465,0.862384,0.860465,0.856953,"{0: 44, 1: 20, 2: 65}"
4,0.3464,0.661666,0.868217,0.885219,0.868217,0.869064,"{0: 54, 1: 23, 2: 52}"
5,0.2603,0.741127,0.852713,0.850767,0.852713,0.849997,"{0: 41, 1: 23, 2: 65}"
6,0.2234,0.864755,0.860465,0.886164,0.860465,0.864099,"{0: 34, 1: 39, 2: 56}"
7,0.1659,0.770394,0.868217,0.878702,0.868217,0.87117,"{0: 39, 1: 33, 2: 57}"


Evaluation results for TUM/GottBERT_base_best with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.2199233770370483, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8222825540472599, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8135454433345665, 'eval_class_distribution': {0: 39, 1: 39, 2: 75}, 'eval_runtime': 2.3017, 'eval_samples_per_second': 66.474, 'eval_steps_per_second': 33.454, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.79      0.92      0.85        36
     Neutral       0.61      0.70      0.65        33
     Positiv       0.89      0.77      0.83        84

    accuracy                           0.79       153
   macro avg       0.76      0.80      0.77       153
weighted avg       0.80      0.79      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 38, 2: 73}
Negativ Precision Score: 0.7857142857142857
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8461538461538461

Neutral Precision Score: 0.6052631578947368
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4982.42 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4603.75 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4570.06 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8259,0.977278,0.767442,0.809019,0.767442,0.7698,"{0: 62, 1: 21, 2: 46}"
2,0.5613,0.682484,0.868217,0.868217,0.868217,0.868217,"{0: 42, 1: 27, 2: 60}"
3,0.4879,0.651089,0.891473,0.894169,0.891473,0.891106,"{0: 46, 1: 23, 2: 60}"
4,0.2865,0.699712,0.883721,0.890875,0.883721,0.884222,"{0: 49, 1: 23, 2: 57}"
5,0.2924,0.735913,0.891473,0.890789,0.891473,0.891039,"{0: 42, 1: 26, 2: 61}"
6,0.1965,0.970835,0.852713,0.864224,0.852713,0.855692,"{0: 42, 1: 33, 2: 54}"
7,0.1969,0.85101,0.852713,0.860465,0.852713,0.854655,"{0: 45, 1: 30, 2: 54}"


Evaluation results for TUM/GottBERT_filtered_base_best with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.1252208948135376, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.7895209365797601, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7856395013763637, 'eval_class_distribution': {0: 40, 1: 35, 2: 78}, 'eval_runtime': 2.2783, 'eval_samples_per_second': 67.156, 'eval_steps_per_second': 33.797, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.82      0.89      0.85        36
     Neutral       0.64      0.70      0.67        33
     Positiv       0.87      0.81      0.84        84

    accuracy                           0.80       153
   macro avg       0.78      0.80      0.79       153
weighted avg       0.81      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 36, 2: 78}
Negativ Precision Score: 0.8205128205128205
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8533333333333334

Neutral Precision Score: 0.6388888888888888
Neutral Recall Score: 

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.843,0.706404,0.813953,0.837502,0.813953,0.812199,"{0: 56, 1: 17, 2: 56}"
2,0.5584,0.677789,0.852713,0.850767,0.852713,0.849997,"{0: 41, 1: 23, 2: 65}"
3,0.4896,0.764616,0.852713,0.862729,0.852713,0.850912,"{0: 52, 1: 20, 2: 57}"
4,0.2893,0.862519,0.844961,0.852522,0.844961,0.844393,"{0: 51, 1: 23, 2: 55}"
5,0.2633,0.980617,0.829457,0.826867,0.829457,0.827835,"{0: 42, 1: 25, 2: 62}"
6,0.1973,0.925912,0.829457,0.83541,0.829457,0.831433,"{0: 40, 1: 31, 2: 58}"
7,0.1585,0.996599,0.829457,0.832803,0.829457,0.830665,"{0: 44, 1: 28, 2: 57}"


Evaluation results for TUM/GottBERT_base_last with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.9821602702140808, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8400129282482223, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8369542978296697, 'eval_class_distribution': {0: 40, 1: 35, 2: 78}, 'eval_runtime': 2.2597, 'eval_samples_per_second': 67.708, 'eval_steps_per_second': 34.075, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.77      0.94      0.85        36
     Neutral       0.71      0.67      0.69        33
     Positiv       0.88      0.82      0.85        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.82      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 44, 1: 31, 2: 78}
Negativ Precision Score: 0.7727272727272727
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.85

Neutral Precision Score: 0.7096774193548387
Neutral Recall Score: 0.666666666666

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 5032.51 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4848.47 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4793.49 examples/s]


Training results for distilbert/distilbert-base-german-cased with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8224,0.630164,0.79845,0.813384,0.79845,0.799927,"{0: 53, 1: 24, 2: 52}"
2,0.5452,0.680577,0.821705,0.848066,0.821705,0.824731,"{0: 56, 1: 24, 2: 49}"
3,0.515,0.618437,0.829457,0.84125,0.829457,0.831894,"{0: 48, 1: 29, 2: 52}"
4,0.3069,0.636362,0.852713,0.873897,0.852713,0.854722,"{0: 54, 1: 26, 2: 49}"
5,0.284,0.665883,0.852713,0.873897,0.852713,0.854722,"{0: 54, 1: 26, 2: 49}"
6,0.198,0.76194,0.829457,0.853099,0.829457,0.831978,"{0: 51, 1: 31, 2: 47}"
7,0.1472,0.753789,0.860465,0.87964,0.860465,0.862112,"{0: 53, 1: 27, 2: 49}"


Evaluation results for distilbert/distilbert-base-german-cased with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.47126305103302, 'eval_accuracy': 0.738562091503268, 'eval_precision': 0.7675507551163472, 'eval_recall': 0.738562091503268, 'eval_f1': 0.743681635088792, 'eval_class_distribution': {0: 43, 1: 43, 2: 67}, 'eval_runtime': 1.2437, 'eval_samples_per_second': 123.017, 'eval_steps_per_second': 61.911, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.63      0.89      0.74        36
     Neutral       0.61      0.70      0.65        33
     Positiv       0.89      0.68      0.77        84

    accuracy                           0.73       153
   macro avg       0.71      0.75      0.72       153
weighted avg       0.77      0.73      0.74       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 51, 1: 38, 2: 64}
Negativ Precision Score: 0.6274509803921569
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.735632183908046

Neutral Precision Score: 0.6052631578947368
Neutral Recall Score: 0.696

Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3563.20 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3393.66 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3411.87 examples/s]


Training results for GerMedBERT/medbert-512 with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8724,0.705863,0.790698,0.816109,0.790698,0.789007,"{0: 57, 1: 17, 2: 55}"
2,0.5456,0.876974,0.782946,0.790073,0.782946,0.785352,"{0: 42, 1: 31, 2: 56}"
3,0.4447,1.099862,0.79845,0.808525,0.79845,0.793985,"{0: 49, 1: 17, 2: 63}"
4,0.2279,0.901028,0.821705,0.833144,0.821705,0.822166,"{0: 52, 1: 22, 2: 55}"
5,0.2156,1.028583,0.821705,0.82252,0.821705,0.821889,"{0: 44, 1: 26, 2: 59}"
6,0.1471,0.99627,0.829457,0.831414,0.829457,0.830229,"{0: 43, 1: 28, 2: 58}"
7,0.1143,1.05112,0.837209,0.838043,0.837209,0.837561,"{0: 43, 1: 27, 2: 59}"


Evaluation results for GerMedBERT/medbert-512 with 7 epochs and random seeds: 42, 42



{'eval_loss': 1.371387243270874, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8091062902702453, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7925936093439869, 'eval_class_distribution': {0: 47, 1: 36, 2: 70}, 'eval_runtime': 2.355, 'eval_samples_per_second': 64.967, 'eval_steps_per_second': 32.696, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.89      0.75        36
     Neutral       0.62      0.79      0.69        33
     Positiv       0.95      0.70      0.81        84

    accuracy                           0.76       153
   macro avg       0.74      0.79      0.75       153
weighted avg       0.81      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 49, 1: 42, 2: 62}
Negativ Precision Score: 0.6530612244897959
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7529411764705882

Neutral Precision Score: 0.6190476190476191
Neutral Recall Score: 0.

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.843,0.858787,0.844961,0.859774,0.844961,0.839021,"{0: 52, 1: 16, 2: 61}"
2,0.5213,0.723457,0.860465,0.861265,0.860465,0.860261,"{0: 39, 1: 29, 2: 61}"
3,0.4875,1.070793,0.829457,0.827501,0.829457,0.8282,"{0: 43, 1: 25, 2: 61}"
4,0.2263,1.028618,0.821705,0.82739,0.821705,0.823698,"{0: 45, 1: 28, 2: 56}"
5,0.2074,1.090236,0.837209,0.838175,0.837209,0.837626,"{0: 43, 1: 27, 2: 59}"
6,0.1284,1.158467,0.829457,0.8332,0.829457,0.831015,"{0: 42, 1: 29, 2: 58}"
7,0.0754,1.181532,0.829457,0.831015,0.829457,0.830156,"{0: 42, 1: 28, 2: 59}"


Evaluation results for deepset/gbert-base with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.7281306982040405, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8394884269497273, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8374766867466673, 'eval_class_distribution': {0: 38, 1: 35, 2: 80}, 'eval_runtime': 2.3351, 'eval_samples_per_second': 65.522, 'eval_steps_per_second': 32.975, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.86      0.86        36
     Neutral       0.61      0.67      0.64        33
     Positiv       0.88      0.85      0.86        84

    accuracy                           0.81       153
   macro avg       0.78      0.79      0.79       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 36, 2: 81}
Negativ Precision Score: 0.8611111111111112
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8611111111111112

Neutral Precision Score: 0.6111111111111112
Neutral Recall Score: 

In [5]:
# 7-epoch fine-tuning run for the German Sentiment BERT checkpoint.
# It is not part of the `models` list (commented out there), hence the separate
# call; seed values (42, 42) match the other runs in this notebook.
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=7,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 2305.08 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3736.35 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3869.40 examples/s]


Training results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9038,0.656261,0.883721,0.884589,0.883721,0.884086,"{0: 43, 1: 27, 2: 59}"
2,0.5465,0.574743,0.868217,0.885456,0.868217,0.870272,"{0: 53, 1: 22, 2: 54}"
3,0.5327,0.616116,0.883721,0.884141,0.883721,0.883155,"{0: 44, 1: 24, 2: 61}"
4,0.3257,0.765709,0.875969,0.885395,0.875969,0.873925,"{0: 49, 1: 19, 2: 61}"
5,0.2288,0.692132,0.899225,0.901921,0.899225,0.898858,"{0: 46, 1: 23, 2: 60}"
6,0.1859,0.842588,0.868217,0.872111,0.868217,0.869267,"{0: 45, 1: 28, 2: 56}"
7,0.0937,0.867013,0.883721,0.885831,0.883721,0.883819,"{0: 46, 1: 25, 2: 58}"


Evaluation results for aari1995/German_Sentiment with 7 epochs and random seeds: 42, 42



{'eval_loss': 0.8971391916275024, 'eval_accuracy': 0.8627450980392157, 'eval_precision': 0.8630566438582937, 'eval_recall': 0.8627450980392157, 'eval_f1': 0.8624587035926511, 'eval_class_distribution': {0: 39, 1: 32, 2: 82}, 'eval_runtime': 7.2966, 'eval_samples_per_second': 20.969, 'eval_steps_per_second': 10.553, 'epoch': 7.0}
              precision    recall  f1-score   support

     Negativ       0.85      0.92      0.88        36
     Neutral       0.78      0.76      0.77        33
     Positiv       0.93      0.90      0.92        84

    accuracy                           0.88       153
   macro avg       0.85      0.86      0.85       153
weighted avg       0.88      0.88      0.88       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 32, 2: 82}
Negativ Precision Score: 0.8461538461538461
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.88

Neutral Precision Score: 0.78125
Neutral Recall Score: 0.7575757575757576
Neutra

In [6]:
# 8-epoch benchmark: same sweep over the checkpoints in `models` as the 7-epoch
# cell, with only the epoch count changed. Seed arguments remain 42 / 42.
for model in models:
    # Announce which checkpoint the following report belongs to.
    print(f'training and results for {model}:')
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=8,
    )
    # Empty line to visually separate this report from the next one.
    print()

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4057.23 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3889.11 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3951.68 examples/s]


Training results for google-bert/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.822,0.81136,0.79845,0.821921,0.79845,0.799298,"{0: 57, 1: 22, 2: 50}"
2,0.5337,0.834208,0.806202,0.833212,0.806202,0.810542,"{0: 51, 1: 31, 2: 47}"
3,0.4618,1.005032,0.829457,0.829887,0.829457,0.829574,"{0: 43, 1: 26, 2: 60}"
4,0.2525,0.839598,0.821705,0.827535,0.821705,0.823265,"{0: 46, 1: 28, 2: 55}"
5,0.2243,1.110616,0.813953,0.813906,0.813953,0.812606,"{0: 38, 1: 26, 2: 65}"
6,0.1859,1.162672,0.79845,0.805939,0.79845,0.800943,"{0: 45, 1: 29, 2: 55}"
7,0.1198,1.208037,0.806202,0.813399,0.806202,0.808247,"{0: 47, 1: 27, 2: 55}"
8,0.0714,1.259307,0.79845,0.802568,0.79845,0.799685,"{0: 46, 1: 26, 2: 57}"


Evaluation results for google-bert/bert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1044646501541138, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.7872730315454775, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7804068996019461, 'eval_class_distribution': {0: 40, 1: 37, 2: 76}, 'eval_runtime': 2.3838, 'eval_samples_per_second': 64.182, 'eval_steps_per_second': 32.301, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.81      0.76        36
     Neutral       0.65      0.61      0.62        33
     Positiv       0.84      0.82      0.83        84

    accuracy                           0.77       153
   macro avg       0.74      0.74      0.74       153
weighted avg       0.77      0.77      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 31, 2: 82}
Negativ Precision Score: 0.725
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7631578947368421

Neutral Precision Score: 0.6451612903225806
Neutral Recall Score: 0.60606060606

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3971.39 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3850.70 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3878.05 examples/s]


Training results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8826,0.738482,0.821705,0.84344,0.821705,0.822154,"{0: 56, 1: 20, 2: 53}"
2,0.576,0.610201,0.844961,0.867486,0.844961,0.848288,"{0: 53, 1: 27, 2: 49}"
3,0.5193,1.100045,0.821705,0.835822,0.821705,0.8163,"{0: 50, 1: 16, 2: 63}"
4,0.2655,0.770221,0.852713,0.85357,0.852713,0.852912,"{0: 44, 1: 26, 2: 59}"
5,0.2601,0.874548,0.852713,0.854126,0.852713,0.852424,"{0: 38, 1: 29, 2: 62}"
6,0.1877,0.893036,0.868217,0.874928,0.868217,0.869298,"{0: 37, 1: 32, 2: 60}"
7,0.1168,0.950542,0.852713,0.85203,0.852713,0.851652,"{0: 43, 1: 24, 2: 62}"
8,0.0754,0.967914,0.860465,0.859173,0.860465,0.858997,"{0: 42, 1: 24, 2: 63}"


Evaluation results for dbmdz/bert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.441780686378479, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.8038680951530018, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.7793616211916865, 'eval_class_distribution': {0: 34, 1: 48, 2: 71}, 'eval_runtime': 2.4169, 'eval_samples_per_second': 63.305, 'eval_steps_per_second': 31.86, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.83      0.83      0.83        36
     Neutral       0.57      0.82      0.68        33
     Positiv       0.90      0.75      0.82        84

    accuracy                           0.78       153
   macro avg       0.77      0.80      0.78       153
weighted avg       0.81      0.78      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 47, 2: 70}
Negativ Precision Score: 0.8333333333333334
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8333333333333334

Neutral Precision Score: 0.574468085106383
Neutral Recall Score: 0.8

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3750.27 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3712.72 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3834.33 examples/s]


Training results for dbmdz/bert-base-german-uncased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8477,0.719345,0.844961,0.858727,0.844961,0.844923,"{0: 53, 1: 21, 2: 55}"
2,0.5418,0.730675,0.860465,0.867571,0.860465,0.861635,"{0: 48, 1: 27, 2: 54}"
3,0.516,0.845765,0.837209,0.837209,0.837209,0.837209,"{0: 42, 1: 27, 2: 60}"
4,0.2661,0.893135,0.829457,0.841489,0.829457,0.829089,"{0: 53, 1: 22, 2: 54}"
5,0.2031,0.900852,0.860465,0.86107,0.860465,0.8607,"{0: 43, 1: 27, 2: 59}"
6,0.1608,0.933761,0.852713,0.857242,0.852713,0.854129,"{0: 44, 1: 29, 2: 56}"
7,0.0939,0.891322,0.860465,0.864385,0.860465,0.861815,"{0: 41, 1: 30, 2: 58}"
8,0.0503,0.976547,0.860465,0.862474,0.860465,0.861189,"{0: 41, 1: 29, 2: 59}"


Evaluation results for dbmdz/bert-base-german-uncased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.917163372039795, 'eval_accuracy': 0.7254901960784313, 'eval_precision': 0.7579185520361991, 'eval_recall': 0.7254901960784313, 'eval_f1': 0.7299436093275224, 'eval_class_distribution': {0: 48, 1: 40, 2: 65}, 'eval_runtime': 2.3712, 'eval_samples_per_second': 64.523, 'eval_steps_per_second': 32.472, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.63      0.86      0.73        36
     Neutral       0.61      0.85      0.71        33
     Positiv       0.93      0.64      0.76        84

    accuracy                           0.74       153
   macro avg       0.72      0.78      0.73       153
weighted avg       0.79      0.74      0.74       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 49, 1: 46, 2: 58}
Negativ Precision Score: 0.6326530612244898
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7294117647058823

Neutral Precision Score: 0.6086956521739131
Neutral Recall Score: 0

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4794.04 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4214.03 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4260.38 examples/s]


Training results for FacebookAI/xlm-roberta-base with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.96,0.733855,0.790698,0.796138,0.790698,0.792435,"{0: 46, 1: 27, 2: 56}"
2,0.8546,1.031593,0.813953,0.814601,0.813953,0.811145,"{0: 37, 1: 24, 2: 68}"
3,0.7837,1.017649,0.813953,0.818571,0.813953,0.812347,"{0: 49, 1: 21, 2: 59}"
4,0.5643,1.389266,0.767442,0.777905,0.767442,0.769491,"{0: 48, 1: 29, 2: 52}"
5,0.5145,1.104014,0.813953,0.815302,0.813953,0.813136,"{0: 46, 1: 23, 2: 60}"
6,0.4392,1.173627,0.829457,0.833317,0.829457,0.830879,"{0: 43, 1: 29, 2: 57}"
7,0.3681,1.039371,0.844961,0.84798,0.844961,0.845873,"{0: 45, 1: 27, 2: 57}"
8,0.3354,1.040194,0.844961,0.850306,0.844961,0.846928,"{0: 42, 1: 30, 2: 57}"


Evaluation results for FacebookAI/xlm-roberta-base with 8 epochs and random seeds: 42, 42



{'eval_loss': 0.9596351981163025, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8603635786298326, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8525172231054584, 'eval_class_distribution': {0: 38, 1: 39, 2: 76}, 'eval_runtime': 2.2807, 'eval_samples_per_second': 67.085, 'eval_steps_per_second': 33.762, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.89      0.81        36
     Neutral       0.68      0.70      0.69        33
     Positiv       0.91      0.82      0.86        84

    accuracy                           0.81       153
   macro avg       0.78      0.80      0.79       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 34, 2: 76}
Negativ Precision Score: 0.7441860465116279
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.810126582278481

Neutral Precision Score: 0.6764705882352942
Neutral Recall Score: 0

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4288.97 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4363.15 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4253.69 examples/s]


Training results for TUM/GottBERT_base_best with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8794,0.760487,0.829457,0.860506,0.829457,0.827117,"{0: 58, 1: 16, 2: 55}"
2,0.5066,0.809799,0.829457,0.8468,0.829457,0.834453,"{0: 39, 1: 35, 2: 55}"
3,0.5362,0.629004,0.883721,0.885124,0.883721,0.881197,"{0: 45, 1: 21, 2: 63}"
4,0.318,0.536448,0.860465,0.865477,0.860465,0.85883,"{0: 49, 1: 21, 2: 59}"
5,0.2715,0.895684,0.844961,0.847129,0.844961,0.839765,"{0: 39, 1: 20, 2: 70}"
6,0.2319,0.761999,0.860465,0.875031,0.860465,0.863805,"{0: 38, 1: 35, 2: 56}"
7,0.193,0.719269,0.868217,0.86927,0.868217,0.868189,"{0: 39, 1: 28, 2: 62}"
8,0.1707,0.866967,0.837209,0.840784,0.837209,0.837464,"{0: 37, 1: 29, 2: 63}"


Evaluation results for TUM/GottBERT_base_best with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.0912635326385498, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.8099001643890573, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.7985068034526942, 'eval_class_distribution': {0: 46, 1: 34, 2: 73}, 'eval_runtime': 2.3132, 'eval_samples_per_second': 66.141, 'eval_steps_per_second': 33.287, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.72      0.92      0.80        36
     Neutral       0.71      0.76      0.74        33
     Positiv       0.90      0.77      0.83        84

    accuracy                           0.80       153
   macro avg       0.78      0.82      0.79       153
weighted avg       0.82      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 35, 2: 72}
Negativ Precision Score: 0.717391304347826
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8048780487804879

Neutral Precision Score: 0.7142857142857143
Neutral Recall Score: 0

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4949.39 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4518.52 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3826.62 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.7952,0.688211,0.852713,0.861607,0.852713,0.853744,"{0: 50, 1: 24, 2: 55}"
2,0.5181,0.684829,0.883721,0.884455,0.883721,0.883463,"{0: 39, 1: 29, 2: 61}"
3,0.4586,0.584417,0.906977,0.907802,0.906977,0.907138,"{0: 44, 1: 26, 2: 59}"
4,0.2969,0.722327,0.875969,0.878879,0.875969,0.876812,"{0: 45, 1: 27, 2: 57}"
5,0.3046,0.705719,0.883721,0.883721,0.883721,0.883721,"{0: 42, 1: 27, 2: 60}"
6,0.2432,0.817558,0.875969,0.880171,0.875969,0.877319,"{0: 42, 1: 30, 2: 57}"
7,0.2147,0.722536,0.875969,0.876307,0.875969,0.87574,"{0: 44, 1: 25, 2: 60}"
8,0.1656,0.813668,0.875969,0.879215,0.875969,0.877052,"{0: 43, 1: 29, 2: 57}"


Evaluation results for TUM/GottBERT_filtered_base_best with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1530835628509521, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8235219586726042, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8184065069878218, 'eval_class_distribution': {0: 41, 1: 35, 2: 77}, 'eval_runtime': 2.2839, 'eval_samples_per_second': 66.989, 'eval_steps_per_second': 33.714, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.84      0.89      0.86        36
     Neutral       0.69      0.76      0.72        33
     Positiv       0.91      0.86      0.88        84

    accuracy                           0.84       153
   macro avg       0.82      0.83      0.82       153
weighted avg       0.85      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 36, 2: 79}
Negativ Precision Score: 0.8421052631578947
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8648648648648649

Neutral Precision Score: 0.6944444444444444
Neutral Recall Score: 

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9086,0.874375,0.79845,0.824793,0.79845,0.787943,"{0: 59, 1: 13, 2: 57}"
2,0.5697,0.81001,0.837209,0.846748,0.837209,0.839653,"{0: 38, 1: 33, 2: 58}"
3,0.5191,1.082653,0.813953,0.833942,0.813953,0.811238,"{0: 55, 1: 17, 2: 57}"
4,0.3103,0.748966,0.829457,0.828248,0.829457,0.828562,"{0: 43, 1: 25, 2: 61}"
5,0.2983,0.761917,0.852713,0.851687,0.852713,0.852115,"{0: 42, 1: 26, 2: 61}"
6,0.1917,0.913061,0.837209,0.84654,0.837209,0.840317,"{0: 43, 1: 31, 2: 55}"
7,0.1904,0.864859,0.868217,0.873853,0.868217,0.870167,"{0: 44, 1: 29, 2: 56}"
8,0.0911,0.95574,0.860465,0.867596,0.860465,0.863049,"{0: 43, 1: 30, 2: 56}"


Evaluation results for TUM/GottBERT_base_last with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.6601247787475586, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8048815770439631, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7886849318108682, 'eval_class_distribution': {0: 40, 1: 42, 2: 71}, 'eval_runtime': 2.3058, 'eval_samples_per_second': 66.355, 'eval_steps_per_second': 33.394, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.89      0.79        36
     Neutral       0.63      0.79      0.70        33
     Positiv       0.94      0.75      0.83        84

    accuracy                           0.79       153
   macro avg       0.76      0.81      0.78       153
weighted avg       0.82      0.79      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 41, 2: 67}
Negativ Precision Score: 0.7111111111111111
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.7901234567901234

Neutral Precision Score: 0.6341463414634146
Neutral Recall Score: 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4924.44 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4637.73 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4601.06 examples/s]


Training results for distilbert/distilbert-base-german-cased with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8141,0.742718,0.790698,0.832345,0.790698,0.790678,"{0: 63, 1: 19, 2: 47}"
2,0.568,0.763324,0.813953,0.835625,0.813953,0.816436,"{0: 55, 1: 23, 2: 51}"
3,0.5055,0.599447,0.875969,0.887109,0.875969,0.877011,"{0: 51, 1: 25, 2: 53}"
4,0.2837,0.762575,0.860465,0.878638,0.860465,0.862311,"{0: 53, 1: 26, 2: 50}"
5,0.2962,0.776296,0.844961,0.853919,0.844961,0.846007,"{0: 50, 1: 25, 2: 54}"
6,0.1886,0.807323,0.852713,0.862033,0.852713,0.853939,"{0: 47, 1: 30, 2: 52}"
7,0.1321,0.850481,0.860465,0.876784,0.860465,0.861919,"{0: 53, 1: 25, 2: 51}"
8,0.0919,0.851439,0.860465,0.870176,0.860465,0.861555,"{0: 49, 1: 28, 2: 52}"


Evaluation results for distilbert/distilbert-base-german-cased with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.0098521709442139, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8173789684729426, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.816770793581566, 'eval_class_distribution': {0: 39, 1: 32, 2: 82}, 'eval_runtime': 1.2884, 'eval_samples_per_second': 118.75, 'eval_steps_per_second': 59.763, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.81      0.81      0.81        36
     Neutral       0.68      0.58      0.62        33
     Positiv       0.83      0.88      0.86        84

    accuracy                           0.80       153
   macro avg       0.77      0.75      0.76       153
weighted avg       0.79      0.80      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 28, 2: 89}
Negativ Precision Score: 0.8055555555555556
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8055555555555556

Neutral Precision Score: 0.6785714285714286
Neutral Recall Score: 0

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9033,0.842962,0.775194,0.780869,0.775194,0.774612,"{0: 48, 1: 21, 2: 60}"
2,0.5579,1.020309,0.75969,0.768518,0.75969,0.757392,"{0: 32, 1: 35, 2: 62}"
3,0.5143,1.116372,0.806202,0.805258,0.806202,0.805649,"{0: 42, 1: 26, 2: 61}"
4,0.2535,1.255738,0.767442,0.778941,0.767442,0.771011,"{0: 41, 1: 33, 2: 55}"
5,0.2594,1.263882,0.790698,0.79231,0.790698,0.791307,"{0: 43, 1: 28, 2: 58}"
6,0.1881,1.130223,0.821705,0.824743,0.821705,0.822403,"{0: 44, 1: 29, 2: 56}"
7,0.1556,1.288304,0.813953,0.816409,0.813953,0.814687,"{0: 43, 1: 29, 2: 57}"
8,0.0836,1.259637,0.806202,0.807234,0.806202,0.80664,"{0: 42, 1: 28, 2: 59}"


Evaluation results for GerMedBERT/medbert-512 with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1535640954971313, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.7928470656741788, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7796503130084663, 'eval_class_distribution': {0: 43, 1: 39, 2: 71}, 'eval_runtime': 2.3526, 'eval_samples_per_second': 65.035, 'eval_steps_per_second': 32.73, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.81      0.77        36
     Neutral       0.70      0.85      0.77        33
     Positiv       0.88      0.77      0.82        84

    accuracy                           0.80       153
   macro avg       0.77      0.81      0.79       153
weighted avg       0.81      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 40, 2: 74}
Negativ Precision Score: 0.7435897435897436
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7733333333333333

Neutral Precision Score: 0.7
Neutral Recall Score: 0.84848484848484

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8528,0.885829,0.782946,0.822379,0.782946,0.774095,"{0: 60, 1: 12, 2: 57}"
2,0.5486,0.501229,0.860465,0.859302,0.860465,0.859042,"{0: 40, 1: 25, 2: 64}"
3,0.4376,0.862168,0.844961,0.846309,0.844961,0.844059,"{0: 46, 1: 23, 2: 60}"
4,0.2653,0.824371,0.837209,0.839398,0.837209,0.836834,"{0: 47, 1: 24, 2: 58}"
5,0.2004,1.082035,0.821705,0.826926,0.821705,0.820037,"{0: 50, 1: 21, 2: 58}"
6,0.1083,0.82694,0.860465,0.864175,0.860465,0.860949,"{0: 47, 1: 25, 2: 57}"
7,0.0885,0.996552,0.844961,0.855608,0.844961,0.845859,"{0: 51, 1: 25, 2: 53}"
8,0.0258,0.939416,0.860465,0.86603,0.860465,0.860322,"{0: 49, 1: 23, 2: 57}"


Evaluation results for deepset/gbert-base with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.394826889038086, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8034387738835371, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7871793565911214, 'eval_class_distribution': {0: 41, 1: 42, 2: 70}, 'eval_runtime': 2.3621, 'eval_samples_per_second': 64.773, 'eval_steps_per_second': 32.598, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.78      0.78        36
     Neutral       0.55      0.85      0.67        33
     Positiv       0.91      0.71      0.80        84

    accuracy                           0.76       153
   macro avg       0.75      0.78      0.75       153
weighted avg       0.80      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 51, 2: 66}
Negativ Precision Score: 0.7777777777777778
Negativ Recall Score: 0.7777777777777778
Negativ F1 Score: 0.7777777777777778

Neutral Precision Score: 0.5490196078431373
Neutral Recall Score: 0

In [7]:
# Fine-tune the sentiment-pretrained checkpoint "aari1995/German_Sentiment" separately
# (it is commented out of the `models` list) for 8 epochs. rn1/rn2 are the two random
# seeds echoed in the log header ("random seeds: 42, 42"), kept at 42 like the other runs.
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=8,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 3912.45 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3736.20 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3753.63 examples/s]


Training results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8976,0.819189,0.837209,0.868564,0.837209,0.823483,"{0: 60, 1: 12, 2: 57}"
2,0.5459,0.549485,0.891473,0.889679,0.891473,0.889848,"{0: 43, 1: 24, 2: 62}"
3,0.4719,0.827644,0.868217,0.868833,0.868217,0.86638,"{0: 45, 1: 22, 2: 62}"
4,0.2458,0.690967,0.891473,0.893065,0.891473,0.891744,"{0: 45, 1: 26, 2: 58}"
5,0.2218,0.732186,0.899225,0.899166,0.899225,0.898415,"{0: 44, 1: 24, 2: 61}"
6,0.0898,0.896576,0.875969,0.876773,0.875969,0.876278,"{0: 41, 1: 28, 2: 60}"
7,0.0453,0.849441,0.899225,0.901475,0.899225,0.89794,"{0: 46, 1: 22, 2: 61}"
8,0.025,0.877132,0.891473,0.895371,0.891473,0.890744,"{0: 47, 1: 22, 2: 60}"


Evaluation results for aari1995/German_Sentiment with 8 epochs and random seeds: 42, 42



{'eval_loss': 1.1299880743026733, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8480319973744039, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8485100417318726, 'eval_class_distribution': {0: 35, 1: 31, 2: 87}, 'eval_runtime': 7.2956, 'eval_samples_per_second': 20.972, 'eval_steps_per_second': 10.554, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.83      0.85        36
     Neutral       0.72      0.70      0.71        33
     Positiv       0.86      0.88      0.87        84

    accuracy                           0.83       153
   macro avg       0.81      0.80      0.81       153
weighted avg       0.83      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 32, 2: 86}
Negativ Precision Score: 0.8571428571428571
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.8450704225352113

Neutral Precision Score: 0.71875
Neutral Recall Score: 0.696969696

In [5]:
# Benchmark sweep: fine-tune every checkpoint listed in `models` for up to 10 epochs,
# always with the same two random seeds (42, 42) so the per-model reports are comparable.
# NOTE(review): the recorded output for google-bert/bert-base-german-cased ends at
# epoch 8 even though epochs=10 was requested -- presumably absa_model stops early or
# the output is stale; confirm before comparing against the other 10-epoch runs.
for model in models:
    # Header line so each model's block of output can be located in the long log.
    print(f'training and results for {model}:')
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=10,
    )
    # Blank separator line between consecutive model reports.
    print()

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 1912.56 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3270.72 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3352.46 examples/s]


Training results for google-bert/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8175,0.77086,0.813953,0.853291,0.813953,0.817559,"{0: 59, 1: 25, 2: 45}"
2,0.5239,0.848461,0.821705,0.834705,0.821705,0.822909,"{0: 52, 1: 25, 2: 52}"
3,0.4469,1.166039,0.806202,0.811522,0.806202,0.806556,"{0: 40, 1: 33, 2: 56}"
4,0.2456,1.113193,0.813953,0.823701,0.813953,0.81606,"{0: 48, 1: 28, 2: 53}"
5,0.2318,1.348512,0.79845,0.805795,0.79845,0.798501,"{0: 35, 1: 33, 2: 61}"
6,0.1126,1.359115,0.79845,0.800042,0.79845,0.798178,"{0: 38, 1: 30, 2: 61}"
7,0.0762,1.496658,0.79845,0.80257,0.79845,0.79982,"{0: 42, 1: 30, 2: 57}"
8,0.0371,1.507993,0.79845,0.799336,0.79845,0.798812,"{0: 42, 1: 28, 2: 59}"


Evaluation results for google-bert/bert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.1806288957595825, 'eval_accuracy': 0.7450980392156863, 'eval_precision': 0.7843137254901961, 'eval_recall': 0.7450980392156863, 'eval_f1': 0.7483619534770429, 'eval_class_distribution': {0: 54, 1: 36, 2: 63}, 'eval_runtime': 2.438, 'eval_samples_per_second': 62.756, 'eval_steps_per_second': 31.583, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.63      0.89      0.74        36
     Neutral       0.64      0.76      0.69        33
     Positiv       0.92      0.69      0.79        84

    accuracy                           0.75       153
   macro avg       0.73      0.78      0.74       153
weighted avg       0.79      0.75      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 51, 1: 39, 2: 63}
Negativ Precision Score: 0.6274509803921569
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.735632183908046

Neutral Precision Score: 0.6410256410256411
Neutral Recall Score: 0.

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3719.54 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3440.88 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3563.79 examples/s]


Training results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8833,1.014879,0.775194,0.827673,0.775194,0.776978,"{0: 64, 1: 16, 2: 49}"
2,0.5636,0.675081,0.837209,0.83631,0.837209,0.836675,"{0: 42, 1: 26, 2: 61}"
3,0.515,0.938221,0.837209,0.858515,0.837209,0.838281,"{0: 55, 1: 20, 2: 54}"
4,0.2815,0.872746,0.860465,0.85989,0.860465,0.859904,"{0: 40, 1: 27, 2: 62}"
5,0.2645,0.964401,0.829457,0.82746,0.829457,0.82745,"{0: 41, 1: 24, 2: 64}"
6,0.2104,1.042262,0.844961,0.850884,0.844961,0.846525,"{0: 38, 1: 31, 2: 60}"
7,0.1732,0.809677,0.875969,0.877643,0.875969,0.876589,"{0: 43, 1: 28, 2: 58}"
8,0.1016,0.967563,0.844961,0.844921,0.844961,0.844196,"{0: 44, 1: 24, 2: 61}"
9,0.042,1.047411,0.868217,0.874535,0.868217,0.869942,"{0: 46, 1: 28, 2: 55}"
10,0.0161,1.042477,0.860465,0.863313,0.860465,0.861409,"{0: 44, 1: 28, 2: 57}"


Evaluation results for dbmdz/bert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.5276323556900024, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8052272799427928, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.793962209473637, 'eval_class_distribution': {0: 41, 1: 39, 2: 73}, 'eval_runtime': 2.414, 'eval_samples_per_second': 63.38, 'eval_steps_per_second': 31.897, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.80      0.89      0.84        36
     Neutral       0.65      0.79      0.71        33
     Positiv       0.90      0.79      0.84        84

    accuracy                           0.81       153
   macro avg       0.78      0.82      0.80       153
weighted avg       0.82      0.81      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 40, 2: 73}
Negativ Precision Score: 0.8
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8421052631578947

Neutral Precision Score: 0.65
Neutral Recall Score: 0.7878787878787878
Neutral F1 S

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3383.75 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3456.07 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3418.52 examples/s]


Training results for dbmdz/bert-base-german-uncased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8372,0.942204,0.79845,0.824124,0.79845,0.79872,"{0: 58, 1: 20, 2: 51}"
2,0.5499,0.671465,0.837209,0.835297,0.837209,0.834587,"{0: 41, 1: 23, 2: 65}"
3,0.4617,0.796972,0.852713,0.85305,0.852713,0.850706,"{0: 43, 1: 22, 2: 64}"
4,0.2906,0.943668,0.821705,0.848021,0.821705,0.821349,"{0: 58, 1: 19, 2: 52}"
5,0.2406,0.879387,0.837209,0.845236,0.837209,0.83599,"{0: 33, 1: 31, 2: 65}"
6,0.19,0.894554,0.837209,0.844956,0.837209,0.839521,"{0: 40, 1: 32, 2: 57}"
7,0.1408,0.747313,0.868217,0.868369,0.868217,0.868055,"{0: 44, 1: 26, 2: 59}"
8,0.1045,0.855859,0.875969,0.878363,0.875969,0.876545,"{0: 45, 1: 27, 2: 57}"
9,0.0262,0.999977,0.852713,0.855113,0.852713,0.851674,"{0: 36, 1: 29, 2: 64}"
10,0.0039,0.90512,0.875969,0.876685,0.875969,0.876231,"{0: 41, 1: 28, 2: 60}"


Evaluation results for dbmdz/bert-base-german-uncased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.8147130012512207, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7823650756261733, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.7607198160373072, 'eval_class_distribution': {0: 45, 1: 41, 2: 67}, 'eval_runtime': 2.3991, 'eval_samples_per_second': 63.775, 'eval_steps_per_second': 32.096, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.74      0.81      0.77        36
     Neutral       0.62      0.85      0.72        33
     Positiv       0.90      0.74      0.81        84

    accuracy                           0.78       153
   macro avg       0.75      0.80      0.77       153
weighted avg       0.80      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 39, 1: 45, 2: 69}
Negativ Precision Score: 0.7435897435897436
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7733333333333333

Neutral Precision Score: 0.6222222222222222
Neutral Recall Score:

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3996.42 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3901.57 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4115.57 examples/s]


Training results for FacebookAI/xlm-roberta-base with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9707,0.697857,0.736434,0.788762,0.736434,0.715937,"{0: 65, 1: 8, 2: 56}"
2,0.7624,0.735756,0.821705,0.820183,0.821705,0.820159,"{0: 42, 1: 24, 2: 63}"
3,0.8044,1.028958,0.821705,0.824989,0.821705,0.821096,"{0: 48, 1: 23, 2: 58}"
4,0.585,0.927237,0.844961,0.851497,0.844961,0.846616,"{0: 40, 1: 32, 2: 57}"
5,0.5645,0.956515,0.844961,0.849123,0.844961,0.846321,"{0: 42, 1: 30, 2: 57}"
6,0.5227,1.009846,0.860465,0.867445,0.860465,0.862051,"{0: 38, 1: 32, 2: 59}"
7,0.3513,0.984108,0.852713,0.856444,0.852713,0.85397,"{0: 41, 1: 30, 2: 58}"
8,0.2975,0.942258,0.868217,0.873177,0.868217,0.869487,"{0: 39, 1: 31, 2: 59}"
9,0.2145,0.978815,0.868217,0.875664,0.868217,0.870251,"{0: 39, 1: 32, 2: 58}"
10,0.1854,1.026806,0.860465,0.864072,0.860465,0.861624,"{0: 40, 1: 30, 2: 59}"


Evaluation results for FacebookAI/xlm-roberta-base with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.4451854228973389, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.808722422112515, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.8005103411227504, 'eval_class_distribution': {0: 37, 1: 40, 2: 76}, 'eval_runtime': 2.3566, 'eval_samples_per_second': 64.925, 'eval_steps_per_second': 32.675, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.83      0.81      0.82        36
     Neutral       0.61      0.76      0.68        33
     Positiv       0.90      0.82      0.86        84

    accuracy                           0.80       153
   macro avg       0.78      0.79      0.78       153
weighted avg       0.82      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 41, 2: 77}
Negativ Precision Score: 0.8285714285714286
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8169014084507042

Neutral Precision Score: 0.6097560975609756
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3853.84 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3891.10 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4179.06 examples/s]


Training results for TUM/GottBERT_base_best with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8662,0.813601,0.821705,0.837179,0.821705,0.818343,"{0: 55, 1: 18, 2: 56}"
2,0.5302,0.690429,0.844961,0.850147,0.844961,0.846153,"{0: 46, 1: 28, 2: 55}"
3,0.4877,0.817732,0.844961,0.849332,0.844961,0.83965,"{0: 47, 1: 18, 2: 64}"
4,0.3022,1.024746,0.813953,0.81922,0.813953,0.813188,"{0: 50, 1: 23, 2: 56}"
5,0.2938,0.892657,0.852713,0.851863,0.852713,0.847559,"{0: 42, 1: 20, 2: 67}"
6,0.258,0.834962,0.844961,0.855102,0.844961,0.84759,"{0: 41, 1: 33, 2: 55}"
7,0.2139,1.013247,0.844961,0.846819,0.844961,0.841485,"{0: 47, 1: 20, 2: 62}"
8,0.1819,1.092038,0.844961,0.84581,0.844961,0.843528,"{0: 47, 1: 23, 2: 59}"


Evaluation results for TUM/GottBERT_base_best with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.170857548713684, 'eval_accuracy': 0.8300653594771242, 'eval_precision': 0.8503255240129871, 'eval_recall': 0.8300653594771242, 'eval_f1': 0.8321134853043223, 'eval_class_distribution': {0: 43, 1: 41, 2: 69}, 'eval_runtime': 2.36, 'eval_samples_per_second': 64.829, 'eval_steps_per_second': 32.626, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.77      0.92      0.84        36
     Neutral       0.71      0.88      0.78        33
     Positiv       0.94      0.77      0.85        84

    accuracy                           0.83       153
   macro avg       0.81      0.86      0.82       153
weighted avg       0.85      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 41, 2: 69}
Negativ Precision Score: 0.7674418604651163
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8354430379746836

Neutral Precision Score: 0.7073170731707317
Neutral Recall Score: 0.8

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4986.83 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4604.58 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4367.52 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8055,0.643448,0.875969,0.88161,0.875969,0.875734,"{0: 48, 1: 22, 2: 59}"
2,0.5217,0.685101,0.883721,0.883374,0.883721,0.882997,"{0: 39, 1: 28, 2: 62}"
3,0.4944,0.622121,0.906977,0.907851,0.906977,0.906984,"{0: 44, 1: 25, 2: 60}"
4,0.2951,0.661963,0.891473,0.893564,0.891473,0.891307,"{0: 46, 1: 24, 2: 59}"
5,0.3147,0.651863,0.891473,0.891473,0.891473,0.891473,"{0: 42, 1: 27, 2: 60}"
6,0.2541,0.809266,0.875969,0.879215,0.875969,0.877052,"{0: 43, 1: 29, 2: 57}"
7,0.2472,0.855742,0.860465,0.86603,0.860465,0.860322,"{0: 49, 1: 23, 2: 57}"
8,0.1865,0.879591,0.860465,0.865116,0.860465,0.861909,"{0: 45, 1: 28, 2: 56}"


Evaluation results for TUM/GottBERT_filtered_base_best with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.0035086870193481, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8178500270003538, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8169459298871063, 'eval_class_distribution': {0: 39, 1: 33, 2: 81}, 'eval_runtime': 2.3652, 'eval_samples_per_second': 64.687, 'eval_steps_per_second': 32.555, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.86      0.89      0.88        36
     Neutral       0.69      0.73      0.71        33
     Positiv       0.88      0.85      0.86        84

    accuracy                           0.83       153
   macro avg       0.81      0.82      0.81       153
weighted avg       0.83      0.83      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 37, 1: 35, 2: 81}
Negativ Precision Score: 0.8648648648648649
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8767123287671232

Neutral Precision Score: 0.6857142857142857
Neutral Recall Score: 

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9029,0.784432,0.813953,0.847793,0.813953,0.808608,"{0: 61, 1: 15, 2: 53}"
2,0.5471,0.585556,0.868217,0.868744,0.868217,0.868385,"{0: 41, 1: 28, 2: 60}"
3,0.5674,0.781611,0.860465,0.864786,0.860465,0.858017,"{0: 47, 1: 20, 2: 62}"
4,0.3248,0.785425,0.844961,0.84581,0.844961,0.843528,"{0: 47, 1: 23, 2: 59}"
5,0.3006,0.661972,0.860465,0.860278,0.860465,0.860093,"{0: 40, 1: 27, 2: 62}"
6,0.2522,0.873954,0.821705,0.823541,0.821705,0.820672,"{0: 48, 1: 24, 2: 57}"
7,0.2361,0.949543,0.837209,0.839398,0.837209,0.836834,"{0: 47, 1: 24, 2: 58}"
8,0.2105,0.986091,0.821705,0.826177,0.821705,0.82256,"{0: 47, 1: 26, 2: 56}"
9,0.1209,1.092986,0.821705,0.82715,0.821705,0.823572,"{0: 45, 1: 28, 2: 56}"
10,0.086,1.147903,0.829457,0.83356,0.829457,0.829603,"{0: 48, 1: 25, 2: 56}"


Evaluation results for TUM/GottBERT_base_last with 10 epochs and random seeds: 42, 42



{'eval_loss': 0.7744659185409546, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8480288957688339, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8391038629586361, 'eval_class_distribution': {0: 38, 1: 40, 2: 75}, 'eval_runtime': 2.3923, 'eval_samples_per_second': 63.955, 'eval_steps_per_second': 32.187, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.76      0.86      0.81        36
     Neutral       0.68      0.79      0.73        33
     Positiv       0.92      0.81      0.86        84

    accuracy                           0.82       153
   macro avg       0.79      0.82      0.80       153
weighted avg       0.83      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 38, 2: 74}
Negativ Precision Score: 0.7560975609756098
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8051948051948052

Neutral Precision Score: 0.6842105263157895
Neutral Recall Score:

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4502.72 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3551.58 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3806.40 examples/s]


Training results for distilbert/distilbert-base-german-cased with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8197,0.725881,0.821705,0.844388,0.821705,0.822841,"{0: 56, 1: 24, 2: 49}"
2,0.5572,0.794907,0.790698,0.815603,0.790698,0.790951,"{0: 58, 1: 21, 2: 50}"
3,0.5075,0.691476,0.852713,0.871585,0.852713,0.853319,"{0: 55, 1: 23, 2: 51}"
4,0.2849,0.841368,0.852713,0.874144,0.852713,0.853929,"{0: 55, 1: 25, 2: 49}"
5,0.2651,0.845057,0.837209,0.849911,0.837209,0.838142,"{0: 52, 1: 25, 2: 52}"
6,0.1886,0.997535,0.821705,0.843693,0.821705,0.823599,"{0: 49, 1: 33, 2: 47}"
7,0.1427,1.099006,0.806202,0.835849,0.806202,0.806759,"{0: 59, 1: 20, 2: 50}"
8,0.0903,0.985446,0.821705,0.843203,0.821705,0.823284,"{0: 51, 1: 31, 2: 47}"
9,0.0468,1.059866,0.821705,0.843203,0.821705,0.823284,"{0: 51, 1: 31, 2: 47}"
10,0.0295,1.029103,0.844961,0.859468,0.844961,0.846744,"{0: 49, 1: 30, 2: 50}"


Evaluation results for distilbert/distilbert-base-german-cased with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.3278671503067017, 'eval_accuracy': 0.7450980392156863, 'eval_precision': 0.7723480018062469, 'eval_recall': 0.7450980392156863, 'eval_f1': 0.7471504960588564, 'eval_class_distribution': {0: 52, 1: 34, 2: 67}, 'eval_runtime': 1.3682, 'eval_samples_per_second': 111.828, 'eval_steps_per_second': 56.279, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.69      0.86      0.77        36
     Neutral       0.64      0.64      0.64        33
     Positiv       0.88      0.79      0.83        84

    accuracy                           0.77       153
   macro avg       0.74      0.76      0.74       153
weighted avg       0.78      0.77      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 33, 2: 75}
Negativ Precision Score: 0.6888888888888889
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7654320987654321

Neutral Precision Score: 0.6363636363636364
Neutral Recall Score

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8787,0.681928,0.782946,0.793852,0.782946,0.784481,"{0: 51, 1: 24, 2: 54}"
2,0.5532,0.743539,0.790698,0.790594,0.790698,0.790558,"{0: 43, 1: 26, 2: 60}"
3,0.5053,1.306931,0.751938,0.753381,0.751938,0.751465,"{0: 44, 1: 23, 2: 62}"
4,0.2259,1.017397,0.79845,0.817601,0.79845,0.802991,"{0: 51, 1: 27, 2: 51}"
5,0.2379,1.140012,0.790698,0.795254,0.790698,0.789844,"{0: 49, 1: 22, 2: 58}"
6,0.1394,1.111544,0.813953,0.827467,0.813953,0.817343,"{0: 46, 1: 31, 2: 52}"
7,0.108,1.221766,0.813953,0.813734,0.813953,0.813758,"{0: 42, 1: 26, 2: 61}"
8,0.0695,1.167081,0.821705,0.826265,0.821705,0.822662,"{0: 47, 1: 25, 2: 57}"
9,0.0163,1.238996,0.821705,0.830964,0.821705,0.823986,"{0: 43, 1: 32, 2: 54}"
10,0.0103,1.223048,0.821705,0.827987,0.821705,0.823252,"{0: 43, 1: 31, 2: 55}"


Evaluation results for GerMedBERT/medbert-512 with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.516144037246704, 'eval_accuracy': 0.803921568627451, 'eval_precision': 0.8166104048456989, 'eval_recall': 0.803921568627451, 'eval_f1': 0.8053041729512317, 'eval_class_distribution': {0: 42, 1: 39, 2: 72}, 'eval_runtime': 2.3647, 'eval_samples_per_second': 64.701, 'eval_steps_per_second': 32.562, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.61      0.78      0.68        36
     Neutral       0.66      0.76      0.70        33
     Positiv       0.87      0.71      0.78        84

    accuracy                           0.74       153
   macro avg       0.71      0.75      0.72       153
weighted avg       0.76      0.74      0.74       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 46, 1: 38, 2: 69}
Negativ Precision Score: 0.6086956521739131
Negativ Recall Score: 0.7777777777777778
Negativ F1 Score: 0.6829268292682927

Neutral Precision Score: 0.6578947368421053
Neutral Recall Score: 0.

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8376,0.68419,0.852713,0.875238,0.852713,0.849436,"{0: 53, 1: 16, 2: 60}"
2,0.5022,0.7111,0.852713,0.854506,0.852713,0.85311,"{0: 45, 1: 26, 2: 58}"
3,0.4759,0.890525,0.860465,0.860318,0.860465,0.858208,"{0: 43, 1: 22, 2: 64}"
4,0.2657,0.863175,0.868217,0.870597,0.868217,0.868486,"{0: 46, 1: 26, 2: 57}"
5,0.2049,0.98255,0.875969,0.879148,0.875969,0.875164,"{0: 46, 1: 22, 2: 61}"
6,0.1291,1.097139,0.852713,0.852734,0.852713,0.851349,"{0: 45, 1: 23, 2: 61}"
7,0.0756,1.186705,0.852713,0.858942,0.852713,0.852003,"{0: 48, 1: 21, 2: 60}"
8,0.0305,1.145777,0.860465,0.865045,0.860465,0.859239,"{0: 47, 1: 21, 2: 61}"
9,0.0006,1.182693,0.875969,0.879148,0.875969,0.875164,"{0: 46, 1: 22, 2: 61}"
10,0.0001,1.194644,0.875969,0.879148,0.875969,0.875164,"{0: 46, 1: 22, 2: 61}"


Evaluation results for deepset/gbert-base with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.3732352256774902, 'eval_accuracy': 0.8300653594771242, 'eval_precision': 0.8322170285382814, 'eval_recall': 0.8300653594771242, 'eval_f1': 0.8308042099086875, 'eval_class_distribution': {0: 38, 1: 34, 2: 81}, 'eval_runtime': 2.4892, 'eval_samples_per_second': 61.466, 'eval_steps_per_second': 30.934, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.89      0.83        36
     Neutral       0.71      0.73      0.72        33
     Positiv       0.88      0.82      0.85        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.82      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 41, 1: 34, 2: 78}
Negativ Precision Score: 0.7804878048780488
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8311688311688312

Neutral Precision Score: 0.7058823529411765
Neutral Recall Score:

In [6]:
# Fine-tune the "aari1995/German_Sentiment" checkpoint on its own (it is
# commented out of the `models` list) for 10 epochs with both seeds set to 42.
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=10,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 3718.35 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3585.02 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3640.62 examples/s]


Training results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8687,0.963993,0.829457,0.883016,0.829457,0.818445,"{0: 63, 1: 11, 2: 55}"
2,0.5082,0.935839,0.829457,0.849099,0.829457,0.83093,"{0: 32, 1: 37, 2: 60}"
3,0.5221,0.983847,0.837209,0.840277,0.837209,0.832491,"{0: 50, 1: 19, 2: 60}"
4,0.2651,0.948324,0.844961,0.850181,0.844961,0.846534,"{0: 40, 1: 31, 2: 58}"
5,0.2229,0.756907,0.883721,0.886772,0.883721,0.882852,"{0: 46, 1: 22, 2: 61}"
6,0.1861,0.911469,0.844961,0.8537,0.844961,0.847947,"{0: 44, 1: 30, 2: 55}"
7,0.1104,0.816184,0.868217,0.87861,0.868217,0.870515,"{0: 49, 1: 26, 2: 54}"
8,0.0865,0.929037,0.868217,0.877087,0.868217,0.870558,"{0: 48, 1: 26, 2: 55}"
9,0.0122,0.855249,0.899225,0.910749,0.899225,0.900007,"{0: 51, 1: 22, 2: 56}"
10,0.0036,0.903134,0.875969,0.884216,0.875969,0.878393,"{0: 47, 1: 27, 2: 55}"


Evaluation results for aari1995/German_Sentiment with 10 epochs and random seeds: 42, 42



{'eval_loss': 1.1450538635253906, 'eval_accuracy': 0.8562091503267973, 'eval_precision': 0.8597936536611651, 'eval_recall': 0.8562091503267973, 'eval_f1': 0.8575058384023574, 'eval_class_distribution': {0: 34, 1: 36, 2: 83}, 'eval_runtime': 7.372, 'eval_samples_per_second': 20.754, 'eval_steps_per_second': 10.445, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.87      0.92      0.89        36
     Neutral       0.72      0.79      0.75        33
     Positiv       0.91      0.86      0.88        84

    accuracy                           0.86       153
   macro avg       0.83      0.85      0.84       153
weighted avg       0.86      0.86      0.86       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 36, 2: 79}
Negativ Precision Score: 0.868421052631579
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8918918918918919

Neutral Precision Score: 0.7222222222222222
Neutral Recall Score: 0

In [None]:
# Fine-tune every checkpoint listed in `models` with epochs=12 and both random
# seeds (rn1, rn2) fixed to 42, so the runs are comparable across checkpoints.
# NOTE(review): the recorded output of this cell shows some runs ending before
# epoch 12 (e.g. 'epoch': 10.0 or 'epoch': 8.0) -- presumably early stopping
# inside absa_model; confirm against functions/absa_model_train.
for model in models:
    # Header line so each checkpoint's logs can be located in the long output.
    print(f'training and results for {model}:')
    absa_model(data, model, rn1=42, rn2=42, epochs=12)
    # Blank line separating consecutive runs in the cell output.
    print()

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2219.46 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3703.34 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3567.30 examples/s]


Training results for google-bert/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8482,0.622184,0.813953,0.847231,0.813953,0.816413,"{0: 59, 1: 22, 2: 48}"
2,0.5702,0.66232,0.844961,0.848597,0.844961,0.845592,"{0: 39, 1: 31, 2: 59}"
3,0.4666,0.936649,0.837209,0.845643,0.837209,0.839518,"{0: 46, 1: 29, 2: 54}"
4,0.265,0.995227,0.813953,0.824687,0.813953,0.813546,"{0: 52, 1: 21, 2: 56}"
5,0.2489,1.229109,0.79845,0.799289,0.79845,0.793569,"{0: 34, 1: 25, 2: 70}"
6,0.1674,1.131079,0.79845,0.80623,0.79845,0.796121,"{0: 32, 1: 30, 2: 67}"
7,0.1745,1.321051,0.829457,0.831233,0.829457,0.829859,"{0: 45, 1: 26, 2: 58}"
8,0.0971,1.475749,0.806202,0.808394,0.806202,0.806207,"{0: 46, 1: 24, 2: 59}"
9,0.0343,1.565634,0.790698,0.800151,0.790698,0.793499,"{0: 47, 1: 28, 2: 54}"
10,0.0104,1.558809,0.790698,0.793654,0.790698,0.791597,"{0: 40, 1: 30, 2: 59}"


Evaluation results for google-bert/bert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 0.9771641492843628, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.7910649396717508, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7864462034396675, 'eval_class_distribution': {0: 38, 1: 37, 2: 78}, 'eval_runtime': 2.4575, 'eval_samples_per_second': 62.257, 'eval_steps_per_second': 31.332, 'epoch': 10.0}
              precision    recall  f1-score   support

     Negativ       0.81      0.81      0.81        36
     Neutral       0.64      0.76      0.69        33
     Positiv       0.88      0.82      0.85        84

    accuracy                           0.80       153
   macro avg       0.78      0.79      0.78       153
weighted avg       0.81      0.80      0.81       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 39, 2: 78}
Negativ Precision Score: 0.8055555555555556
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.8055555555555556

Neutral Precision Score: 0.6410256410256411
Neutral Recall Score:

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3757.99 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3913.62 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3917.76 examples/s]


Training results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9623,0.888917,0.751938,0.794722,0.751938,0.745918,"{0: 63, 1: 13, 2: 53}"
2,0.6434,0.914787,0.782946,0.788805,0.782946,0.784966,"{0: 43, 1: 30, 2: 56}"
3,0.5927,1.112766,0.806202,0.806738,0.806202,0.804479,"{0: 44, 1: 22, 2: 63}"
4,0.2981,0.814169,0.829457,0.842111,0.829457,0.830347,"{0: 52, 1: 25, 2: 52}"
5,0.3566,1.084527,0.806202,0.80847,0.806202,0.800338,"{0: 46, 1: 18, 2: 65}"
6,0.2676,1.316612,0.782946,0.822771,0.782946,0.790897,"{0: 38, 1: 42, 2: 49}"
7,0.2238,1.204969,0.806202,0.81623,0.806202,0.805988,"{0: 52, 1: 22, 2: 55}"
8,0.1848,1.267827,0.813953,0.82687,0.813953,0.817562,"{0: 44, 1: 32, 2: 53}"
9,0.0582,1.354712,0.829457,0.840002,0.829457,0.832159,"{0: 46, 1: 30, 2: 53}"
10,0.0272,1.373917,0.837209,0.846952,0.837209,0.839451,"{0: 47, 1: 29, 2: 53}"


Evaluation results for dbmdz/bert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.7890936136245728, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8093365253077976, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7934078182244728, 'eval_class_distribution': {0: 43, 1: 40, 2: 70}, 'eval_runtime': 2.3725, 'eval_samples_per_second': 64.488, 'eval_steps_per_second': 32.455, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.65      0.86      0.74        36
     Neutral       0.67      0.79      0.72        33
     Positiv       0.89      0.70      0.79        84

    accuracy                           0.76       153
   macro avg       0.74      0.78      0.75       153
weighted avg       0.79      0.76      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 48, 1: 39, 2: 66}
Negativ Precision Score: 0.6458333333333334
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.7380952380952381

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score:

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3722.54 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 2893.83 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3161.97 examples/s]


Training results for dbmdz/bert-base-german-uncased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8704,0.808046,0.829457,0.848089,0.829457,0.827539,"{0: 56, 1: 19, 2: 54}"
2,0.5456,0.585445,0.844961,0.84776,0.844961,0.843106,"{0: 37, 1: 24, 2: 68}"
3,0.4953,0.697167,0.844961,0.850043,0.844961,0.845437,"{0: 48, 1: 26, 2: 55}"
4,0.2733,0.693329,0.829457,0.835594,0.829457,0.831143,"{0: 46, 1: 28, 2: 55}"
5,0.2616,0.742059,0.875969,0.876155,0.875969,0.874865,"{0: 46, 1: 24, 2: 59}"
6,0.1484,0.88388,0.813953,0.828662,0.813953,0.817493,"{0: 37, 1: 35, 2: 57}"
7,0.1556,0.982414,0.837209,0.837511,0.837209,0.836223,"{0: 46, 1: 24, 2: 59}"
8,0.0761,0.965461,0.852713,0.852602,0.852713,0.852421,"{0: 44, 1: 26, 2: 59}"
9,0.0357,1.016884,0.852713,0.856227,0.852713,0.853097,"{0: 47, 1: 25, 2: 57}"
10,0.0118,1.033618,0.860465,0.863409,0.860465,0.861034,"{0: 46, 1: 26, 2: 57}"


Evaluation results for dbmdz/bert-base-german-uncased with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.834274411201477, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7803009575923393, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.7625949901491698, 'eval_class_distribution': {0: 40, 1: 43, 2: 70}, 'eval_runtime': 2.5143, 'eval_samples_per_second': 60.853, 'eval_steps_per_second': 30.625, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.86      0.82        36
     Neutral       0.67      0.91      0.77        33
     Positiv       0.94      0.76      0.84        84

    accuracy                           0.82       153
   macro avg       0.79      0.84      0.81       153
weighted avg       0.84      0.82      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 45, 2: 68}
Negativ Precision Score: 0.775
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8157894736842105

Neutral Precision Score: 0.6666666666666666
Neutral Recall Score: 0.90909090909

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4788.96 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4171.86 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3776.70 examples/s]


Training results for FacebookAI/xlm-roberta-base with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9338,0.87914,0.806202,0.80755,0.806202,0.805384,"{0: 46, 1: 23, 2: 60}"
2,0.7457,0.870991,0.821705,0.821858,0.821705,0.819176,"{0: 39, 1: 23, 2: 67}"
3,0.732,1.232942,0.813953,0.816189,0.813953,0.813261,"{0: 47, 1: 23, 2: 59}"
4,0.5042,1.239383,0.821705,0.825479,0.821705,0.821645,"{0: 48, 1: 24, 2: 57}"
5,0.4753,1.25918,0.821705,0.822854,0.821705,0.820803,"{0: 37, 1: 29, 2: 63}"
6,0.3918,1.233514,0.821705,0.825459,0.821705,0.822867,"{0: 42, 1: 30, 2: 57}"
7,0.3087,1.419691,0.79845,0.800451,0.79845,0.799195,"{0: 41, 1: 29, 2: 59}"
8,0.2621,1.45371,0.79845,0.804803,0.79845,0.800099,"{0: 39, 1: 32, 2: 58}"


Evaluation results for FacebookAI/xlm-roberta-base with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.3295048475265503, 'eval_accuracy': 0.8104575163398693, 'eval_precision': 0.8229240552769964, 'eval_recall': 0.8104575163398693, 'eval_f1': 0.8130984608166139, 'eval_class_distribution': {0: 39, 1: 40, 2: 74}, 'eval_runtime': 2.3497, 'eval_samples_per_second': 65.114, 'eval_steps_per_second': 32.77, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.84      0.86      0.85        36
     Neutral       0.60      0.76      0.67        33
     Positiv       0.89      0.79      0.84        84

    accuracy                           0.80       153
   macro avg       0.77      0.80      0.78       153
weighted avg       0.82      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 37, 1: 42, 2: 74}
Negativ Precision Score: 0.8378378378378378
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8493150684931506

Neutral Precision Score: 0.5952380952380952
Neutral Recall Score: 0

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4829.61 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4668.18 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3889.10 examples/s]


Training results for TUM/GottBERT_base_best with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8913,0.913396,0.813953,0.84288,0.813953,0.812816,"{0: 58, 1: 17, 2: 54}"
2,0.5696,0.548578,0.860465,0.87907,0.860465,0.862859,"{0: 50, 1: 30, 2: 49}"
3,0.5052,0.686304,0.875969,0.875969,0.875969,0.875969,"{0: 42, 1: 27, 2: 60}"
4,0.3821,0.770901,0.868217,0.872623,0.868217,0.869347,"{0: 46, 1: 27, 2: 56}"
5,0.3363,0.706364,0.875969,0.874585,0.875969,0.87497,"{0: 43, 1: 25, 2: 61}"
6,0.2902,0.683381,0.844961,0.85369,0.844961,0.846902,"{0: 43, 1: 32, 2: 54}"
7,0.262,1.138385,0.829457,0.834324,0.829457,0.822092,"{0: 51, 1: 17, 2: 61}"
8,0.2264,0.900502,0.852713,0.857113,0.852713,0.853064,"{0: 37, 1: 31, 2: 61}"
9,0.1566,0.952361,0.868217,0.895912,0.868217,0.871581,"{0: 34, 1: 40, 2: 55}"
10,0.1257,1.205676,0.844961,0.844287,0.844961,0.843087,"{0: 46, 1: 23, 2: 60}"


Evaluation results for TUM/GottBERT_base_best with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.2735322713851929, 'eval_accuracy': 0.7908496732026143, 'eval_precision': 0.8017740814010343, 'eval_recall': 0.7908496732026143, 'eval_f1': 0.7913412343417324, 'eval_class_distribution': {0: 47, 1: 31, 2: 75}, 'eval_runtime': 2.369, 'eval_samples_per_second': 64.584, 'eval_steps_per_second': 32.503, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.73      0.92      0.81        36
     Neutral       0.68      0.64      0.66        33
     Positiv       0.90      0.82      0.86        84

    accuracy                           0.80       153
   macro avg       0.77      0.79      0.78       153
weighted avg       0.81      0.80      0.80       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 45, 1: 31, 2: 77}
Negativ Precision Score: 0.7333333333333333
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.8148148148148148

Neutral Precision Score: 0.6774193548387096
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4061.69 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3626.27 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3531.32 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8374,0.819797,0.782946,0.815771,0.782946,0.785823,"{0: 59, 1: 21, 2: 49}"
2,0.6146,0.652929,0.868217,0.867761,0.868217,0.867671,"{0: 43, 1: 25, 2: 61}"
3,0.5076,0.648037,0.883721,0.888676,0.883721,0.883684,"{0: 48, 1: 23, 2: 58}"
4,0.3124,0.73543,0.875969,0.879701,0.875969,0.876747,"{0: 46, 1: 27, 2: 56}"
5,0.282,0.569201,0.899225,0.899893,0.899225,0.898154,"{0: 44, 1: 23, 2: 62}"
6,0.2393,0.631626,0.883721,0.8874,0.883721,0.884926,"{0: 41, 1: 30, 2: 58}"
7,0.2288,0.780108,0.883721,0.888311,0.883721,0.883875,"{0: 48, 1: 24, 2: 57}"
8,0.1734,0.523946,0.899225,0.899479,0.899225,0.898934,"{0: 44, 1: 25, 2: 60}"
9,0.1345,0.881867,0.860465,0.86955,0.860465,0.862039,"{0: 44, 1: 32, 2: 53}"
10,0.0881,1.002421,0.860465,0.86655,0.860465,0.861425,"{0: 48, 1: 26, 2: 55}"


Evaluation results for TUM/GottBERT_filtered_base_best with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.5551546812057495, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.7852195074331297, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.7757990867579909, 'eval_class_distribution': {0: 37, 1: 40, 2: 76}, 'eval_runtime': 2.2707, 'eval_samples_per_second': 67.382, 'eval_steps_per_second': 33.911, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.78      0.81      0.79        36
     Neutral       0.57      0.70      0.63        33
     Positiv       0.87      0.79      0.82        84

    accuracy                           0.77       153
   macro avg       0.74      0.76      0.75       153
weighted avg       0.79      0.77      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 37, 1: 40, 2: 76}
Negativ Precision Score: 0.7837837837837838
Negativ Recall Score: 0.8055555555555556
Negativ F1 Score: 0.7945205479452054

Neutral Precision Score: 0.575
Neutral Recall Score: 0.6969696969

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8788,0.770787,0.821705,0.836008,0.821705,0.818674,"{0: 54, 1: 18, 2: 57}"
2,0.5791,0.640158,0.829457,0.837209,0.829457,0.830743,"{0: 49, 1: 24, 2: 56}"
3,0.5566,0.672115,0.852713,0.873593,0.852713,0.846657,"{0: 52, 1: 15, 2: 62}"
4,0.3413,0.692673,0.852713,0.855932,0.852713,0.853683,"{0: 40, 1: 30, 2: 59}"
5,0.3182,0.974592,0.852713,0.853564,0.852713,0.849657,"{0: 36, 1: 25, 2: 68}"
6,0.2718,0.775704,0.875969,0.879964,0.875969,0.876851,"{0: 40, 1: 31, 2: 58}"
7,0.2286,0.7992,0.868217,0.869347,0.868217,0.86851,"{0: 44, 1: 27, 2: 58}"
8,0.1996,1.004949,0.852713,0.854364,0.852713,0.852925,"{0: 39, 1: 29, 2: 61}"
9,0.1135,0.881326,0.868217,0.873856,0.868217,0.869307,"{0: 39, 1: 32, 2: 58}"
10,0.0744,0.96294,0.875969,0.878107,0.875969,0.876696,"{0: 42, 1: 29, 2: 58}"


Evaluation results for TUM/GottBERT_base_last with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.8143446445465088, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.8095990375237124, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7891662153181003, 'eval_class_distribution': {0: 41, 1: 43, 2: 69}, 'eval_runtime': 2.3693, 'eval_samples_per_second': 64.575, 'eval_steps_per_second': 32.499, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.83      0.77        36
     Neutral       0.57      0.73      0.64        33
     Positiv       0.90      0.74      0.81        84

    accuracy                           0.76       153
   macro avg       0.73      0.77      0.74       153
weighted avg       0.78      0.76      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 42, 2: 69}
Negativ Precision Score: 0.7142857142857143
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7692307692307693

Neutral Precision Score: 0.5714285714285714
Neutral Recall Score:

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4135.30 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4386.10 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3551.39 examples/s]


Training results for distilbert/distilbert-base-german-cased with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8382,0.712005,0.821705,0.852925,0.821705,0.82293,"{0: 59, 1: 22, 2: 48}"
2,0.5972,0.607035,0.829457,0.840782,0.829457,0.832001,"{0: 49, 1: 27, 2: 53}"
3,0.4971,0.774801,0.852713,0.854095,0.852713,0.852804,"{0: 45, 1: 25, 2: 59}"
4,0.3151,0.738722,0.852713,0.858093,0.852713,0.853336,"{0: 48, 1: 26, 2: 55}"
5,0.2827,0.858924,0.829457,0.838186,0.829457,0.831786,"{0: 44, 1: 31, 2: 54}"
6,0.195,0.762569,0.844961,0.851325,0.844961,0.846233,"{0: 46, 1: 29, 2: 54}"
7,0.1466,1.041093,0.813953,0.835484,0.813953,0.814784,"{0: 56, 1: 21, 2: 52}"
8,0.0971,0.972823,0.821705,0.839793,0.821705,0.823,"{0: 54, 1: 25, 2: 50}"
9,0.0404,1.219914,0.806202,0.820057,0.806202,0.809238,"{0: 40, 1: 35, 2: 54}"
10,0.0236,1.170979,0.806202,0.816656,0.806202,0.808132,"{0: 46, 1: 31, 2: 52}"


Evaluation results for distilbert/distilbert-base-german-cased with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.135193109512329, 'eval_accuracy': 0.803921568627451, 'eval_precision': 0.8117758467023173, 'eval_recall': 0.803921568627451, 'eval_f1': 0.8050101646009574, 'eval_class_distribution': {0: 44, 1: 32, 2: 77}, 'eval_runtime': 1.3432, 'eval_samples_per_second': 113.904, 'eval_steps_per_second': 57.324, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.68      0.83      0.75        36
     Neutral       0.68      0.58      0.62        33
     Positiv       0.86      0.83      0.85        84

    accuracy                           0.78       153
   macro avg       0.74      0.75      0.74       153
weighted avg       0.78      0.78      0.78       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 44, 1: 28, 2: 81}
Negativ Precision Score: 0.6818181818181818
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.75

Neutral Precision Score: 0.6785714285714286
Neutral Recall Score: 0.5757575757575

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9232,1.052394,0.728682,0.728203,0.728682,0.722326,"{0: 46, 1: 18, 2: 65}"
2,0.5612,0.901064,0.79845,0.803761,0.79845,0.799787,"{0: 47, 1: 26, 2: 56}"
3,0.5084,0.985297,0.782946,0.789198,0.782946,0.785282,"{0: 44, 1: 29, 2: 56}"
4,0.2844,1.103902,0.790698,0.790584,0.790698,0.78943,"{0: 44, 1: 23, 2: 62}"
5,0.2664,1.161893,0.806202,0.814228,0.806202,0.806995,"{0: 50, 1: 24, 2: 55}"
6,0.2014,1.338339,0.782946,0.829852,0.782946,0.790941,"{0: 38, 1: 44, 2: 47}"
7,0.1896,1.106464,0.782946,0.789452,0.782946,0.783789,"{0: 49, 1: 24, 2: 56}"
8,0.1294,1.242427,0.806202,0.810514,0.806202,0.807262,"{0: 39, 1: 31, 2: 59}"
9,0.0458,1.449633,0.806202,0.822854,0.806202,0.811051,"{0: 42, 1: 34, 2: 53}"
10,0.007,1.225843,0.844961,0.847005,0.844961,0.845773,"{0: 43, 1: 28, 2: 58}"


Evaluation results for GerMedBERT/medbert-512 with 12 epochs and random seeds: 42, 42



{'eval_loss': 2.0055363178253174, 'eval_accuracy': 0.7581699346405228, 'eval_precision': 0.7608341479309221, 'eval_recall': 0.7581699346405228, 'eval_f1': 0.7568703098523359, 'eval_class_distribution': {0: 44, 1: 31, 2: 78}, 'eval_runtime': 2.4231, 'eval_samples_per_second': 63.141, 'eval_steps_per_second': 31.777, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.70      0.83      0.76        36
     Neutral       0.61      0.58      0.59        33
     Positiv       0.82      0.77      0.80        84

    accuracy                           0.75       153
   macro avg       0.71      0.73      0.72       153
weighted avg       0.75      0.75      0.74       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 31, 2: 79}
Negativ Precision Score: 0.6976744186046512
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.759493670886076

Neutral Precision Score: 0.6129032258064516
Neutral Recall Score: 

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8555,0.938555,0.79845,0.832249,0.79845,0.795958,"{0: 58, 1: 15, 2: 56}"
2,0.5041,0.593408,0.860465,0.863409,0.860465,0.861034,"{0: 46, 1: 26, 2: 57}"
3,0.5256,0.863217,0.852713,0.863657,0.852713,0.848062,"{0: 49, 1: 17, 2: 63}"
4,0.2845,0.940825,0.844961,0.852596,0.844961,0.845617,"{0: 36, 1: 33, 2: 60}"
5,0.228,0.978493,0.852713,0.852771,0.852713,0.852,"{0: 44, 1: 24, 2: 61}"
6,0.1604,1.155141,0.821705,0.823625,0.821705,0.821148,"{0: 37, 1: 30, 2: 62}"
7,0.1202,1.231521,0.837209,0.837275,0.837209,0.836294,"{0: 38, 1: 29, 2: 62}"
8,0.0486,1.370049,0.806202,0.808362,0.806202,0.806234,"{0: 46, 1: 24, 2: 59}"
9,0.017,1.361674,0.829457,0.831266,0.829457,0.828191,"{0: 36, 1: 30, 2: 63}"
10,0.0165,1.436535,0.821705,0.818724,0.821705,0.819381,"{0: 39, 1: 26, 2: 64}"


In [6]:
# Standalone 12-epoch fine-tuning run for the deepset/gbert-base checkpoint.
# rn1 / rn2 are both fixed to 42; the run's log reports them as the two
# random seeds, so the result is comparable with the other seed-42 runs.
absa_model(
    data,
    "deepset/gbert-base",
    rn1=42,
    rn2=42,
    epochs=12,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archit

Training results for deepset/gbert-base with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8614,0.859552,0.79845,0.836724,0.79845,0.791556,"{0: 60, 1: 13, 2: 56}"
2,0.551,0.659712,0.868217,0.866802,0.868217,0.866653,"{0: 40, 1: 25, 2: 64}"
3,0.4935,0.866866,0.852713,0.851817,0.852713,0.851961,"{0: 43, 1: 25, 2: 61}"
4,0.2799,0.791959,0.837209,0.840745,0.837209,0.838388,"{0: 45, 1: 27, 2: 57}"
5,0.1733,1.165019,0.829457,0.838182,0.829457,0.830488,"{0: 50, 1: 24, 2: 55}"
6,0.0752,1.088761,0.852713,0.854921,0.852713,0.852666,"{0: 46, 1: 24, 2: 59}"
7,0.0584,1.01312,0.860465,0.861979,0.860465,0.860621,"{0: 45, 1: 25, 2: 59}"
8,0.0314,1.089047,0.860465,0.864468,0.860465,0.860891,"{0: 47, 1: 24, 2: 58}"
9,0.008,0.997349,0.875969,0.877812,0.875969,0.875935,"{0: 46, 1: 25, 2: 58}"
10,0.0002,1.075863,0.860465,0.861136,0.860465,0.860564,"{0: 44, 1: 26, 2: 59}"


Evaluation results for deepset/gbert-base with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.536918044090271, 'eval_accuracy': 0.8366013071895425, 'eval_precision': 0.8385620915032681, 'eval_recall': 0.8366013071895425, 'eval_f1': 0.8367271598412579, 'eval_class_distribution': {0: 40, 1: 33, 2: 80}, 'eval_runtime': 2.3555, 'eval_samples_per_second': 64.956, 'eval_steps_per_second': 32.69, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.89      0.89      0.89        36
     Neutral       0.70      0.79      0.74        33
     Positiv       0.89      0.85      0.87        84

    accuracy                           0.84       153
   macro avg       0.83      0.84      0.83       153
weighted avg       0.85      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 36, 1: 37, 2: 80}
Negativ Precision Score: 0.8888888888888888
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8888888888888888

Neutral Precision Score: 0.7027027027027027
Neutral Recall Score: 0

In [5]:
# 12-epoch fine-tuning run for aari1995/German_Sentiment. This checkpoint is
# commented out of the `models` list, so it is trained in a cell of its own.
# rn1 / rn2 are both fixed to 42 (reported as the random seeds in the log).
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=12,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 1858.77 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3727.32 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3439.76 examples/s]


Training results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8989,0.918514,0.837209,0.862563,0.837209,0.829621,"{0: 56, 1: 14, 2: 59}"
2,0.516,0.599222,0.852713,0.849629,0.852713,0.849907,"{0: 43, 1: 23, 2: 63}"
3,0.4661,0.60164,0.875969,0.874638,0.875969,0.874997,"{0: 43, 1: 25, 2: 61}"
4,0.2661,0.573434,0.906977,0.913837,0.906977,0.90835,"{0: 48, 1: 25, 2: 56}"
5,0.1979,0.679561,0.899225,0.9121,0.899225,0.899771,"{0: 52, 1: 22, 2: 55}"
6,0.1193,0.704647,0.899225,0.904631,0.899225,0.899777,"{0: 48, 1: 24, 2: 57}"
7,0.0948,0.755952,0.914729,0.926194,0.914729,0.914681,"{0: 51, 1: 21, 2: 57}"
8,0.0619,0.963016,0.891473,0.911882,0.891473,0.886217,"{0: 54, 1: 16, 2: 59}"
9,0.0001,0.644698,0.922481,0.929311,0.922481,0.922133,"{0: 49, 1: 22, 2: 58}"
10,0.0089,0.756722,0.914729,0.919517,0.914729,0.914569,"{0: 48, 1: 23, 2: 58}"


Evaluation results for aari1995/German_Sentiment with 12 epochs and random seeds: 42, 42



{'eval_loss': 1.2433797121047974, 'eval_accuracy': 0.8496732026143791, 'eval_precision': 0.8541262658909718, 'eval_recall': 0.8496732026143791, 'eval_f1': 0.8500083794201441, 'eval_class_distribution': {0: 42, 1: 33, 2: 78}, 'eval_runtime': 7.1915, 'eval_samples_per_second': 21.275, 'eval_steps_per_second': 10.707, 'epoch': 12.0}
              precision    recall  f1-score   support

     Negativ       0.77      0.94      0.85        36
     Neutral       0.70      0.70      0.70        33
     Positiv       0.93      0.85      0.89        84

    accuracy                           0.84       153
   macro avg       0.80      0.83      0.81       153
weighted avg       0.85      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 44, 1: 33, 2: 76}
Negativ Precision Score: 0.7727272727272727
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.85

Neutral Precision Score: 0.696969696969697
Neutral Recall Score: 0.696969696969

In [5]:
# Fine-tune every checkpoint listed in `models` with a budget of 20 epochs,
# using the same fixed seeds (rn1 = rn2 = 42) for each one so the runs are
# directly comparable. A header line is printed before each run and a blank
# line after it to separate the per-model logs.
for model in models:
    print(f'training and results for {model}:')
    absa_model(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=20,
    )
    print()

training and results for google-bert/bert-base-german-cased:
Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 2220.53 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3743.90 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3779.16 examples/s]


Training results for google-bert/bert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8401,0.731253,0.790698,0.817829,0.790698,0.793084,"{0: 57, 1: 24, 2: 48}"
2,0.5493,0.820497,0.75969,0.815313,0.75969,0.76496,"{0: 50, 1: 39, 2: 40}"
3,0.4916,0.921278,0.821705,0.850058,0.821705,0.827834,"{0: 41, 1: 38, 2: 50}"
4,0.2715,1.188077,0.782946,0.822918,0.782946,0.786236,"{0: 57, 1: 29, 2: 43}"
5,0.2449,1.332023,0.790698,0.803379,0.790698,0.792451,"{0: 35, 1: 35, 2: 59}"
6,0.1649,1.106796,0.813953,0.816643,0.813953,0.814791,"{0: 43, 1: 29, 2: 57}"
7,0.1055,1.319131,0.837209,0.83797,0.837209,0.837359,"{0: 44, 1: 26, 2: 59}"
8,0.0976,1.540717,0.79845,0.803229,0.79845,0.798716,"{0: 37, 1: 32, 2: 60}"
9,0.0407,1.244166,0.813953,0.827862,0.813953,0.816192,"{0: 38, 1: 36, 2: 55}"
10,0.017,1.733393,0.79845,0.801086,0.79845,0.797605,"{0: 48, 1: 23, 2: 58}"


Evaluation results for google-bert/bert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.7437177896499634, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.7865773951857166, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7842864540242286, 'eval_class_distribution': {0: 41, 1: 30, 2: 82}, 'eval_runtime': 2.4235, 'eval_samples_per_second': 63.131, 'eval_steps_per_second': 31.772, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.68      0.83      0.75        36
     Neutral       0.68      0.58      0.62        33
     Positiv       0.85      0.82      0.84        84

    accuracy                           0.77       153
   macro avg       0.74      0.74      0.74       153
weighted avg       0.77      0.77      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 44, 1: 28, 2: 81}
Negativ Precision Score: 0.6818181818181818
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.75

Neutral Precision Score: 0.6785714285714286
Neutral Recall Score: 0.57575757575

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4012.54 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3867.04 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3975.30 examples/s]


Training results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9088,1.063785,0.75969,0.831988,0.75969,0.75732,"{0: 69, 1: 13, 2: 47}"
2,0.5995,0.847203,0.806202,0.81761,0.806202,0.808861,"{0: 49, 1: 27, 2: 53}"
3,0.5124,1.106801,0.79845,0.806126,0.79845,0.798278,"{0: 49, 1: 21, 2: 59}"
4,0.3364,1.039021,0.79845,0.825624,0.79845,0.79737,"{0: 59, 1: 19, 2: 51}"
5,0.2803,1.386765,0.75969,0.7592,0.75969,0.753893,"{0: 36, 1: 22, 2: 71}"
6,0.1827,1.216493,0.821705,0.819214,0.821705,0.819703,"{0: 42, 1: 24, 2: 63}"
7,0.1706,1.342657,0.821705,0.823872,0.821705,0.821204,"{0: 37, 1: 30, 2: 62}"
8,0.0952,1.453277,0.829457,0.830849,0.829457,0.828548,"{0: 37, 1: 30, 2: 62}"
9,0.0475,1.42306,0.821705,0.823358,0.821705,0.822277,"{0: 44, 1: 27, 2: 58}"
10,0.0217,1.709724,0.806202,0.808321,0.806202,0.802886,"{0: 43, 1: 20, 2: 66}"


Evaluation results for dbmdz/bert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 2.4008398056030273, 'eval_accuracy': 0.7647058823529411, 'eval_precision': 0.7809002526128624, 'eval_recall': 0.7647058823529411, 'eval_f1': 0.7686751940153632, 'eval_class_distribution': {0: 43, 1: 37, 2: 73}, 'eval_runtime': 2.3782, 'eval_samples_per_second': 64.333, 'eval_steps_per_second': 32.377, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.70      0.83      0.76        36
     Neutral       0.58      0.64      0.61        33
     Positiv       0.85      0.75      0.80        84

    accuracy                           0.75       153
   macro avg       0.71      0.74      0.72       153
weighted avg       0.76      0.75      0.75       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 36, 2: 74}
Negativ Precision Score: 0.6976744186046512
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.759493670886076

Neutral Precision Score: 0.5833333333333334
Neutral Recall Score: 

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-german-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 3833.13 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3683.70 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3571.89 examples/s]


Training results for dbmdz/bert-base-german-uncased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8694,0.794908,0.829457,0.839405,0.829457,0.829093,"{0: 52, 1: 22, 2: 55}"
2,0.5772,0.981109,0.782946,0.789549,0.782946,0.780826,"{0: 33, 1: 34, 2: 62}"
3,0.5087,0.945239,0.79845,0.805356,0.79845,0.794501,"{0: 45, 1: 18, 2: 66}"
4,0.3019,0.875436,0.852713,0.875581,0.852713,0.853495,"{0: 56, 1: 20, 2: 53}"
5,0.2561,1.21459,0.813953,0.831997,0.813953,0.817696,"{0: 35, 1: 36, 2: 58}"
6,0.187,1.049481,0.844961,0.847417,0.844961,0.844717,"{0: 47, 1: 24, 2: 58}"
7,0.2042,1.298939,0.837209,0.836537,0.837209,0.836668,"{0: 41, 1: 26, 2: 62}"
8,0.0813,1.180989,0.837209,0.840121,0.837209,0.837785,"{0: 46, 1: 26, 2: 57}"
9,0.0885,1.41525,0.813953,0.84981,0.813953,0.821645,"{0: 40, 1: 40, 2: 49}"


Evaluation results for dbmdz/bert-base-german-uncased with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3670132160186768, 'eval_accuracy': 0.7712418300653595, 'eval_precision': 0.791146505550577, 'eval_recall': 0.7712418300653595, 'eval_f1': 0.7737892373900856, 'eval_class_distribution': {0: 49, 1: 33, 2: 71}, 'eval_runtime': 2.3745, 'eval_samples_per_second': 64.435, 'eval_steps_per_second': 32.428, 'epoch': 9.0}
              precision    recall  f1-score   support

     Negativ       0.63      0.89      0.74        36
     Neutral       0.68      0.76      0.71        33
     Positiv       0.91      0.70      0.79        84

    accuracy                           0.76       153
   macro avg       0.74      0.78      0.75       153
weighted avg       0.79      0.76      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 51, 1: 37, 2: 65}
Negativ Precision Score: 0.6274509803921569
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.735632183908046

Neutral Precision Score: 0.6756756756756757
Neutral Recall Score: 0.

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4563.88 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4147.36 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4190.22 examples/s]


Training results for FacebookAI/xlm-roberta-base with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.915,0.663395,0.806202,0.829958,0.806202,0.807037,"{0: 57, 1: 21, 2: 51}"
2,0.7519,1.06252,0.782946,0.805537,0.782946,0.775245,"{0: 28, 1: 22, 2: 79}"
3,0.7633,1.142633,0.806202,0.809843,0.806202,0.804287,"{0: 49, 1: 21, 2: 59}"
4,0.54,0.933923,0.821705,0.8273,0.821705,0.823142,"{0: 47, 1: 26, 2: 56}"
5,0.4828,1.047801,0.829457,0.831253,0.829457,0.829784,"{0: 45, 1: 25, 2: 59}"
6,0.4071,0.929373,0.844961,0.846242,0.844961,0.845519,"{0: 42, 1: 28, 2: 59}"
7,0.3403,0.933199,0.813953,0.814105,0.813953,0.812033,"{0: 44, 1: 22, 2: 63}"
8,0.3364,1.255537,0.790698,0.78961,0.790698,0.788426,"{0: 38, 1: 25, 2: 66}"
9,0.2283,1.622544,0.775194,0.803271,0.775194,0.777269,"{0: 40, 1: 41, 2: 48}"


Evaluation results for FacebookAI/xlm-roberta-base with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.1320393085479736, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8240610438752853, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8173202614379085, 'eval_class_distribution': {0: 44, 1: 33, 2: 76}, 'eval_runtime': 2.3005, 'eval_samples_per_second': 66.508, 'eval_steps_per_second': 33.471, 'epoch': 9.0}
              precision    recall  f1-score   support

     Negativ       0.79      0.94      0.86        36
     Neutral       0.71      0.73      0.72        33
     Positiv       0.92      0.83      0.88        84

    accuracy                           0.84       153
   macro avg       0.81      0.84      0.82       153
weighted avg       0.84      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 43, 1: 34, 2: 76}
Negativ Precision Score: 0.7906976744186046
Negativ Recall Score: 0.9444444444444444
Negativ F1 Score: 0.8607594936708861

Neutral Precision Score: 0.7058823529411765
Neutral Recall Score: 

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4945.07 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4272.57 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4485.79 examples/s]


Training results for TUM/GottBERT_base_best with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9128,0.930279,0.79845,0.841664,0.79845,0.773155,"{0: 58, 1: 8, 2: 63}"
2,0.5706,0.792996,0.837209,0.844116,0.837209,0.835469,"{0: 33, 1: 30, 2: 66}"
3,0.4794,0.783604,0.829457,0.832869,0.829457,0.823661,"{0: 49, 1: 18, 2: 62}"
4,0.3834,0.801004,0.852713,0.854343,0.852713,0.852602,"{0: 46, 1: 25, 2: 58}"
5,0.3143,0.778088,0.837209,0.843,0.837209,0.834289,"{0: 33, 1: 27, 2: 69}"
6,0.2884,0.799475,0.860465,0.873472,0.860465,0.860423,"{0: 53, 1: 22, 2: 54}"
7,0.2758,0.846196,0.852713,0.860853,0.852713,0.851432,"{0: 51, 1: 21, 2: 57}"
8,0.2061,0.865548,0.852713,0.852472,0.852713,0.8469,"{0: 47, 1: 19, 2: 63}"
9,0.1472,0.784947,0.891473,0.8949,0.891473,0.892414,"{0: 42, 1: 30, 2: 57}"
10,0.1436,1.017646,0.860465,0.860486,0.860465,0.860238,"{0: 44, 1: 26, 2: 59}"


Evaluation results for TUM/GottBERT_base_best with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.6042653322219849, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.8210547091465312, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7835766974934838, 'eval_class_distribution': {0: 41, 1: 49, 2: 63}, 'eval_runtime': 2.2826, 'eval_samples_per_second': 67.03, 'eval_steps_per_second': 33.734, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.76      0.89      0.82        36
     Neutral       0.56      0.85      0.67        33
     Positiv       0.93      0.68      0.79        84

    accuracy                           0.76       153
   macro avg       0.75      0.81      0.76       153
weighted avg       0.81      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 50, 2: 61}
Negativ Precision Score: 0.7619047619047619
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8205128205128205

Neutral Precision Score: 0.56
Neutral Recall Score: 0.848484848484

Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_filtered_base_best and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 5031.20 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4689.13 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4640.02 examples/s]


Training results for TUM/GottBERT_filtered_base_best with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8992,0.747352,0.829457,0.851474,0.829457,0.82775,"{0: 57, 1: 19, 2: 53}"
2,0.662,1.118062,0.790698,0.797312,0.790698,0.7846,"{0: 31, 1: 25, 2: 73}"
3,0.6429,0.911352,0.844961,0.843686,0.844961,0.842311,"{0: 44, 1: 22, 2: 63}"
4,0.3884,0.718325,0.837209,0.838675,0.837209,0.836562,"{0: 47, 1: 25, 2: 57}"
5,0.4251,0.935575,0.806202,0.80612,0.806202,0.80314,"{0: 43, 1: 21, 2: 65}"
6,0.3309,0.834874,0.852713,0.851363,0.852713,0.851687,"{0: 42, 1: 25, 2: 62}"
7,0.3428,0.853509,0.837209,0.847963,0.837209,0.836316,"{0: 51, 1: 20, 2: 58}"
8,0.2867,0.982253,0.837209,0.840756,0.837209,0.837999,"{0: 43, 1: 30, 2: 56}"
9,0.2343,0.937703,0.829457,0.833049,0.829457,0.830422,"{0: 44, 1: 29, 2: 56}"


Evaluation results for TUM/GottBERT_filtered_base_best with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.1985373497009277, 'eval_accuracy': 0.7777777777777778, 'eval_precision': 0.803517243503843, 'eval_recall': 0.7777777777777778, 'eval_f1': 0.7803444481980835, 'eval_class_distribution': {0: 48, 1: 38, 2: 67}, 'eval_runtime': 2.2942, 'eval_samples_per_second': 66.69, 'eval_steps_per_second': 33.563, 'epoch': 9.0}
              precision    recall  f1-score   support

     Negativ       0.66      0.92      0.77        36
     Neutral       0.65      0.73      0.69        33
     Positiv       0.91      0.71      0.80        84

    accuracy                           0.76       153
   macro avg       0.74      0.79      0.75       153
weighted avg       0.79      0.76      0.77       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 50, 1: 37, 2: 66}
Negativ Precision Score: 0.66
Negativ Recall Score: 0.9166666666666666
Negativ F1 Score: 0.7674418604651163

Neutral Precision Score: 0.6486486486486487
Neutral Recall Score: 0.72727272727272

Some weights of the model checkpoint at TUM/GottBERT_base_last were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at TUM/GottBERT_base_last and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be abl

Training results for TUM/GottBERT_base_last with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8908,0.899764,0.790698,0.836694,0.790698,0.774315,"{0: 64, 1: 10, 2: 55}"
2,0.5795,0.597934,0.868217,0.872712,0.868217,0.869265,"{0: 39, 1: 31, 2: 59}"
3,0.5518,0.818882,0.844961,0.845007,0.844961,0.840734,"{0: 44, 1: 20, 2: 65}"
4,0.3863,0.704501,0.844961,0.846913,0.844961,0.845576,"{0: 40, 1: 29, 2: 60}"
5,0.317,0.926258,0.829457,0.829596,0.829457,0.828433,"{0: 46, 1: 24, 2: 59}"
6,0.2891,1.231328,0.767442,0.827384,0.767442,0.764328,"{0: 68, 1: 15, 2: 46}"
7,0.2548,1.390793,0.79845,0.810461,0.79845,0.792994,"{0: 54, 1: 17, 2: 58}"
8,0.2857,1.094601,0.813953,0.815068,0.813953,0.811379,"{0: 49, 1: 22, 2: 58}"
9,0.1896,0.95236,0.844961,0.867302,0.844961,0.849351,"{0: 47, 1: 33, 2: 49}"
10,0.1492,0.973283,0.806202,0.816194,0.806202,0.806146,"{0: 52, 1: 24, 2: 53}"


Evaluation results for TUM/GottBERT_base_last with 20 epochs and random seeds: 42, 42



{'eval_loss': 0.8247197270393372, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.8627227146566389, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8471991769106795, 'eval_class_distribution': {0: 36, 1: 44, 2: 73}, 'eval_runtime': 2.3372, 'eval_samples_per_second': 65.463, 'eval_steps_per_second': 32.946, 'epoch': 20.0}
              precision    recall  f1-score   support

     Negativ       0.89      0.86      0.87        36
     Neutral       0.60      0.82      0.69        33
     Positiv       0.90      0.79      0.84        84

    accuracy                           0.81       153
   macro avg       0.80      0.82      0.80       153
weighted avg       0.83      0.81      0.82       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 35, 1: 45, 2: 73}
Negativ Precision Score: 0.8857142857142857
Negativ Recall Score: 0.8611111111111112
Negativ F1 Score: 0.8732394366197183

Neutral Precision Score: 0.6
Neutral Recall Score: 0.818181818181

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1111/1111 [00:00<00:00, 4989.39 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 4058.46 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4513.02 examples/s]


Training results for distilbert/distilbert-base-german-cased with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8258,0.677565,0.813953,0.843767,0.813953,0.815986,"{0: 58, 1: 23, 2: 48}"
2,0.5814,0.651191,0.852713,0.869497,0.852713,0.853473,"{0: 54, 1: 22, 2: 53}"
3,0.5293,0.666169,0.875969,0.883069,0.875969,0.876799,"{0: 49, 1: 25, 2: 55}"
4,0.3057,0.728416,0.852713,0.863971,0.852713,0.854109,"{0: 50, 1: 27, 2: 52}"
5,0.3044,0.73386,0.891473,0.902268,0.891473,0.892245,"{0: 51, 1: 25, 2: 53}"
6,0.1898,0.797213,0.852713,0.862779,0.852713,0.853556,"{0: 50, 1: 27, 2: 52}"
7,0.1804,1.039371,0.852713,0.880041,0.852713,0.853315,"{0: 57, 1: 19, 2: 53}"
8,0.1121,1.162707,0.844961,0.866133,0.844961,0.845914,"{0: 52, 1: 30, 2: 47}"


Evaluation results for distilbert/distilbert-base-german-cased with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.393026351928711, 'eval_accuracy': 0.7843137254901961, 'eval_precision': 0.7951178150891207, 'eval_recall': 0.7843137254901961, 'eval_f1': 0.7869841590152357, 'eval_class_distribution': {0: 41, 1: 37, 2: 75}, 'eval_runtime': 1.2967, 'eval_samples_per_second': 117.994, 'eval_steps_per_second': 59.383, 'epoch': 8.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.83      0.77        36
     Neutral       0.65      0.67      0.66        33
     Positiv       0.88      0.81      0.84        84

    accuracy                           0.78       153
   macro avg       0.75      0.77      0.76       153
weighted avg       0.79      0.78      0.79       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 42, 1: 34, 2: 77}
Negativ Precision Score: 0.7142857142857143
Negativ Recall Score: 0.8333333333333334
Negativ F1 Score: 0.7692307692307693

Neutral Precision Score: 0.6470588235294118
Neutral Recall Score: 

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GerMedBERT/medbert-512 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictio

Training results for GerMedBERT/medbert-512 with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.9047,0.830547,0.775194,0.790361,0.775194,0.776429,"{0: 53, 1: 21, 2: 55}"
2,0.5615,1.171958,0.782946,0.818991,0.782946,0.781938,"{0: 26, 1: 38, 2: 65}"
3,0.5238,1.097914,0.806202,0.809721,0.806202,0.806,"{0: 48, 1: 24, 2: 57}"
4,0.284,1.33616,0.744186,0.763123,0.744186,0.748688,"{0: 49, 1: 30, 2: 50}"
5,0.2547,1.444278,0.767442,0.778759,0.767442,0.769726,"{0: 50, 1: 26, 2: 53}"
6,0.2213,1.147887,0.775194,0.773392,0.775194,0.773813,"{0: 41, 1: 25, 2: 63}"
7,0.1805,1.448047,0.790698,0.796592,0.790698,0.785809,"{0: 50, 1: 18, 2: 61}"
8,0.1039,1.83013,0.751938,0.785689,0.751938,0.757941,"{0: 32, 1: 41, 2: 56}"
9,0.1046,1.649139,0.782946,0.802021,0.782946,0.786608,"{0: 40, 1: 37, 2: 52}"
10,0.0638,1.698795,0.79845,0.799091,0.79845,0.79356,"{0: 38, 1: 21, 2: 70}"


Evaluation results for GerMedBERT/medbert-512 with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.9444224834442139, 'eval_accuracy': 0.7973856209150327, 'eval_precision': 0.7996853062212539, 'eval_recall': 0.7973856209150327, 'eval_f1': 0.7980779663644113, 'eval_class_distribution': {0: 36, 1: 36, 2: 81}, 'eval_runtime': 2.3256, 'eval_samples_per_second': 65.789, 'eval_steps_per_second': 33.109, 'epoch': 19.0}
              precision    recall  f1-score   support

     Negativ       0.71      0.75      0.73        36
     Neutral       0.66      0.76      0.70        33
     Positiv       0.83      0.76      0.80        84

    accuracy                           0.76       153
   macro avg       0.73      0.76      0.74       153
weighted avg       0.77      0.76      0.76       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 38, 2: 77}
Negativ Precision Score: 0.7105263157894737
Negativ Recall Score: 0.75
Negativ F1 Score: 0.7297297297297297

Neutral Precision Score: 0.6578947368421053
Neutral Recall Score: 0.75757575757

Some weights of the model checkpoint at deepset/gbert-base were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infe

Training results for deepset/gbert-base with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.823,0.986467,0.744186,0.825097,0.744186,0.733167,"{0: 71, 1: 9, 2: 49}"
2,0.5169,0.770246,0.806202,0.81761,0.806202,0.808861,"{0: 49, 1: 27, 2: 53}"
3,0.5046,0.813819,0.868217,0.866791,0.868217,0.866213,"{0: 44, 1: 23, 2: 62}"
4,0.2669,0.773547,0.868217,0.870349,0.868217,0.864872,"{0: 45, 1: 20, 2: 64}"
5,0.2162,0.918801,0.860465,0.859442,0.860465,0.859648,"{0: 43, 1: 25, 2: 61}"
6,0.1232,0.907457,0.875969,0.877259,0.875969,0.874309,"{0: 46, 1: 22, 2: 61}"
7,0.0843,1.156711,0.852713,0.856227,0.852713,0.853097,"{0: 47, 1: 25, 2: 57}"
8,0.0493,1.262757,0.844961,0.843133,0.844961,0.843581,"{0: 40, 1: 26, 2: 63}"
9,0.0001,1.256452,0.868217,0.867965,0.868217,0.865979,"{0: 44, 1: 22, 2: 63}"
10,0.001,1.363554,0.860465,0.86107,0.860465,0.8607,"{0: 43, 1: 27, 2: 59}"


Evaluation results for deepset/gbert-base with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3813351392745972, 'eval_accuracy': 0.8169934640522876, 'eval_precision': 0.8154736593432554, 'eval_recall': 0.8169934640522876, 'eval_f1': 0.8153393934525169, 'eval_class_distribution': {0: 40, 1: 30, 2: 83}, 'eval_runtime': 2.3715, 'eval_samples_per_second': 64.517, 'eval_steps_per_second': 32.469, 'epoch': 14.0}
              precision    recall  f1-score   support

     Negativ       0.84      0.89      0.86        36
     Neutral       0.72      0.70      0.71        33
     Positiv       0.88      0.87      0.87        84

    accuracy                           0.84       153
   macro avg       0.81      0.82      0.82       153
weighted avg       0.84      0.84      0.84       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 38, 1: 32, 2: 83}
Negativ Precision Score: 0.8421052631578947
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8648648648648649

Neutral Precision Score: 0.71875
Neutral Recall Score: 0.69696969

In [6]:
# Single fine-tuning run for the aari1995/German_Sentiment checkpoint
# (20-epoch budget, both random seeds fixed to 42).
absa_model(
    data,
    "aari1995/German_Sentiment",
    rn1=42,
    rn2=42,
    epochs=20,
)

Training Sentiment label count:  {'negativ': 338, 'neutral': 275, 'positiv': 498}
Validation Sentiment label count:  {'negativ': 42, 'neutral': 27, 'positiv': 60}
Test Sentiment label count:  {'negativ': 36, 'neutral': 33, 'positiv': 84}
Class weights for (negative, neutral, positive): tensor([1.0957, 1.3467, 0.7436])


Map: 100%|██████████| 1111/1111 [00:00<00:00, 3899.23 examples/s]
Map: 100%|██████████| 129/129 [00:00<00:00, 3720.76 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 3676.22 examples/s]


Training results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Class Distribution
1,0.8808,1.006432,0.821705,0.884794,0.821705,0.809369,"{0: 65, 1: 10, 2: 54}"
2,0.5463,0.70961,0.860465,0.860465,0.860465,0.860465,"{0: 42, 1: 27, 2: 60}"
3,0.5244,0.786635,0.875969,0.896885,0.875969,0.871794,"{0: 53, 1: 16, 2: 60}"
4,0.2522,0.85272,0.868217,0.877711,0.868217,0.865526,"{0: 51, 1: 19, 2: 59}"
5,0.3405,0.835548,0.875969,0.889401,0.875969,0.873973,"{0: 49, 1: 18, 2: 62}"
6,0.1771,0.738415,0.844961,0.875502,0.844961,0.852286,"{0: 37, 1: 39, 2: 53}"
7,0.1289,0.886112,0.891473,0.903614,0.891473,0.893073,"{0: 51, 1: 24, 2: 54}"
8,0.0758,0.966485,0.852713,0.859588,0.852713,0.855141,"{0: 40, 1: 31, 2: 58}"
9,0.034,1.430055,0.837209,0.853782,0.837209,0.841637,"{0: 37, 1: 35, 2: 57}"
10,0.0399,1.096365,0.883721,0.904813,0.883721,0.880644,"{0: 55, 1: 17, 2: 57}"


Evaluation results for aari1995/German_Sentiment with 20 epochs and random seeds: 42, 42



{'eval_loss': 1.3768686056137085, 'eval_accuracy': 0.8431372549019608, 'eval_precision': 0.8470219666814094, 'eval_recall': 0.8431372549019608, 'eval_f1': 0.8445081084122354, 'eval_class_distribution': {0: 38, 1: 35, 2: 80}, 'eval_runtime': 7.2857, 'eval_samples_per_second': 21.0, 'eval_steps_per_second': 10.569, 'epoch': 19.0}
              precision    recall  f1-score   support

     Negativ       0.80      0.89      0.84        36
     Neutral       0.66      0.70      0.68        33
     Positiv       0.91      0.85      0.88        84

    accuracy                           0.82       153
   macro avg       0.79      0.81      0.80       153
weighted avg       0.83      0.82      0.83       153

True label distribution: {0: 36, 1: 33, 2: 84}
Predicted label distribution: {0: 40, 1: 35, 2: 78}
Negativ Precision Score: 0.8
Negativ Recall Score: 0.8888888888888888
Negativ F1 Score: 0.8421052631578947

Neutral Precision Score: 0.6571428571428571
Neutral Recall Score: 0.69696969696969

## Cross-Validation to check stability

In [None]:
# Cross-validated run for one checkpoint; nothing is saved (save=False).
# NOTE(review): the two return values are presumably the per-metric mean and
# standard deviation over the n_splits folds - confirm in functions.absa_model_train.
avg_metrics, std_metrics = absa_model_kfold(
    data,
    "dbmdz/bert-base-german-cased",
    rn1=42,
    rn2=42,
    epochs=5,
    n_splits=3,
    save=False,
)

In [None]:
# Repeat the cross-validated run for every checkpoint listed in `models` and
# gather the two values returned by absa_model_kfold per checkpoint.
# NOTE(review): presumably these are the mean and standard deviation of the
# metrics across folds - confirm in functions.absa_model_train.
all_model_metrics = dict()

for model in models:
    print(f'training and results for {model}:')
    avg_metrics, std_metrics = absa_model_kfold(
        data,
        model,
        rn1=42,
        rn2=42,
        epochs=5,
        n_splits=3,
        save=False,
    )

    # Keep both results side by side, keyed by the checkpoint name.
    all_model_metrics[model] = dict(
        avg_metrics=avg_metrics,
        std_metrics=std_metrics,
    )

    # Blank line to separate the output of consecutive checkpoints.
    print()

# Lookup pattern:
# all_model_metrics['model_name']['avg_metrics']
# all_model_metrics['model_name']['std_metrics']