# Fine-tuning Classifier LLM


In [1]:
!pip install optuna
!pip install typing
!pip install evaluate
!pip install torch
!pip install transformers
pip install accelerate>=0.26.0



In [1]:
# setup - load packages
import pandas as pd
from datasets import Dataset, load_dataset
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from typing import Union, Mapping, List, Dict, Any
import evaluate
from tqdm import tqdm
import zipfile
import os
import accelerate


# Pick the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Seed reused for shuffling and splitting the data
seed = 13

cuda


## VERSION B

In [2]:
# Read the version-B classifier data and wrap it as a Hugging Face dataset
classifier_data = pd.read_csv("../data/classifier_data_B.csv")
data = Dataset.from_pandas(classifier_data)

# Shuffle once with the fixed seed so the splits are reproducible
raw_dataset = data.shuffle(seed=seed)

# First cut: 70% train, 30% held out
split = raw_dataset.train_test_split(test_size=0.3, seed=seed)
train_data, text_and_val_data = split["train"], split["test"]

# Second cut: halve the held-out part into 15% test and 15% validation
split = text_and_val_data.train_test_split(test_size=0.5, seed=seed)
test_data, val_data = split["train"], split["test"]

# Report the size of each split
print(f"Training samples party: {len(train_data)}")
print(f"Test samples party: {len(test_data)}")
print(f"Validation samples party: {len(val_data)}")



# TODO: check the class distribution and decide whether the training data needs balancing


Training samples party: 25281
Test samples party: 5418
Validation samples party: 5418


In [3]:
# Maximum number of tokens per tokenized window (used as the tokenizer's max length)
WINDOW_LENGTH = 512
# Passed as the tokenizer's `stride` when long texts overflow into several windows;
# per the Hugging Face tokenizer docs this is the number of tokens consecutive windows share
STRIDE = 256

In [4]:
# Checkpoint name and number of target classes used by this notebook
model_name = "bert-base-german-cased"
num_labels = 6

# Load the tokenizer that matches the checkpoint, capped at one window length
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=WINDOW_LENGTH)


In [5]:
# Party labels; class ids are assigned by alphabetical position so the mapping is stable
label_names = ['CDU/CSU', 'SPD', 'GRÜNE', 'FDP', 'AfD', 'LINKE']
id2label = dict(enumerate(sorted(label_names)))
label2id = {name: idx for idx, name in id2label.items()}

In [6]:
def sliding_window_tokenize(batch):
    """Tokenize a batch of speeches into fixed-length, overlapping windows.

    Texts longer than WINDOW_LENGTH tokens are split into several windows
    (the tokenizer is called with ``stride=STRIDE`` and
    ``return_overflowing_tokens=True``); every window is padded to
    WINDOW_LENGTH.

    Args:
        batch: Mapping with a "speech_text" list of texts and a "label" list
            whose entries are keys of ``label2id``.

    Returns:
        The tokenizer output with an added "labels" entry holding, for each
        window, the class id of the speech the window was cut from.
    """
    speeches = batch["speech_text"]
    speech_labels = batch["label"]

    encoded = tokenizer(
        speeches,
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )

    # A speech can yield several windows; "overflow_to_sample_mapping" gives the
    # index of the source speech for each window, so the label is repeated per window.
    window_labels = []
    for source_idx in encoded["overflow_to_sample_mapping"]:
        window_labels.append(label2id[speech_labels[source_idx]])
    encoded["labels"] = window_labels

    return encoded


In [7]:
# Tokenize the train and validation splits into sliding windows.
# Each split drops its own original columns: a speech may expand into several
# windows, so the mapped output cannot keep the per-speech columns alongside it.
tokenized_train_data = train_data.map(
    sliding_window_tokenize,
    batched=True,
    remove_columns=train_data.column_names,
)

tokenized_val_data = val_data.map(
    sliding_window_tokenize,
    batched=True,
    # Use the validation split's own column names instead of borrowing the training split's
    remove_columns=val_data.column_names,
)

Map:   0%|          | 0/25281 [00:00<?, ? examples/s]

Map:   0%|          | 0/5418 [00:00<?, ? examples/s]

In [8]:
# Load the pretrained checkpoint with a fresh classification head sized to the party labels.
# The checkpoint comes from `model_name` so it stays in sync with the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

# Metric objects used by compute_metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row
            is the argmax of the logits along axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    scores = {}
    scores["accuracy"] = accuracy_metric.compute(
        predictions=predicted_ids, references=labels
    )["accuracy"]
    # Macro averaging weights every class equally, regardless of its frequency
    scores["f1"] = f1_metric.compute(
        predictions=predicted_ids, references=labels, average="macro"
    )["f1"]
    return scores


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Hyperparameter Tuning

In [9]:
# hyperparameter tuning setup: model factory, training arguments, search space, and the search itself
def model_init():
    """Build a fresh, untrained classifier for a hyperparameter-search trial.

    Passed to the Trainer as ``model_init`` so every trial starts from the
    pretrained checkpoint instead of reusing weights from a previous trial.
    The label mappings are attached so the model's config carries the party
    names instead of generic label ids.

    Returns:
        A sequence-classification model loaded from ``model_name``.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

# Base training configuration shared by all tuning trials
training_args = TrainingArguments(
    # where run artifacts and logs are written
    output_dir="./results",
    logging_dir="./logs",
    # evaluate after every epoch, but never write checkpoints while tuning
    eval_strategy="epoch",
    save_strategy="no",
    # no progress bars and no external experiment trackers (e.g. W&B)
    disable_tqdm=True,
    report_to="none",
)

def hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_categorical`` and
            ``suggest_int`` (an Optuna trial).

    Returns:
        Dict of training-argument overrides: learning rate (log-uniform in
        [1e-5, 5e-5]), per-device train batch size (8, 16 or 32), number of
        epochs (3 or 4) and weight decay (uniform in [0.0, 0.3]).
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    epochs = trial.suggest_int("num_train_epochs", 3, 4)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": weight_decay,
    }

# Trainer used for the hyperparameter search. It receives `model_init` rather than a
# model instance so that every trial starts from a freshly loaded checkpoint.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its replacement
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the Optuna search: 12 trials, each scored by the validation accuracy
# found in the metrics handed to the objective (higher is better).
best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=lambda eval_metrics: eval_metrics["eval_accuracy"],
    direction="maximize",
    n_trials=12,
    backend="optuna",
)

# Show the winning configuration
print("Best hyperparameters:")
print(best_run.hyperparameters)


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-08-01 16:57:57,777] A new study created in memory with name: no-name-da855184-2fc6-43f4-86d0-16055b932a64
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4537, 'grad_norm': 5.947567939758301, 'learning_rate': 1.8528543363470492e-05, 'epoch': 0.5995203836930456}
{'eval_loss': 1.061736822128296, 'eval_accuracy': 0.5855413230391296, 'eval_f1': 0.5856789812325323, 'eval_runtime': 80.5806, 'eval_samples_per_second': 70.724, 'eval_steps_per_second': 8.848, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.1109, 'grad_norm': 10.283782958984375, 'learning_rate': 1.526302638013061e-05, 'epoch': 1.1990407673860912}
{'loss': 0.9223, 'grad_norm': 16.60808753967285, 'learning_rate': 1.1997509396790729e-05, 'epoch': 1.7985611510791366}
{'eval_loss': 0.9230913519859314, 'eval_accuracy': 0.6378311984558694, 'eval_f1': 0.6322667488641837, 'eval_runtime': 79.7667, 'eval_samples_per_second': 71.446, 'eval_steps_per_second': 8.939, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7618, 'grad_norm': 13.900824546813965, 'learning_rate': 8.731992413450846e-06, 'epoch': 2.3980815347721824}
{'loss': 0.6844, 'grad_norm': 10.650286674499512, 'learning_rate': 5.466475430110963e-06, 'epoch': 2.997601918465228}
{'eval_loss': 0.9260182976722717, 'eval_accuracy': 0.6559045446569574, 'eval_f1': 0.6501574948791311, 'eval_runtime': 80.3917, 'eval_samples_per_second': 70.89, 'eval_steps_per_second': 8.869, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5273, 'grad_norm': 10.9815673828125, 'learning_rate': 2.2009584467710808e-06, 'epoch': 3.597122302158273}
{'eval_loss': 0.9637743234634399, 'eval_accuracy': 0.6543253202316196, 'eval_f1': 0.6554568677327564, 'eval_runtime': 79.7398, 'eval_samples_per_second': 71.47, 'eval_steps_per_second': 8.942, 'epoch': 4.0}
{'train_runtime': 4249.6591, 'train_samples_per_second': 25.093, 'train_steps_per_second': 0.785, 'train_loss': 0.8687293192179655, 'epoch': 4.0}


[I 2025-08-01 18:08:48,873] Trial 0 finished with value: 0.6543253202316196 and parameters: {'learning_rate': 2.1787529312843695e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.17377118733943814}. Best is trial 0 with value: 0.6543253202316196.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.7306, 'grad_norm': 8.447221755981445, 'learning_rate': 3.495948518605665e-05, 'epoch': 0.15001500150015}
{'loss': 1.6048, 'grad_norm': 32.88897705078125, 'learning_rate': 3.359739194261955e-05, 'epoch': 0.3000300030003}
{'loss': 1.5337, 'grad_norm': 9.852897644042969, 'learning_rate': 3.2235298699182446e-05, 'epoch': 0.45004500450045004}
{'loss': 1.4379, 'grad_norm': 94.99247741699219, 'learning_rate': 3.087320545574535e-05, 'epoch': 0.6000600060006}
{'loss': 1.4015, 'grad_norm': 11.775771141052246, 'learning_rate': 2.9511112212308245e-05, 'epoch': 0.7500750075007501}
{'loss': 1.3522, 'grad_norm': 7.481191158294678, 'learning_rate': 2.8149018968871142e-05, 'epoch': 0.9000900090009001}
{'eval_loss': 1.3086260557174683, 'eval_accuracy': 0.4930689594665731, 'eval_f1': 0.41231148553808966, 'eval_runtime': 80.1324, 'eval_samples_per_second': 71.12, 'eval_steps_per_second': 8.898, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3132, 'grad_norm': 10.767287254333496, 'learning_rate': 2.678692572543404e-05, 'epoch': 1.05010501050105}
{'loss': 1.2649, 'grad_norm': 23.12171173095703, 'learning_rate': 2.5424832481996938e-05, 'epoch': 1.2001200120012}
{'loss': 1.2406, 'grad_norm': 10.577842712402344, 'learning_rate': 2.4062739238559836e-05, 'epoch': 1.3501350135013501}
{'loss': 1.1908, 'grad_norm': 21.127695083618164, 'learning_rate': 2.2700645995122737e-05, 'epoch': 1.5001500150015001}
{'loss': 1.1479, 'grad_norm': 13.480355262756348, 'learning_rate': 2.1338552751685635e-05, 'epoch': 1.6501650165016502}
{'loss': 1.0983, 'grad_norm': 26.9051570892334, 'learning_rate': 1.9976459508248532e-05, 'epoch': 1.8001800180018002}
{'loss': 1.0699, 'grad_norm': 27.675125122070312, 'learning_rate': 1.861436626481143e-05, 'epoch': 1.9501950195019502}
{'eval_loss': 1.18199622631073, 'eval_accuracy': 0.5693981400245657, 'eval_f1': 0.5570309152034459, 'eval_runtime': 80.3794, 'eval_samples_per_second': 70.901, 'eval_step

  return forward_call(*args, **kwargs)


{'loss': 1.0162, 'grad_norm': 19.061208724975586, 'learning_rate': 1.7252273021374328e-05, 'epoch': 2.1002100210021}
{'loss': 0.9165, 'grad_norm': 19.91899299621582, 'learning_rate': 1.589017977793723e-05, 'epoch': 2.25022502250225}
{'loss': 0.9397, 'grad_norm': 27.931062698364258, 'learning_rate': 1.4528086534500127e-05, 'epoch': 2.4002400240024}
{'loss': 0.9474, 'grad_norm': 10.376792907714844, 'learning_rate': 1.3165993291063024e-05, 'epoch': 2.5502550255025502}
{'loss': 0.9011, 'grad_norm': 52.54730987548828, 'learning_rate': 1.1803900047625922e-05, 'epoch': 2.7002700270027002}
{'loss': 0.8764, 'grad_norm': 36.312355041503906, 'learning_rate': 1.0441806804188822e-05, 'epoch': 2.8502850285028503}
{'eval_loss': 1.0593117475509644, 'eval_accuracy': 0.6157220565011405, 'eval_f1': 0.6007828871338664, 'eval_runtime': 84.1413, 'eval_samples_per_second': 67.731, 'eval_steps_per_second': 8.474, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8828, 'grad_norm': 23.85162353515625, 'learning_rate': 9.07971356075172e-06, 'epoch': 3.0003000300030003}
{'loss': 0.7173, 'grad_norm': 34.265785217285156, 'learning_rate': 7.717620317314617e-06, 'epoch': 3.1503150315031503}
{'loss': 0.7227, 'grad_norm': 40.49232482910156, 'learning_rate': 6.355527073877516e-06, 'epoch': 3.3003300330033003}
{'loss': 0.7424, 'grad_norm': 22.092140197753906, 'learning_rate': 4.993433830440414e-06, 'epoch': 3.4503450345034503}
{'loss': 0.7454, 'grad_norm': 47.0213508605957, 'learning_rate': 3.631340587003313e-06, 'epoch': 3.6003600360036003}
{'loss': 0.6696, 'grad_norm': 19.92720603942871, 'learning_rate': 2.269247343566211e-06, 'epoch': 3.7503750375037503}
{'loss': 0.6962, 'grad_norm': 29.740478515625, 'learning_rate': 9.071541001291096e-07, 'epoch': 3.9003900390039004}
{'eval_loss': 1.083469271659851, 'eval_accuracy': 0.6311633619933321, 'eval_f1': 0.6262641313353853, 'eval_runtime': 81.2527, 'eval_samples_per_second': 70.139, 'eval_steps_per

[I 2025-08-01 19:26:36,699] Trial 1 finished with value: 0.6311633619933321 and parameters: {'learning_rate': 3.631885424300688e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.05718841552421505}. Best is trial 0 with value: 0.6543253202316196.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.3955, 'grad_norm': 7.277722358703613, 'learning_rate': 3.910347437691134e-05, 'epoch': 0.5995203836930456}
{'eval_loss': 0.999954104423523, 'eval_accuracy': 0.6050184242849622, 'eval_f1': 0.5967833710195204, 'eval_runtime': 79.75, 'eval_samples_per_second': 71.461, 'eval_steps_per_second': 8.94, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0441, 'grad_norm': 10.821451187133789, 'learning_rate': 3.2211779914995345e-05, 'epoch': 1.1990407673860912}
{'loss': 0.8593, 'grad_norm': 20.454172134399414, 'learning_rate': 2.5320085453079357e-05, 'epoch': 1.7985611510791366}
{'eval_loss': 0.916075587272644, 'eval_accuracy': 0.6434462186348482, 'eval_f1': 0.6300214304331898, 'eval_runtime': 79.9019, 'eval_samples_per_second': 71.325, 'eval_steps_per_second': 8.923, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.6663, 'grad_norm': 15.334062576293945, 'learning_rate': 1.8428390991163363e-05, 'epoch': 2.3980815347721824}
{'loss': 0.579, 'grad_norm': 12.408621788024902, 'learning_rate': 1.1536696529247371e-05, 'epoch': 2.997601918465228}
{'eval_loss': 0.9472567439079285, 'eval_accuracy': 0.6609931566941569, 'eval_f1': 0.653818295054949, 'eval_runtime': 80.1954, 'eval_samples_per_second': 71.064, 'eval_steps_per_second': 8.891, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.357, 'grad_norm': 13.704679489135742, 'learning_rate': 4.645002067331378e-06, 'epoch': 3.597122302158273}
{'eval_loss': 1.0625407695770264, 'eval_accuracy': 0.66344972802246, 'eval_f1': 0.66278193798667, 'eval_runtime': 79.8753, 'eval_samples_per_second': 71.349, 'eval_steps_per_second': 8.926, 'epoch': 4.0}
{'train_runtime': 4247.1743, 'train_samples_per_second': 25.108, 'train_steps_per_second': 0.785, 'train_loss': 0.7676925727789351, 'epoch': 4.0}


[I 2025-08-01 20:37:25,129] Trial 2 finished with value: 0.66344972802246 and parameters: {'learning_rate': 4.59813854499035e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.21941660232006788}. Best is trial 2 with value: 0.66344972802246.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.7597, 'grad_norm': 4.577311038970947, 'learning_rate': 3.713922391342888e-05, 'epoch': 0.15001500150015}
{'loss': 1.7498, 'grad_norm': 7.358942985534668, 'learning_rate': 3.5184527917985254e-05, 'epoch': 0.3000300030003}
{'loss': 1.7418, 'grad_norm': 6.218388557434082, 'learning_rate': 3.322983192254163e-05, 'epoch': 0.45004500450045004}
{'loss': 1.727, 'grad_norm': 6.527444839477539, 'learning_rate': 3.1275135927098e-05, 'epoch': 0.6000600060006}
{'loss': 1.7351, 'grad_norm': 6.40305233001709, 'learning_rate': 2.9320439931654376e-05, 'epoch': 0.7500750075007501}
{'loss': 1.7361, 'grad_norm': 7.609278202056885, 'learning_rate': 2.7365743936210753e-05, 'epoch': 0.9000900090009001}
{'eval_loss': 1.7522971630096436, 'eval_accuracy': 0.2654851728373399, 'eval_f1': 0.06992974671843225, 'eval_runtime': 78.5837, 'eval_samples_per_second': 72.521, 'eval_steps_per_second': 9.073, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.7229, 'grad_norm': 5.871448993682861, 'learning_rate': 2.5411047940767124e-05, 'epoch': 1.05010501050105}
{'loss': 1.7323, 'grad_norm': 3.804692029953003, 'learning_rate': 2.34563519453235e-05, 'epoch': 1.2001200120012}
{'loss': 1.7332, 'grad_norm': 3.1752943992614746, 'learning_rate': 2.150165594987988e-05, 'epoch': 1.3501350135013501}
{'loss': 1.7207, 'grad_norm': 4.494482040405273, 'learning_rate': 1.954695995443625e-05, 'epoch': 1.5001500150015001}
{'loss': 1.7313, 'grad_norm': 4.692550182342529, 'learning_rate': 1.7592263958992627e-05, 'epoch': 1.6501650165016502}
{'loss': 1.7311, 'grad_norm': 4.576578617095947, 'learning_rate': 1.5637567963549e-05, 'epoch': 1.8001800180018002}
{'loss': 1.7336, 'grad_norm': 5.250627517700195, 'learning_rate': 1.3682871968105377e-05, 'epoch': 1.9501950195019502}
{'eval_loss': 1.7327665090560913, 'eval_accuracy': 0.2654851728373399, 'eval_f1': 0.06992974671843225, 'eval_runtime': 79.8256, 'eval_samples_per_second': 71.393, 'eval_steps_per

  return forward_call(*args, **kwargs)


{'loss': 1.7337, 'grad_norm': 4.193598747253418, 'learning_rate': 1.172817597266175e-05, 'epoch': 2.1002100210021}
{'loss': 1.7142, 'grad_norm': 4.224034309387207, 'learning_rate': 9.773479977218125e-06, 'epoch': 2.25022502250225}
{'loss': 1.7333, 'grad_norm': 5.126185417175293, 'learning_rate': 7.8187839817745e-06, 'epoch': 2.4002400240024}
{'loss': 1.7303, 'grad_norm': 3.0372958183288574, 'learning_rate': 5.864087986330875e-06, 'epoch': 2.5502550255025502}
{'loss': 1.7162, 'grad_norm': 4.151998519897461, 'learning_rate': 3.90939199088725e-06, 'epoch': 2.7002700270027002}
{'loss': 1.6536, 'grad_norm': 6.818431377410889, 'learning_rate': 1.954695995443625e-06, 'epoch': 2.8502850285028503}
{'eval_loss': 1.6069438457489014, 'eval_accuracy': 0.31426566064221795, 'eval_f1': 0.14861697373813137, 'eval_runtime': 79.8675, 'eval_samples_per_second': 71.356, 'eval_steps_per_second': 8.927, 'epoch': 3.0}
{'train_runtime': 3740.0052, 'train_samples_per_second': 21.384, 'train_steps_per_second': 2

[I 2025-08-01 21:39:46,401] Trial 3 finished with value: 0.31426566064221795 and parameters: {'learning_rate': 3.9090010516881616e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.29806962128746783}. Best is trial 2 with value: 0.66344972802246.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4707, 'grad_norm': 7.994380950927734, 'learning_rate': 1.0184688005320043e-05, 'epoch': 0.5995203836930456}
{'eval_loss': 1.1748024225234985, 'eval_accuracy': 0.533602386383576, 'eval_f1': 0.5190698293562787, 'eval_runtime': 80.3431, 'eval_samples_per_second': 70.933, 'eval_steps_per_second': 8.874, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2104, 'grad_norm': 8.844426155090332, 'learning_rate': 8.3897130308188e-06, 'epoch': 1.1990407673860912}
{'loss': 1.0268, 'grad_norm': 15.250950813293457, 'learning_rate': 6.59473805631756e-06, 'epoch': 1.7985611510791366}
{'eval_loss': 0.9959552884101868, 'eval_accuracy': 0.6080014037550447, 'eval_f1': 0.6077275569233633, 'eval_runtime': 37.6406, 'eval_samples_per_second': 151.406, 'eval_steps_per_second': 18.942, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.891, 'grad_norm': 14.344752311706543, 'learning_rate': 4.799763081816318e-06, 'epoch': 2.3980815347721824}
{'loss': 0.8259, 'grad_norm': 13.60208797454834, 'learning_rate': 3.0047881073150774e-06, 'epoch': 2.997601918465228}
{'eval_loss': 0.9475002884864807, 'eval_accuracy': 0.6395858922618003, 'eval_f1': 0.6308113956068795, 'eval_runtime': 37.6234, 'eval_samples_per_second': 151.475, 'eval_steps_per_second': 18.951, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7256, 'grad_norm': 12.31307315826416, 'learning_rate': 1.2098131328138365e-06, 'epoch': 3.597122302158273}
{'eval_loss': 0.9602210521697998, 'eval_accuracy': 0.6318652395157045, 'eval_f1': 0.6332539365383422, 'eval_runtime': 37.5464, 'eval_samples_per_second': 151.786, 'eval_steps_per_second': 18.99, 'epoch': 4.0}
{'train_runtime': 2587.49, 'train_samples_per_second': 41.212, 'train_steps_per_second': 1.289, 'train_loss': 0.9922336285634579, 'epoch': 4.0}


[I 2025-08-01 22:22:55,172] Trial 4 finished with value: 0.6318652395157045 and parameters: {'learning_rate': 1.197607302987228e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.1118295687729018}. Best is trial 2 with value: 0.66344972802246.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.5486, 'grad_norm': 10.697694778442383, 'learning_rate': 1.1639475289629497e-05, 'epoch': 0.29994001199760045}
{'loss': 1.3375, 'grad_norm': 12.028133392333984, 'learning_rate': 1.0696091006145181e-05, 'epoch': 0.5998800239952009}
{'loss': 1.2119, 'grad_norm': 8.412325859069824, 'learning_rate': 9.752706722660864e-06, 'epoch': 0.8998200359928015}
{'eval_loss': 1.0578627586364746, 'eval_accuracy': 0.5783470784348131, 'eval_f1': 0.5728201141002277, 'eval_runtime': 37.5856, 'eval_samples_per_second': 151.627, 'eval_steps_per_second': 18.97, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0508, 'grad_norm': 18.18069839477539, 'learning_rate': 8.809322439176548e-06, 'epoch': 1.1997600479904018}
{'loss': 0.9546, 'grad_norm': 27.85676383972168, 'learning_rate': 7.865938155692233e-06, 'epoch': 1.4997000599880024}
{'loss': 0.9197, 'grad_norm': 19.607519149780273, 'learning_rate': 6.922553872207915e-06, 'epoch': 1.799640071985603}
{'eval_loss': 0.9436349868774414, 'eval_accuracy': 0.6351991577469731, 'eval_f1': 0.6331626030700789, 'eval_runtime': 37.7916, 'eval_samples_per_second': 150.801, 'eval_steps_per_second': 18.867, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8497, 'grad_norm': 12.758941650390625, 'learning_rate': 5.979169588723598e-06, 'epoch': 2.0995800839832035}
{'loss': 0.7363, 'grad_norm': 26.483840942382812, 'learning_rate': 5.035785305239282e-06, 'epoch': 2.3995200959808036}
{'loss': 0.7302, 'grad_norm': 34.604103088378906, 'learning_rate': 4.092401021754965e-06, 'epoch': 2.699460107978404}
{'loss': 0.7038, 'grad_norm': 18.7579288482666, 'learning_rate': 3.149016738270649e-06, 'epoch': 2.9994001199760048}
{'eval_loss': 0.9210144281387329, 'eval_accuracy': 0.6588875241270399, 'eval_f1': 0.656044146946713, 'eval_runtime': 37.8119, 'eval_samples_per_second': 150.72, 'eval_steps_per_second': 18.857, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5791, 'grad_norm': 15.093894004821777, 'learning_rate': 2.2056324547863323e-06, 'epoch': 3.2993401319736053}
{'loss': 0.5872, 'grad_norm': 33.22097396850586, 'learning_rate': 1.2622481713020154e-06, 'epoch': 3.599280143971206}
{'loss': 0.5605, 'grad_norm': 10.859090805053711, 'learning_rate': 3.1886388781769903e-07, 'epoch': 3.8992201559688064}
{'eval_loss': 0.961851179599762, 'eval_accuracy': 0.6525706264256887, 'eval_f1': 0.6566040658826694, 'eval_runtime': 37.8076, 'eval_samples_per_second': 150.737, 'eval_steps_per_second': 18.859, 'epoch': 4.0}
{'train_runtime': 2052.5083, 'train_samples_per_second': 51.954, 'train_steps_per_second': 3.249, 'train_loss': 0.8969270537982247, 'epoch': 4.0}


[I 2025-08-01 22:57:08,776] Trial 5 finished with value: 0.6525706264256887 and parameters: {'learning_rate': 1.2580972804546846e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.0686923404819531}. Best is trial 2 with value: 0.66344972802246.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6132, 'grad_norm': 7.645420074462891, 'learning_rate': 2.500394736242887e-05, 'epoch': 0.15001500150015}
{'loss': 1.4219, 'grad_norm': 15.37460708618164, 'learning_rate': 2.4029742291033683e-05, 'epoch': 0.3000300030003}
{'loss': 1.3498, 'grad_norm': 23.142534255981445, 'learning_rate': 2.3055537219638497e-05, 'epoch': 0.45004500450045004}
{'loss': 1.2062, 'grad_norm': 9.419940948486328, 'learning_rate': 2.2081332148243308e-05, 'epoch': 0.6000600060006}
{'loss': 1.1438, 'grad_norm': 12.361284255981445, 'learning_rate': 2.110712707684812e-05, 'epoch': 0.7500750075007501}
{'loss': 1.1056, 'grad_norm': 18.70070457458496, 'learning_rate': 2.0132922005452936e-05, 'epoch': 0.9000900090009001}
{'eval_loss': 1.0175377130508423, 'eval_accuracy': 0.6067731180908932, 'eval_f1': 0.5995843849354824, 'eval_runtime': 37.8083, 'eval_samples_per_second': 150.734, 'eval_steps_per_second': 18.858, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0317, 'grad_norm': 20.549762725830078, 'learning_rate': 1.915871693405775e-05, 'epoch': 1.05010501050105}
{'loss': 0.9068, 'grad_norm': 22.784406661987305, 'learning_rate': 1.818451186266256e-05, 'epoch': 1.2001200120012}
{'loss': 0.8847, 'grad_norm': 28.953094482421875, 'learning_rate': 1.7210306791267374e-05, 'epoch': 1.3501350135013501}
{'loss': 0.875, 'grad_norm': 17.938974380493164, 'learning_rate': 1.623610171987219e-05, 'epoch': 1.5001500150015001}
{'loss': 0.8533, 'grad_norm': 11.379889488220215, 'learning_rate': 1.5261896648477e-05, 'epoch': 1.6501650165016502}
{'loss': 0.8379, 'grad_norm': 18.909099578857422, 'learning_rate': 1.4287691577081813e-05, 'epoch': 1.8001800180018002}
{'loss': 0.8052, 'grad_norm': 27.410083770751953, 'learning_rate': 1.3313486505686625e-05, 'epoch': 1.9501950195019502}
{'eval_loss': 0.9549838900566101, 'eval_accuracy': 0.6362519740305317, 'eval_f1': 0.633289194662922, 'eval_runtime': 37.6614, 'eval_samples_per_second': 151.322, 'eval_step

  return forward_call(*args, **kwargs)


{'loss': 0.6944, 'grad_norm': 25.372718811035156, 'learning_rate': 1.233928143429144e-05, 'epoch': 2.1002100210021}
{'loss': 0.6, 'grad_norm': 26.096466064453125, 'learning_rate': 1.1365076362896252e-05, 'epoch': 2.25022502250225}
{'loss': 0.5934, 'grad_norm': 27.69170379638672, 'learning_rate': 1.0390871291501064e-05, 'epoch': 2.4002400240024}
{'loss': 0.6129, 'grad_norm': 28.166915893554688, 'learning_rate': 9.416666220105878e-06, 'epoch': 2.5502550255025502}
{'loss': 0.5775, 'grad_norm': 30.282426834106445, 'learning_rate': 8.44246114871069e-06, 'epoch': 2.7002700270027002}
{'loss': 0.5757, 'grad_norm': 44.19147872924805, 'learning_rate': 7.468256077315504e-06, 'epoch': 2.8502850285028503}
{'eval_loss': 0.983411431312561, 'eval_accuracy': 0.6664327074925426, 'eval_f1': 0.6643142653244971, 'eval_runtime': 37.6239, 'eval_samples_per_second': 151.473, 'eval_steps_per_second': 18.951, 'epoch': 3.0}
{'loss': 0.5544, 'grad_norm': 29.45746421813965, 'learning_rate': 6.494051005920317e-06, 

  return forward_call(*args, **kwargs)


{'loss': 0.3585, 'grad_norm': 32.284446716308594, 'learning_rate': 5.51984593452513e-06, 'epoch': 3.1503150315031503}
{'loss': 0.3656, 'grad_norm': 21.853315353393555, 'learning_rate': 4.545640863129943e-06, 'epoch': 3.3003300330033003}
{'loss': 0.3826, 'grad_norm': 36.43042755126953, 'learning_rate': 3.571435791734756e-06, 'epoch': 3.4503450345034503}
{'loss': 0.3638, 'grad_norm': 28.935213088989258, 'learning_rate': 2.5972307203395687e-06, 'epoch': 3.6003600360036003}
{'loss': 0.352, 'grad_norm': 38.27227020263672, 'learning_rate': 1.6230256489443816e-06, 'epoch': 3.7503750375037503}
{'loss': 0.3651, 'grad_norm': 38.64236831665039, 'learning_rate': 6.488205775491945e-07, 'epoch': 3.9003900390039004}
{'eval_loss': 1.2479426860809326, 'eval_accuracy': 0.6706439726267767, 'eval_f1': 0.670705355372517, 'eval_runtime': 39.5256, 'eval_samples_per_second': 144.185, 'eval_steps_per_second': 18.039, 'epoch': 4.0}
{'train_runtime': 2250.833, 'train_samples_per_second': 47.376, 'train_steps_per

[I 2025-08-01 23:34:40,993] Trial 6 finished with value: 0.6706439726267767 and parameters: {'learning_rate': 2.5976204023681267e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.07629723788316208}. Best is trial 6 with value: 0.6706439726267767.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.3898, 'grad_norm': 7.828648090362549, 'learning_rate': 3.45157332064703e-05, 'epoch': 0.5995203836930456}
{'eval_loss': 0.9991999268531799, 'eval_accuracy': 0.6088787506580102, 'eval_f1': 0.6085752103841441, 'eval_runtime': 37.658, 'eval_samples_per_second': 151.336, 'eval_steps_per_second': 18.934, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0418, 'grad_norm': 10.473342895507812, 'learning_rate': 2.8432593762256286e-05, 'epoch': 1.1990407673860912}
{'loss': 0.8631, 'grad_norm': 17.31819725036621, 'learning_rate': 2.234945431804228e-05, 'epoch': 1.7985611510791366}
{'eval_loss': 0.9118499159812927, 'eval_accuracy': 0.6462537287243376, 'eval_f1': 0.6426467297222495, 'eval_runtime': 37.6477, 'eval_samples_per_second': 151.377, 'eval_steps_per_second': 18.939, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.6768, 'grad_norm': 13.972012519836426, 'learning_rate': 1.6266314873828264e-05, 'epoch': 2.3980815347721824}
{'loss': 0.5824, 'grad_norm': 8.609002113342285, 'learning_rate': 1.0183175429614255e-05, 'epoch': 2.997601918465228}
{'eval_loss': 0.9353725910186768, 'eval_accuracy': 0.6669591156343219, 'eval_f1': 0.6604096079380722, 'eval_runtime': 37.6934, 'eval_samples_per_second': 151.194, 'eval_steps_per_second': 18.916, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.3714, 'grad_norm': 10.684033393859863, 'learning_rate': 4.1000359854002435e-06, 'epoch': 3.597122302158273}
{'eval_loss': 1.0072349309921265, 'eval_accuracy': 0.6708194420073698, 'eval_f1': 0.6703683030093779, 'eval_runtime': 37.6357, 'eval_samples_per_second': 151.425, 'eval_steps_per_second': 18.945, 'epoch': 4.0}
{'train_runtime': 2029.9214, 'train_samples_per_second': 52.532, 'train_steps_per_second': 1.643, 'train_loss': 0.7724193188783933, 'epoch': 4.0}


[I 2025-08-02 00:08:32,452] Trial 7 finished with value: 0.6708194420073698 and parameters: {'learning_rate': 4.058670637179588e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.18336963954777766}. Best is trial 7 with value: 0.6708194420073698.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6164, 'grad_norm': 13.59861946105957, 'learning_rate': 2.0993369954377246e-05, 'epoch': 0.15001500150015}
{'loss': 1.4211, 'grad_norm': 16.232032775878906, 'learning_rate': 1.9888455746252128e-05, 'epoch': 0.3000300030003}
{'loss': 1.3535, 'grad_norm': 15.464179039001465, 'learning_rate': 1.878354153812701e-05, 'epoch': 0.45004500450045004}
{'loss': 1.2169, 'grad_norm': 9.996145248413086, 'learning_rate': 1.767862733000189e-05, 'epoch': 0.6000600060006}
{'loss': 1.1379, 'grad_norm': 11.16698932647705, 'learning_rate': 1.6573713121876775e-05, 'epoch': 0.7500750075007501}
{'loss': 1.116, 'grad_norm': 19.618427276611328, 'learning_rate': 1.5468798913751657e-05, 'epoch': 0.9000900090009001}
{'eval_loss': 1.02041757106781, 'eval_accuracy': 0.5988769959642043, 'eval_f1': 0.5937703622431075, 'eval_runtime': 37.811, 'eval_samples_per_second': 150.723, 'eval_steps_per_second': 18.857, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0324, 'grad_norm': 48.592350006103516, 'learning_rate': 1.4363884705626536e-05, 'epoch': 1.05010501050105}
{'loss': 0.912, 'grad_norm': 31.042821884155273, 'learning_rate': 1.3258970497501418e-05, 'epoch': 1.2001200120012}
{'loss': 0.8749, 'grad_norm': 29.7739315032959, 'learning_rate': 1.2154056289376302e-05, 'epoch': 1.3501350135013501}
{'loss': 0.8677, 'grad_norm': 23.99934196472168, 'learning_rate': 1.1049142081251182e-05, 'epoch': 1.5001500150015001}
{'loss': 0.847, 'grad_norm': 20.572202682495117, 'learning_rate': 9.944227873126064e-06, 'epoch': 1.6501650165016502}
{'loss': 0.8149, 'grad_norm': 18.290952682495117, 'learning_rate': 8.839313665000945e-06, 'epoch': 1.8001800180018002}
{'loss': 0.8035, 'grad_norm': 25.063861846923828, 'learning_rate': 7.734399456875828e-06, 'epoch': 1.9501950195019502}
{'eval_loss': 0.9150165915489197, 'eval_accuracy': 0.6516932795227233, 'eval_f1': 0.6442840079663544, 'eval_runtime': 38.0147, 'eval_samples_per_second': 149.916, 'eval_step

  return forward_call(*args, **kwargs)


{'loss': 0.6853, 'grad_norm': 45.00600814819336, 'learning_rate': 6.629485248750709e-06, 'epoch': 2.1002100210021}
{'loss': 0.6009, 'grad_norm': 20.26804542541504, 'learning_rate': 5.524571040625591e-06, 'epoch': 2.25022502250225}
{'loss': 0.5859, 'grad_norm': 37.0764274597168, 'learning_rate': 4.419656832500472e-06, 'epoch': 2.4002400240024}
{'loss': 0.6036, 'grad_norm': 21.833894729614258, 'learning_rate': 3.3147426243753544e-06, 'epoch': 2.5502550255025502}
{'loss': 0.5657, 'grad_norm': 34.02779006958008, 'learning_rate': 2.209828416250236e-06, 'epoch': 2.7002700270027002}
{'loss': 0.55, 'grad_norm': 56.79747009277344, 'learning_rate': 1.104914208125118e-06, 'epoch': 2.8502850285028503}
{'eval_loss': 0.9713751673698425, 'eval_accuracy': 0.6630987892612739, 'eval_f1': 0.6639498069213958, 'eval_runtime': 37.7881, 'eval_samples_per_second': 150.815, 'eval_steps_per_second': 18.868, 'epoch': 3.0}
{'train_runtime': 1672.3125, 'train_samples_per_second': 47.824, 'train_steps_per_second': 

[I 2025-08-02 00:36:25,905] Trial 8 finished with value: 0.6630987892612739 and parameters: {'learning_rate': 2.2096074334086114e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.09600511663603951}. Best is trial 7 with value: 0.6708194420073698.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.7554, 'grad_norm': 5.0842390060424805, 'learning_rate': 4.7402039846005027e-05, 'epoch': 0.15001500150015}
{'loss': 1.7489, 'grad_norm': 8.039613723754883, 'learning_rate': 4.555515915380504e-05, 'epoch': 0.3000300030003}
{'loss': 1.7448, 'grad_norm': 6.890608787536621, 'learning_rate': 4.370827846160504e-05, 'epoch': 0.45004500450045004}
{'loss': 1.7269, 'grad_norm': 7.151069641113281, 'learning_rate': 4.186139776940505e-05, 'epoch': 0.6000600060006}
{'loss': 1.7366, 'grad_norm': 7.050571441650391, 'learning_rate': 4.0014517077205055e-05, 'epoch': 0.7500750075007501}
{'loss': 1.7374, 'grad_norm': 8.329668998718262, 'learning_rate': 3.8167636385005066e-05, 'epoch': 0.9000900090009001}


[I 2025-08-02 00:45:43,005] Trial 9 pruned. 


{'eval_loss': 1.7511155605316162, 'eval_accuracy': 0.2654851728373399, 'eval_f1': 0.06992974671843225, 'eval_runtime': 37.7912, 'eval_samples_per_second': 150.802, 'eval_steps_per_second': 18.867, 'epoch': 1.0}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.5074, 'grad_norm': 11.955113410949707, 'learning_rate': 2.8042379084625337e-05, 'epoch': 0.29994001199760045}
{'loss': 1.2624, 'grad_norm': 8.62268352508545, 'learning_rate': 2.4927943379980146e-05, 'epoch': 0.5998800239952009}
{'loss': 1.142, 'grad_norm': 8.649127006530762, 'learning_rate': 2.181350767533495e-05, 'epoch': 0.8998200359928015}
{'eval_loss': 1.0132594108581543, 'eval_accuracy': 0.5967713633970873, 'eval_f1': 0.5915897176325734, 'eval_runtime': 37.7146, 'eval_samples_per_second': 151.108, 'eval_steps_per_second': 18.905, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9768, 'grad_norm': 29.08860969543457, 'learning_rate': 1.8699071970689755e-05, 'epoch': 1.1997600479904018}
{'loss': 0.8832, 'grad_norm': 27.207441329956055, 'learning_rate': 1.558463626604456e-05, 'epoch': 1.4997000599880024}
{'loss': 0.841, 'grad_norm': 13.732091903686523, 'learning_rate': 1.2470200561399361e-05, 'epoch': 1.799640071985603}
{'eval_loss': 0.9210802316665649, 'eval_accuracy': 0.6504649938585717, 'eval_f1': 0.6473258888544068, 'eval_runtime': 37.6098, 'eval_samples_per_second': 151.529, 'eval_steps_per_second': 18.958, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7444, 'grad_norm': 13.95425033569336, 'learning_rate': 9.355764856754166e-06, 'epoch': 2.0995800839832035}
{'loss': 0.5967, 'grad_norm': 20.677427291870117, 'learning_rate': 6.241329152108971e-06, 'epoch': 2.3995200959808036}
{'loss': 0.5904, 'grad_norm': 30.512723922729492, 'learning_rate': 3.1268934474637765e-06, 'epoch': 2.699460107978404}
{'loss': 0.5582, 'grad_norm': 7.648320198059082, 'learning_rate': 1.2457742818580782e-08, 'epoch': 2.9994001199760048}
{'eval_loss': 0.9311506152153015, 'eval_accuracy': 0.6639761361642393, 'eval_f1': 0.6666455838850364, 'eval_runtime': 37.8357, 'eval_samples_per_second': 150.625, 'eval_steps_per_second': 18.845, 'epoch': 3.0}
{'train_runtime': 1534.4917, 'train_samples_per_second': 52.12, 'train_steps_per_second': 3.259, 'train_loss': 0.9100847143133839, 'epoch': 3.0}


[I 2025-08-02 01:11:18,660] Trial 10 finished with value: 0.6639761361642393 and parameters: {'learning_rate': 3.1150585917861244e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.23100132920522845}. Best is trial 7 with value: 0.6708194420073698.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4536, 'grad_norm': 3.9133267402648926, 'learning_rate': 2.3391802485445706e-05, 'epoch': 0.5995203836930456}


[I 2025-08-02 01:19:46,779] Trial 11 pruned. 


{'eval_loss': 1.044995665550232, 'eval_accuracy': 0.5892261800315844, 'eval_f1': 0.5860441082037408, 'eval_runtime': 37.5913, 'eval_samples_per_second': 151.604, 'eval_steps_per_second': 18.967, 'epoch': 1.0}
Best hyperparameters:
{'learning_rate': 4.058670637179588e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.18336963954777766}


In [10]:
# Store the best run reported by the hyperparameter search as a CSV file.
# NOTE(review): the variable still carries an `_A_` name although this is the
# version-B notebook and the file is written as hyperpara_B.csv.
hyperpara_csv_path = "hyperpara_B.csv"
best_run_A_df = pd.DataFrame(data=best_run)
best_run_A_df.to_csv(hyperpara_csv_path, index=False)

## Training with the Best Hyperparameters


In [11]:
# Final fine-tuning run: reuse the hyperparameters of the best search trial.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_run.hyperparameters["num_train_epochs"],
    weight_decay=best_run.hyperparameters["weight_decay"],
    # Evaluate and checkpoint once per epoch; both strategies have to match
    # for load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=16,
    # No metric_for_best_model is given, so the library default applies and the
    # checkpoint with the lowest evaluation loss is restored after training.
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    # `tokenizer=` is deprecated on Trainer (scheduled for removal in
    # transformers 5.0); `processing_class=` is its replacement and requires
    # transformers >= 4.46.
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.4097,1.007001,0.610984,0.603266
2,0.8659,0.912571,0.639937,0.639139
3,0.5841,0.910643,0.675733,0.671737
4,0.3722,0.992627,0.675382,0.675238


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=3336, training_loss=0.7817067025090865, metrics={'train_runtime': 455.3439, 'train_samples_per_second': 234.188, 'train_steps_per_second': 7.326, 'total_flos': 2.805811815326515e+16, 'train_loss': 0.7817067025090865, 'epoch': 4.0})

In [12]:
# Write the fine-tuned weights and the tokenizer files into the same folder.
final_model_dir = "classifier_final_B/"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('classifier_final_B/tokenizer_config.json',
 'classifier_final_B/special_tokens_map.json',
 'classifier_final_B/vocab.txt',
 'classifier_final_B/added_tokens.json',
 'classifier_final_B/tokenizer.json')

In [13]:
# The model and tokenizer must already be loaded.
# Put the model into evaluation mode before running the predictions below.
model.eval()

def tokenize_sliding_windows(example: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize the ``speech_text`` of one example into fixed-size windows.

    Text that does not fit into ``WINDOW_LENGTH`` tokens is spread over several
    windows (``return_overflowing_tokens=True``) that overlap according to
    ``STRIDE``. Every window is padded to ``WINDOW_LENGTH`` and the result is
    returned as PyTorch tensors.

    Args:
        example: Mapping that provides the raw text under ``"speech_text"``.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    window_options = dict(
        truncation=True,
        padding="max_length",
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        return_tensors="pt",
    )
    return tokenizer(example["speech_text"], **window_options)

def predict_proba_for_dataset(dataset: Dataset, label_names) -> List[Dict[str, Any]]:
    """Predict class probabilities per example, averaged over its text windows.

    Each example is tokenized with ``tokenize_sliding_windows``, all of its
    windows are passed through the module-level ``model`` in one forward pass,
    and the softmax outputs are averaged across windows.

    Args:
        dataset: Iterable of examples; each one must provide ``"speech_text"``
            (read by ``tokenize_sliding_windows``) and ``"label"``.
        label_names: Sequence indexed by the predicted class index.
            NOTE(review): assumed to be ordered like the model's output
            logits - confirm against the label encoding.

    Returns:
        One dict per example with the keys ``"probs"`` (window-averaged
        probabilities as a list), ``"label"`` (copied from the example) and
        ``"prediction_label"`` (entry of ``label_names`` at the arg-max).
    """
    predictions = []

    for sample in tqdm(dataset):
        windows = tokenize_sliding_windows(sample)
        target_device = model.device
        ids = windows["input_ids"].to(target_device)
        mask = windows["attention_mask"].to(target_device)

        # Pure inference, so gradient tracking is switched off.
        with torch.no_grad():
            logits = model(input_ids=ids, attention_mask=mask).logits
            window_probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # Collapse the per-window distributions into a single one.
        mean_probs = window_probs.mean(axis=0).tolist()
        predictions.append({
            "probs": mean_probs,
            "label": sample["label"],  # true value, carried along for later evaluation
            "prediction_label": label_names[int(np.argmax(mean_probs))],
        })

    return predictions


### Validation

In [14]:
# Score every example of the validation split.
# NOTE(review): sorted(label_names) assumes the class indices follow the sorted
# order of the label names - confirm against the label encoding.
results_val_B = predict_proba_for_dataset(val_data, sorted(label_names))

  return forward_call(*args, **kwargs)
100%|██████████| 5418/5418 [00:35<00:00, 153.21it/s]


In [15]:
# Turn the list of per-example result dicts into a table (one row per example).
results_val_B_df = pd.DataFrame(results_val_B)

In [16]:
# Save the validation predictions; the DataFrame index is not written.
results_val_B_df.to_csv("classifier_final_B_validation_results.csv",index=False)

### Test

In [17]:
# Score every example of the held-out test split.
# NOTE(review): sorted(label_names) assumes the class indices follow the sorted
# order of the label names - confirm against the label encoding.
results_test_B = predict_proba_for_dataset(test_data, sorted(label_names))

100%|██████████| 5418/5418 [00:35<00:00, 153.34it/s]


In [18]:
# Turn the list of per-example result dicts into a table (one row per example).
results_test_B_df = pd.DataFrame(results_test_B)

In [19]:
# Save the test predictions; the DataFrame index is not written.
results_test_B_df.to_csv("classifier_final_B_test_results.csv", index=False)

In [20]:

def zip_results(zip_path, model_dir, extra_files):
    """Bundle a saved model folder and extra result files into one zip archive.

    Files below ``model_dir`` are stored under the folder's own name (for
    example ``classifier_final_B/config.json``), so they stay grouped after
    unpacking instead of being mixed with the CSV files at the archive root.
    The previous code derived the archive names via
    ``os.path.dirname("classifier_final_B/")``, which - because of the trailing
    slash - returns the folder itself and therefore dropped the folder name.

    Args:
        zip_path: Path of the archive to create (an existing file is overwritten).
        model_dir: Folder that holds the saved model and tokenizer files.
        extra_files: Paths of additional files to add; files that do not exist
            are skipped and reported.

    Returns:
        The list of entries from ``extra_files`` that were not found.
    """
    # abspath also strips a trailing slash, so dirname yields the real parent.
    parent_dir = os.path.dirname(os.path.abspath(model_dir))
    missing = []

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add model/tokenizer folder
        for root, _dirs, files in os.walk(model_dir):
            for file in files:
                filepath = os.path.join(root, file)
                arcname = os.path.relpath(filepath, start=parent_dir)
                zipf.write(filepath, arcname=arcname)

        # Add any CSVs you want
        for extra_file in extra_files:
            if os.path.exists(extra_file):
                zipf.write(extra_file)
            else:
                # Best effort: keep going, but make the gap visible.
                missing.append(extra_file)
                print(f"Skipping missing file: {extra_file}")

    return missing


# Name of the zip file you want to create
zip_filename = "allresultsB.zip"

# Assigned to a variable so that the cell does not echo the returned list.
skipped_files = zip_results(
    zip_filename,
    "classifier_final_B/",
    ["hyperpara_B.csv", "classifier_final_B_validation_results.csv", "classifier_final_B_test_results.csv"],
)