# Fine-tuning Classifier LLM


In [1]:
%pip install optuna
%pip install typing
%pip install evaluate



In [2]:
# setup - load packages
import pandas as pd
from datasets import Dataset, load_dataset
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from typing import Union, Mapping, List, Dict, Any
import evaluate
from tqdm import tqdm
import zipfile
import os


# One seed for the whole notebook. It is used for shuffling/splitting the data and is
# also applied to NumPy and PyTorch so that randomly initialised weights (e.g. the
# classification head of the model loaded further down) are reproducible after a
# kernel restart.
seed = 13
np.random.seed(seed)
torch.manual_seed(seed)

# Set up device (use the GPU if one is available to speed up computations)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## VERSION B

In [None]:
# Load the version-B classifier data and wrap it as a Hugging Face Dataset.
classifier_data = pd.read_csv("../data/classifier_data_B.csv")
data = Dataset.from_pandas(classifier_data)

# Shuffle once with the notebook-wide seed before any splitting (party data).
raw_dataset = data.shuffle(seed=seed)

# 70% train / 15% test / 15% validation:
# first hold out 30% of the rows, then cut that hold-out set in half.
split = raw_dataset.train_test_split(test_size=0.3, seed=seed)
train_data, text_and_val_data = split["train"], split["test"]

split = text_and_val_data.train_test_split(test_size=0.5, seed=seed)
test_data, val_data = split["train"], split["test"]

print(f"Training samples party: {len(train_data)}")
print(f"Test samples party: {len(test_data)}")
print(f"Validation samples party: {len(val_data)}")


# TODO: decide whether the party classes need to be balanced.


Training samples party: 25281
Test samples party: 5418
Validation samples party: 5418


In [4]:
# Sliding-window tokenisation settings (both values are token counts).
WINDOW_LENGTH = 512          # passed to the tokenizer as max_length / model_max_length
STRIDE = WINDOW_LENGTH // 2  # 256; passed to the tokenizer as `stride`

In [5]:
# Tokenizer of the German BERT checkpoint. Its model_max_length is set to the
# sliding-window size used throughout the notebook.
model_name = "bert-base-german-cased"
num_labels = 6  # number of party classes
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=WINDOW_LENGTH)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [6]:
# Party labels. Ids are assigned in alphabetical (sorted) order of the label names,
# so every consumer of the model output must index into sorted(label_names).
label_names = ['CDU/CSU', 'SPD', 'GRÜNE', 'FDP', 'AfD', 'LINKE']
label2id = {name: idx for idx, name in enumerate(sorted(label_names))}
id2label = dict(enumerate(sorted(label_names)))

In [7]:
def sliding_window_tokenize(batch):
    """Tokenize a batch of speeches into fixed-size, overlapping windows.

    Every text in ``batch["speech_text"]`` is passed to the tokenizer with
    ``max_length=WINDOW_LENGTH``, ``stride=STRIDE`` and overflow windows enabled,
    padded to ``WINDOW_LENGTH``. Each resulting window receives the label id of
    the speech it was cut from.

    Args:
        batch: Mapping with ``"speech_text"`` (list of texts) and ``"label"``
            (one entry per text; each entry must be a key of ``label2id``).

    Returns:
        The tokenizer output, extended with a ``"labels"`` list that holds one
        label id per window.
    """
    speeches = batch["speech_text"]
    speech_labels = batch["label"]

    encoded = tokenizer(
        speeches,
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )

    # overflow_to_sample_mapping[k] is the index of the speech that window k belongs to.
    window_labels = []
    for speech_idx in encoded["overflow_to_sample_mapping"]:
        window_labels.append(label2id[speech_labels[speech_idx]])
    encoded["labels"] = window_labels

    return encoded


In [None]:
# Turn every speech into one or more tokenized windows. The original columns are
# dropped because one input row expands to several output rows.
tokenized_train_data = train_data.map(
    sliding_window_tokenize,
    batched=True,
    remove_columns=train_data.column_names
)

# Each split removes its *own* columns, so the call keeps working even if the
# splits ever end up with different column sets.
tokenized_val_data = val_data.map(
    sliding_window_tokenize,
    batched=True,
    remove_columns=val_data.column_names
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [10]:
# Classification model built from the same checkpoint as the tokenizer (model_name,
# instead of repeating the checkpoint string), with the label mappings stored in
# its config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# Metric objects used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each row is
            the argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    shared = {"predictions": predicted_ids, "references": labels}
    accuracy_result = accuracy_metric.compute(**shared)
    macro_f1_result = f1_metric.compute(average="macro", **shared)

    return {"accuracy": accuracy_result["accuracy"], "f1": macro_f1_result["f1"]}


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

## Hyperparameter Tuning

In [11]:
# training arguments for hyperparameter tuning
def model_init():
    """Return a freshly initialised classifier for one hyperparameter-search trial.

    The model is created from ``model_name`` with ``num_labels`` output classes and
    the same ``id2label`` / ``label2id`` mappings as the main model of this notebook,
    so that trial models and the final model share one label configuration.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

# Fixed settings shared by all tuning trials; the values sampled in hp_space
# (learning rate, batch size, epochs, weight decay) are varied per trial.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    eval_strategy="epoch",  # evaluate on the validation windows after every epoch
    save_strategy="no",     # no checkpoints are written while tuning
    report_to="none",       # no external experiment tracker (e.g. W&B)
    disable_tqdm=True,      # no progress bars during the search
)

def hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Trial object providing ``suggest_float``, ``suggest_categorical``
            and ``suggest_int``.

    Returns:
        Dict with ``learning_rate`` (log-uniform in [1e-5, 5e-5]),
        ``per_device_train_batch_size`` (8, 16 or 32), ``num_train_epochs``
        (3 or 4) and ``weight_decay`` (uniform in [0.0, 0.3]).
    """
    # The suggest_* calls are made in a fixed order: learning rate, batch size,
    # epochs, weight decay.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    epochs = trial.suggest_int("num_train_epochs", 3, 4)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.3)

    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": batch_size,
        "num_train_epochs": epochs,
        "weight_decay": weight_decay,
    }

# Trainer used only for the search: model_init builds a new model for every trial.
# The tokenizer is passed as `processing_class`; the `tokenizer` keyword of Trainer
# is deprecated.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE(review): the objective is validation accuracy; if the party classes are
# imbalanced, "eval_f1" (macro F1) may be the more informative target — confirm.
best_run = trainer.hyperparameter_search(
    direction="maximize",        # because we want to maximize accuracy
    hp_space=hp_space,
    n_trials=12,                 # how many combinations to try
    compute_objective=lambda metrics: metrics["eval_accuracy"],
    backend="optuna"
)

print("Best hyperparameters:")
print(best_run.hyperparameters)


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-08-01 15:47:10,393] A new study created in memory with name: no-name-abac7e30-8d8d-4bd5-8ad9-a149ddf73a3f
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.549001693725586, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7599, 'eval_samples_per_second': 28.951, 'eval_steps_per_second': 3.948, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5406497716903687, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6842, 'eval_samples_per_second': 32.154, 'eval_steps_per_second': 4.385, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.541081190109253, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6172, 'eval_samples_per_second': 35.642, 'eval_steps_per_second': 4.86, 'epoch': 3.0}
{'train_runtime': 8.9393, 'train_samples_per_second': 7.383, 'train_steps_per_second': 0.336, 'train_loss': 1.5092126528422039, 'epoch': 3.0}


[I 2025-08-01 15:47:21,524] Trial 0 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 3.945062452114298e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.1627290747856}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5372813940048218, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6296, 'eval_samples_per_second': 34.945, 'eval_steps_per_second': 4.765, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.599972128868103, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6357, 'eval_samples_per_second': 34.61, 'eval_steps_per_second': 4.72, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5849273204803467, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6575, 'eval_samples_per_second': 33.461, 'eval_steps_per_second': 4.563, 'epoch': 3.0}
{'train_runtime': 7.7864, 'train_samples_per_second': 8.476, 'train_steps_per_second': 1.156, 'train_loss': 1.4752503501044378, 'epoch': 3.0}


[I 2025-08-01 15:47:32,317] Trial 1 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 4.096594140562765e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.05962584913485126}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5666934251785278, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6414, 'eval_samples_per_second': 34.302, 'eval_steps_per_second': 4.678, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5416412353515625, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6523, 'eval_samples_per_second': 33.726, 'eval_steps_per_second': 4.599, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5362071990966797, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6521, 'eval_samples_per_second': 33.736, 'eval_steps_per_second': 4.6, 'epoch': 3.0}
{'train_runtime': 7.8513, 'train_samples_per_second': 8.406, 'train_steps_per_second': 0.764, 'train_loss': 1.5619068145751953, 'epoch': 3.0}


[I 2025-08-01 15:47:42,511] Trial 2 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 1.3501026375522563e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.049275384377303055}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.525761365890503, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6482, 'eval_samples_per_second': 33.94, 'eval_steps_per_second': 4.628, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.532608151435852, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6363, 'eval_samples_per_second': 34.573, 'eval_steps_per_second': 4.715, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.540518879890442, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6334, 'eval_samples_per_second': 34.732, 'eval_steps_per_second': 4.736, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5379321575164795, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6519, 'eval_samples_per_second': 33.745, 'eval_steps_per_second': 4.602, 'epoch': 4.0}
{'train_runtime': 10.4424, 'train_samples_per_second': 8.427, 'train_steps_per_second': 1.149, 'train_loss': 1.429746945699056, 'epoch': 4.0}


[I 2025-08-01 15:47:54,363] Trial 3 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 1.752808291256064e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.27910405343346906}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5964511632919312, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6524, 'eval_samples_per_second': 33.723, 'eval_steps_per_second': 4.599, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5548428297042847, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6507, 'eval_samples_per_second': 33.812, 'eval_steps_per_second': 4.611, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.539088487625122, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6464, 'eval_samples_per_second': 34.034, 'eval_steps_per_second': 4.641, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.533701777458191, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6459, 'eval_samples_per_second': 34.06, 'eval_steps_per_second': 4.645, 'epoch': 4.0}
{'train_runtime': 10.1757, 'train_samples_per_second': 8.648, 'train_steps_per_second': 0.393, 'train_loss': 1.549450397491455, 'epoch': 4.0}


[I 2025-08-01 15:48:06,023] Trial 4 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 1.4900197687624833e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.22972099835884757}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5498390197753906, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6699, 'eval_samples_per_second': 32.843, 'eval_steps_per_second': 4.479, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5229264497756958, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6612, 'eval_samples_per_second': 33.273, 'eval_steps_per_second': 4.537, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5243923664093018, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6665, 'eval_samples_per_second': 33.009, 'eval_steps_per_second': 4.501, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.524011492729187, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6794, 'eval_samples_per_second': 32.383, 'eval_steps_per_second': 4.416, 'epoch': 4.0}
{'train_runtime': 10.8173, 'train_samples_per_second': 8.135, 'train_steps_per_second': 1.109, 'train_loss': 1.459578514099121, 'epoch': 4.0}


[I 2025-08-01 15:48:18,350] Trial 5 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 1.253362792407971e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.22878470146484226}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5551064014434814, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6932, 'eval_samples_per_second': 31.736, 'eval_steps_per_second': 4.328, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.6126034259796143, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6891, 'eval_samples_per_second': 31.927, 'eval_steps_per_second': 4.354, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5903956890106201, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6917, 'eval_samples_per_second': 31.806, 'eval_steps_per_second': 4.337, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5830901861190796, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6986, 'eval_samples_per_second': 31.493, 'eval_steps_per_second': 4.294, 'epoch': 4.0}
{'train_runtime': 11.0832, 'train_samples_per_second': 7.94, 'train_steps_per_second': 1.083, 'train_loss': 1.4166576067606609, 'epoch': 4.0}


[I 2025-08-01 15:48:30,857] Trial 6 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 4.5477977842471815e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4, 'weight_decay': 0.13173514468960767}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5780831575393677, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7178, 'eval_samples_per_second': 30.651, 'eval_steps_per_second': 4.18, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5468742847442627, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7152, 'eval_samples_per_second': 30.761, 'eval_steps_per_second': 4.195, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5369664430618286, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.714, 'eval_samples_per_second': 30.813, 'eval_steps_per_second': 4.202, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5324923992156982, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7215, 'eval_samples_per_second': 30.494, 'eval_steps_per_second': 4.158, 'epoch': 4.0}
{'train_runtime': 11.1988, 'train_samples_per_second': 7.858, 'train_steps_per_second': 0.714, 'train_loss': 1.536794662475586, 'epoch': 4.0}


[I 2025-08-01 15:48:43,534] Trial 7 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 1.1233824098360417e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.13629535542622354}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5666331052780151, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7689, 'eval_samples_per_second': 28.612, 'eval_steps_per_second': 3.902, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.537272572517395, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7336, 'eval_samples_per_second': 29.989, 'eval_steps_per_second': 4.089, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.531195878982544, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7328, 'eval_samples_per_second': 30.022, 'eval_steps_per_second': 4.094, 'epoch': 3.0}
{'train_runtime': 8.5848, 'train_samples_per_second': 7.688, 'train_steps_per_second': 0.349, 'train_loss': 1.545888900756836, 'epoch': 3.0}


[I 2025-08-01 15:48:53,638] Trial 8 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 2.351409517679184e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.2821701409777596}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.557132601737976, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.705, 'eval_samples_per_second': 31.208, 'eval_steps_per_second': 4.256, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5354080200195312, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7429, 'eval_samples_per_second': 29.612, 'eval_steps_per_second': 4.038, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5348930358886719, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7125, 'eval_samples_per_second': 30.876, 'eval_steps_per_second': 4.21, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5348767042160034, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.7064, 'eval_samples_per_second': 31.143, 'eval_steps_per_second': 4.247, 'epoch': 4.0}
{'train_runtime': 11.0498, 'train_samples_per_second': 7.964, 'train_steps_per_second': 0.362, 'train_loss': 1.4852694272994995, 'epoch': 4.0}


[I 2025-08-01 15:49:06,516] Trial 9 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 2.851121258035379e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.10544630818571896}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5521572828292847, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.695, 'eval_samples_per_second': 31.655, 'eval_steps_per_second': 4.317, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5367289781570435, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6776, 'eval_samples_per_second': 32.467, 'eval_steps_per_second': 4.427, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.5357569456100464, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6857, 'eval_samples_per_second': 32.083, 'eval_steps_per_second': 4.375, 'epoch': 3.0}
{'train_runtime': 8.0261, 'train_samples_per_second': 8.223, 'train_steps_per_second': 0.374, 'train_loss': 1.5208667119344075, 'epoch': 3.0}


[I 2025-08-01 15:49:16,489] Trial 10 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 3.29332490570464e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.19521382599487264}. Best is trial 0 with value: 0.45454545454545453.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.5553295612335205, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6806, 'eval_samples_per_second': 32.324, 'eval_steps_per_second': 4.408, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.6115927696228027, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6836, 'eval_samples_per_second': 32.184, 'eval_steps_per_second': 4.389, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.597044825553894, 'eval_accuracy': 0.45454545454545453, 'eval_f1': 0.10416666666666667, 'eval_runtime': 0.6821, 'eval_samples_per_second': 32.253, 'eval_steps_per_second': 4.398, 'epoch': 3.0}
{'train_runtime': 8.2486, 'train_samples_per_second': 8.001, 'train_steps_per_second': 1.091, 'train_loss': 1.482511626349555, 'epoch': 3.0}


[I 2025-08-01 15:49:26,223] Trial 11 finished with value: 0.45454545454545453 and parameters: {'learning_rate': 4.751634949842777e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.009835299275324366}. Best is trial 0 with value: 0.45454545454545453.


Best hyperparameters:
{'learning_rate': 3.945062452114298e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.1627290747856}


In [12]:
# Store the winning trial as one labelled row: run id, objective value and one
# column per tuned hyperparameter. (Passing the BestRun tuple to pd.DataFrame
# directly stacks its fields as rows of a single unnamed column.)
best_run_B_df = pd.DataFrame(
    [{"run_id": best_run.run_id, "objective": best_run.objective, **best_run.hyperparameters}]
)
best_run_A_df = best_run_B_df  # alias kept for code that still uses the old (version A) name
best_run_B_df.to_csv("hyperpara_B.csv", index=False)

## Training with best Tuning Parameters


In [13]:
# Final fine-tuning run with the hyperparameters of the best search trial.
# NOTE(review): with load_best_model_at_end=True and no metric_for_best_model, the
# best checkpoint is chosen by validation loss, whereas the search maximised
# accuracy — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_run.hyperparameters["num_train_epochs"],
    weight_decay=best_run.hyperparameters["weight_decay"],
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to="none",
)

# The tokenizer is passed as `processing_class`; the `tokenizer` keyword of Trainer
# is deprecated.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.58323,0.454545,0.104167
2,No log,1.534845,0.454545,0.104167
3,No log,1.523571,0.454545,0.104167


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=3, training_loss=1.5890151659647624, metrics={'train_runtime': 25.2282, 'train_samples_per_second': 2.616, 'train_steps_per_second': 0.119, 'total_flos': 17365953318912.0, 'train_loss': 1.5890151659647624, 'epoch': 3.0})

In [14]:
# Write the fine-tuned model and its tokenizer into the same directory so that both
# can later be reloaded from "classifier_final_B/".
model.save_pretrained("classifier_final_B/")
tokenizer.save_pretrained("classifier_final_B/")

('classifier_final_B/tokenizer_config.json',
 'classifier_final_B/special_tokens_map.json',
 'classifier_final_B/vocab.txt',
 'classifier_final_B/added_tokens.json',
 'classifier_final_B/tokenizer.json')

In [15]:
# model and tokenizer must already be loaded
# Put the model into evaluation mode before the prediction helpers below are used.
model.eval()

def tokenize_sliding_windows(example: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize one speech into overlapping windows and return PyTorch tensors.

    Args:
        example: Mapping whose ``"speech_text"`` entry is the text to tokenize.

    Returns:
        The tokenizer output for that text, produced with
        ``max_length=WINDOW_LENGTH``, ``stride=STRIDE``, padding to the maximum
        length, overflow windows enabled and ``return_tensors="pt"``.
    """
    window_options = dict(
        truncation=True,
        padding="max_length",
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        return_tensors="pt",
    )
    return tokenizer(example["speech_text"], **window_options)

def predict_proba_for_dataset(dataset: Dataset, label_names, batch_size: int = 32) -> List[Dict[str, Any]]:
    """Predict one class distribution per speech by averaging over its windows.

    Each example is split into sliding windows with ``tokenize_sliding_windows``,
    the windows are run through the global ``model`` in mini-batches, and the
    per-window softmax probabilities are averaged into a single distribution.

    Args:
        dataset: Iterable of examples; each needs ``"speech_text"`` and ``"label"``.
        label_names: Sequence mapping a class index to its label name. It must be
            in the same order as the model's class ids.
        batch_size: Maximum number of windows sent through the model at once, so
            that very long speeches do not have to fit into memory as one batch.

    Returns:
        One dict per example with ``"probs"`` (averaged probabilities as a list),
        ``"label"`` (the example's true label, passed through unchanged) and
        ``"prediction_label"`` (name of the most probable class).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Do not rely on an earlier cell having switched the model to evaluation mode.
    model.eval()
    results = []

    for example in tqdm(dataset):
        tokenized = tokenize_sliding_windows(example)
        input_ids = tokenized["input_ids"]
        attention_mask = tokenized["attention_mask"]

        window_probs = []
        with torch.no_grad():
            for start in range(0, input_ids.shape[0], batch_size):
                stop = start + batch_size
                outputs = model(
                    input_ids=input_ids[start:stop].to(model.device),
                    attention_mask=attention_mask[start:stop].to(model.device),
                )
                window_probs.append(torch.nn.functional.softmax(outputs.logits, dim=-1).cpu())

        # Average the per-window distributions into one distribution per speech.
        avg_probs = torch.cat(window_probs, dim=0).mean(dim=0).numpy()
        results.append({
            "probs": avg_probs.tolist(),
            "label": example["label"],  # true label, in case it should be carried along
            "prediction_label": label_names[int(np.argmax(avg_probs))],
        })

    return results


### Validation

In [None]:
# Speech-level predictions for the validation split. The label list is sorted because
# label2id assigns the class ids in sorted(label_names) order.
results_val_B = predict_proba_for_dataset(val_data, sorted(label_names))

  return forward_call(*args, **kwargs)
100%|██████████| 20/20 [00:00<00:00, 47.18it/s]


In [17]:
# One row per validation example with the columns probs, label and prediction_label.
results_val_B_df = pd.DataFrame(results_val_B)

In [18]:
# Persist the validation predictions; index=False leaves the DataFrame index out of the file.
results_val_B_df.to_csv("classifier_final_B_validation_results.csv",index=False)

### TEST

In [None]:
# Speech-level predictions for the held-out test split (same sorted label order as
# used for the validation split).
results_test_B = predict_proba_for_dataset(test_data, sorted(label_names))

100%|██████████| 20/20 [00:00<00:00, 68.56it/s]


In [20]:
# One row per test example with the columns probs, label and prediction_label.
results_test_B_df = pd.DataFrame(results_test_B)

In [21]:
# Persist the test predictions; index=False leaves the DataFrame index out of the file.
results_test_B_df.to_csv("classifier_final_B_test_results.csv", index=False)

In [22]:

def write_results_zip(zip_path, model_dir, extra_files):
    """Bundle a saved model directory and additional result files into one zip archive.

    Args:
        zip_path: Path of the zip file to create (an existing file is overwritten).
        model_dir: Directory to add recursively. Its own folder name is kept as the
            top-level folder inside the archive.
        extra_files: Paths of further files to add; files that do not exist are
            skipped with a printed notice.
    """
    # normpath drops a trailing slash. Without it, dirname("folder/") returns the
    # folder itself, which strips the folder level and places the model files at
    # the archive root next to the CSVs.
    model_dir = os.path.normpath(model_dir)
    archive_root = os.path.dirname(os.path.abspath(model_dir))

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add model/tokenizer folder
        for root, _dirs, files in os.walk(model_dir):
            for file in files:
                filepath = os.path.join(root, file)
                arcname = os.path.relpath(filepath, start=archive_root)
                zipf.write(filepath, arcname=arcname)

        # Add the requested result files
        for extra_file in extra_files:
            if os.path.exists(extra_file):
                zipf.write(extra_file)
            else:
                print(f"Skipping missing file: {extra_file}")


# Name of the zip file you want to create
zip_filename = "allresultsB.zip"

write_results_zip(
    zip_filename,
    "classifier_final_B/",
    ["hyperpara_B.csv", "classifier_final_B_validation_results.csv", "classifier_final_B_test_results.csv"],
)