# Fine-tuning Classifier LLM


In [1]:
!pip install optuna
!pip install typing
!pip install evaluate
!pip install torch
!pip install transformers
!pip install accelerate>=0.26.0



In [1]:
# setup - load packages
import pandas as pd
from datasets import Dataset, load_dataset
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from typing import Union, Mapping, List, Dict, Any
import evaluate
from tqdm import tqdm
import zipfile
import os


# Set up device (if available, use the GPU to speed up computations)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# One seed for the whole notebook: reused by the dataset shuffles/splits below and
# applied to the torch and numpy RNGs so that randomly initialised weights
# (e.g. a newly created classification head) are repeatable on a fresh kernel.
seed = 13
torch.manual_seed(seed)
np.random.seed(seed)

cuda


## VERSION B

In [None]:
# Load the raw classifier data and convert it to the Hugging Face dataset format
classifier_data = pd.read_csv("../data/classifier_data_B.csv")
data = Dataset.from_pandas(classifier_data)

# Shuffle once, then carve out 70% train and a 30% hold-out
raw_dataset = data.shuffle(seed=seed)
split = raw_dataset.train_test_split(test_size=0.3, seed=seed)
train_data, text_and_val_data = split["train"], split["test"]

# The 30% hold-out is halved into the test set and the validation set (15% each)
split = text_and_val_data.train_test_split(test_size=0.5, seed=seed)
test_data, val_data = split["train"], split["test"]

print(f"Training samples party: {len(train_data)}")
print(f"Test samples party: {len(test_data)}")
print(f"Validation samples party: {len(val_data)}")


Training samples party: 25281
Test samples party: 5418
Validation samples party: 5418


In [3]:
# Subsets of the train and validation data used only for hyperparameter tuning.
# The sizes correspond to roughly 20-25% of the respective splits; they are capped
# at the split length so that select() never asks for more rows than exist.
TUNE_TRAIN_SIZE = 6000
TUNE_VAL_SIZE = 1000

train_data_for_tune = train_data.shuffle(seed=seed).select(
    range(min(TUNE_TRAIN_SIZE, len(train_data)))
)
val_data_for_tune = val_data.shuffle(seed=seed).select(
    range(min(TUNE_VAL_SIZE, len(val_data)))
)


In [4]:
# Maximum number of tokens per tokenized window
WINDOW_LENGTH = 512
# Value passed as the tokenizer's `stride`: half a window
STRIDE = WINDOW_LENGTH // 2

In [5]:
# Checkpoint name and number of target classes shared by the cells below
model_name = "bert-base-german-cased"
num_labels = 6

# Load the tokenizer that belongs to the checkpoint, capped at one window length
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=WINDOW_LENGTH)


In [6]:
# Party labels as they appear in the data
label_names = ['CDU/CSU', 'SPD', 'GRÜNE', 'FDP', 'AfD', 'LINKE']

# Class ids follow the alphabetical order of the label names
label2id = dict(zip(sorted(label_names), range(len(label_names))))
id2label = dict(enumerate(sorted(label_names)))

In [None]:
def sliding_window_tokenize(batch):
    """Tokenize a batch of speeches into fixed-length windows and label each window.

    Args:
        batch: Mapping with a "speech_text" list and a parallel "label" list
            (the column layout `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output with an added "labels" entry holding one class id
        per window. Texts longer than WINDOW_LENGTH tokens overflow into further
        windows, configured with `stride=STRIDE`; every window is padded to
        WINDOW_LENGTH.
    """
    speech_texts = batch["speech_text"]
    speech_labels = batch["label"]

    encoded = tokenizer(
        speech_texts,
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )

    # "overflow_to_sample_mapping" names, for each window, the position of its
    # source text in the batch; each window receives that text's class id.
    window_labels = []
    for sample_idx in encoded["overflow_to_sample_mapping"]:
        window_labels.append(label2id[speech_labels[sample_idx]])
    encoded["labels"] = window_labels

    return encoded


In [8]:
def _tokenize_split(split):
    """Apply sliding-window tokenization to one split and drop its raw columns.

    The columns to remove are always taken from the split being mapped, so a
    split is never stripped based on another split's schema.
    """
    return split.map(
        sliding_window_tokenize,
        batched=True,
        remove_columns=split.column_names
    )


# Full train / validation data for the final training run
tokenized_train_data = _tokenize_split(train_data)
tokenized_val_data = _tokenize_split(val_data)

# Smaller subsets used for hyperparameter tuning
tokenized_train_data_subset = _tokenize_split(train_data_for_tune)
tokenized_val_data_subset = _tokenize_split(val_data_for_tune)

Map:   0%|          | 0/25281 [00:00<?, ? examples/s]

Map:   0%|          | 0/5418 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained checkpoint with one output per entry in label_names.
# The checkpoint is referenced through model_name so that the tokenizer, the
# tuning models and this final model cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# Metric implementations used by compute_metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair of (logits, labels); the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    accuracy_result = accuracy_metric.compute(predictions=predicted_ids, references=labels)
    f1_result = f1_metric.compute(predictions=predicted_ids, references=labels, average="macro")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Hyperparameter Tuning

In [None]:
# model factory, training arguments and search space for hyperparameter tuning
def model_init():
    """Build a new classification model from the pretrained checkpoint.

    Passed to the Trainer as `model_init`, so each call loads the checkpoint
    again with `num_labels` outputs.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    return fresh_model

# Base arguments shared by all tuning trials; the searched values
# (learning rate, batch size, epochs, weight decay) come from hp_space.
training_args = TrainingArguments(
    # where results and logs go
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # evaluate once per epoch, but don't save checkpoints during tuning
    eval_strategy="epoch",
    save_strategy="no",
    # speed up tuning
    disable_tqdm=True,
    fp16=torch.cuda.is_available(),
)

def hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object exposing `suggest_categorical` and `suggest_int`.

    Returns:
        Dict with the sampled "learning_rate", "per_device_train_batch_size",
        "num_train_epochs" and "weight_decay".
    """
    # Sampled in the same order as the keys of the returned dict
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 2e-5, 3e-5])
    train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    epochs = trial.suggest_int("num_train_epochs", 3, 4)
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.05])

    return {
        "learning_rate": learning_rate,
        "per_device_train_batch_size": train_batch_size,
        "num_train_epochs": epochs,
        "weight_decay": weight_decay,
    }

# Trainer for tuning: builds its model through model_init and works on the
# reduced train / validation subsets.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_data_subset,
    eval_dataset=tokenized_val_data_subset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


def accuracy_objective(metrics):
    """Return the validation accuracy, the value the search maximizes."""
    return metrics["eval_accuracy"]


best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    compute_objective=accuracy_objective,
    direction="maximize",        # because we want to maximize accuracy
    n_trials=12,                 # how many combinations to try
    backend="optuna"
)

print("Best hyperparameters:")
print(best_run.hyperparameters)


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-08-03 11:58:50,300] A new study created in memory with name: no-name-bdb9dd2d-4193-4240-9b10-82c2bdc2dedd
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6193, 'grad_norm': 23.902393341064453, 'learning_rate': 7.911949685534593e-06, 'epoch': 0.6289308176100629}
{'eval_loss': 1.3889740705490112, 'eval_accuracy': 0.4586894586894587, 'eval_f1': 0.394636765559514, 'eval_runtime': 1.7165, 'eval_samples_per_second': 613.444, 'eval_steps_per_second': 76.899, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.4114, 'grad_norm': 16.988828659057617, 'learning_rate': 5.815513626834381e-06, 'epoch': 1.2578616352201257}
{'loss': 1.2467, 'grad_norm': 15.048934936523438, 'learning_rate': 3.7232704402515725e-06, 'epoch': 1.8867924528301887}
{'eval_loss': 1.3262227773666382, 'eval_accuracy': 0.4624881291547958, 'eval_f1': 0.4383531109080376, 'eval_runtime': 1.7557, 'eval_samples_per_second': 599.745, 'eval_steps_per_second': 75.182, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.1043, 'grad_norm': 38.455665588378906, 'learning_rate': 1.6268343815513629e-06, 'epoch': 2.5157232704402515}
{'eval_loss': 1.2695996761322021, 'eval_accuracy': 0.5004748338081672, 'eval_f1': 0.4771533772801499, 'eval_runtime': 1.7069, 'eval_samples_per_second': 616.913, 'eval_steps_per_second': 77.334, 'epoch': 3.0}
{'train_runtime': 108.1891, 'train_samples_per_second': 176.275, 'train_steps_per_second': 22.045, 'train_loss': 1.2959720251695166, 'epoch': 3.0}


[I 2025-08-03 12:00:39,531] Trial 0 finished with value: 0.5004748338081672 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.01}. Best is trial 0 with value: 0.5004748338081672.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6271, 'grad_norm': 21.853464126586914, 'learning_rate': 1.5840670859538786e-05, 'epoch': 0.6289308176100629}
{'eval_loss': 1.351960301399231, 'eval_accuracy': 0.4624881291547958, 'eval_f1': 0.4004168239983701, 'eval_runtime': 1.7612, 'eval_samples_per_second': 597.897, 'eval_steps_per_second': 74.95, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3728, 'grad_norm': 16.602943420410156, 'learning_rate': 1.1647798742138366e-05, 'epoch': 1.2578616352201257}
{'loss': 1.1903, 'grad_norm': 17.823320388793945, 'learning_rate': 7.454926624737946e-06, 'epoch': 1.8867924528301887}
{'eval_loss': 1.2687902450561523, 'eval_accuracy': 0.5052231718898386, 'eval_f1': 0.493319885333879, 'eval_runtime': 1.7243, 'eval_samples_per_second': 610.696, 'eval_steps_per_second': 76.554, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9393, 'grad_norm': 48.9475212097168, 'learning_rate': 3.2620545073375264e-06, 'epoch': 2.5157232704402515}
{'eval_loss': 1.165742039680481, 'eval_accuracy': 0.5508072174738842, 'eval_f1': 0.5457370612828932, 'eval_runtime': 1.7536, 'eval_samples_per_second': 600.493, 'eval_steps_per_second': 75.275, 'epoch': 3.0}
{'train_runtime': 107.934, 'train_samples_per_second': 176.691, 'train_steps_per_second': 22.097, 'train_loss': 1.2110294493989124, 'epoch': 3.0}


[I 2025-08-03 12:02:28,448] Trial 1 finished with value: 0.5508072174738842 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.05}. Best is trial 1 with value: 0.5508072174738842.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.4158216714859009, 'eval_accuracy': 0.4301994301994302, 'eval_f1': 0.3582097494609055, 'eval_runtime': 1.7168, 'eval_samples_per_second': 613.353, 'eval_steps_per_second': 76.888, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.348893642425537, 'eval_accuracy': 0.4681861348528015, 'eval_f1': 0.4359816555044184, 'eval_runtime': 3.1074, 'eval_samples_per_second': 338.866, 'eval_steps_per_second': 42.479, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3454, 'grad_norm': 8.192010879516602, 'learning_rate': 4.974874371859296e-06, 'epoch': 2.5125628140703515}
{'eval_loss': 1.245074987411499, 'eval_accuracy': 0.51661918328585, 'eval_f1': 0.5025767629088321, 'eval_runtime': 2.129, 'eval_samples_per_second': 494.592, 'eval_steps_per_second': 62.0, 'epoch': 3.0}
{'train_runtime': 81.1349, 'train_samples_per_second': 235.053, 'train_steps_per_second': 7.358, 'train_loss': 1.287227406973016, 'epoch': 3.0}


[I 2025-08-03 12:03:50,579] Trial 2 finished with value: 0.51661918328585 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.05}. Best is trial 1 with value: 0.5508072174738842.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.4649406671524048, 'eval_accuracy': 0.39411206077872746, 'eval_f1': 0.3336354844771934, 'eval_runtime': 1.7191, 'eval_samples_per_second': 612.542, 'eval_steps_per_second': 76.786, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.5641, 'grad_norm': 12.253183364868164, 'learning_rate': 5.837520938023451e-06, 'epoch': 1.2562814070351758}
{'eval_loss': 1.3697775602340698, 'eval_accuracy': 0.45584045584045585, 'eval_f1': 0.4145028403252422, 'eval_runtime': 1.7193, 'eval_samples_per_second': 612.471, 'eval_steps_per_second': 76.777, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.2595, 'grad_norm': 11.929889678955078, 'learning_rate': 1.6499162479061978e-06, 'epoch': 2.5125628140703515}
{'eval_loss': 1.3074302673339844, 'eval_accuracy': 0.4700854700854701, 'eval_f1': 0.43866464479025286, 'eval_runtime': 4.6464, 'eval_samples_per_second': 226.629, 'eval_steps_per_second': 28.409, 'epoch': 3.0}
{'train_runtime': 92.4338, 'train_samples_per_second': 206.321, 'train_steps_per_second': 12.917, 'train_loss': 1.370381238672202, 'epoch': 3.0}


[I 2025-08-03 12:05:23,996] Trial 3 finished with value: 0.4700854700854701 and parameters: {'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.05}. Best is trial 1 with value: 0.5508072174738842.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6239, 'grad_norm': 23.71595573425293, 'learning_rate': 1.5832285115303986e-05, 'epoch': 0.6289308176100629}
{'eval_loss': 1.4533549547195435, 'eval_accuracy': 0.4017094017094017, 'eval_f1': 0.35411849020356767, 'eval_runtime': 1.762, 'eval_samples_per_second': 597.619, 'eval_steps_per_second': 74.915, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3926, 'grad_norm': 19.56734275817871, 'learning_rate': 1.1639412997903566e-05, 'epoch': 1.2578616352201257}
{'loss': 1.212, 'grad_norm': 16.245935440063477, 'learning_rate': 7.454926624737946e-06, 'epoch': 1.8867924528301887}
{'eval_loss': 1.2878844738006592, 'eval_accuracy': 0.4890788224121557, 'eval_f1': 0.4761868711414718, 'eval_runtime': 1.7247, 'eval_samples_per_second': 610.554, 'eval_steps_per_second': 76.537, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9945, 'grad_norm': 41.73884963989258, 'learning_rate': 3.2620545073375264e-06, 'epoch': 2.5157232704402515}
{'eval_loss': 1.1963164806365967, 'eval_accuracy': 0.5422602089268755, 'eval_f1': 0.5323830799785298, 'eval_runtime': 2.026, 'eval_samples_per_second': 519.745, 'eval_steps_per_second': 65.153, 'epoch': 3.0}
{'train_runtime': 106.776, 'train_samples_per_second': 178.608, 'train_steps_per_second': 22.336, 'train_loss': 1.2358023653490238, 'epoch': 3.0}


[I 2025-08-03 12:07:11,733] Trial 4 finished with value: 0.5422602089268755 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.0}. Best is trial 1 with value: 0.5508072174738842.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.3896487951278687, 'eval_accuracy': 0.4624881291547958, 'eval_f1': 0.41445381466850933, 'eval_runtime': 9.2984, 'eval_samples_per_second': 113.246, 'eval_steps_per_second': 14.196, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.4923, 'grad_norm': 5.250607490539551, 'learning_rate': 1.7462311557788943e-05, 'epoch': 1.2562814070351758}
{'eval_loss': 1.297634243965149, 'eval_accuracy': 0.4852801519468186, 'eval_f1': 0.4750227797156672, 'eval_runtime': 1.7449, 'eval_samples_per_second': 603.489, 'eval_steps_per_second': 75.651, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.0481, 'grad_norm': 13.177608489990234, 'learning_rate': 4.899497487437186e-06, 'epoch': 2.5125628140703515}
{'eval_loss': 1.1829816102981567, 'eval_accuracy': 0.5508072174738842, 'eval_f1': 0.5397971194723207, 'eval_runtime': 1.7187, 'eval_samples_per_second': 612.685, 'eval_steps_per_second': 76.804, 'epoch': 3.0}
{'train_runtime': 97.3006, 'train_samples_per_second': 196.001, 'train_steps_per_second': 12.271, 'train_loss': 1.1982987625914403, 'epoch': 3.0}


[I 2025-08-03 12:08:49,993] Trial 5 finished with value: 0.5508072174738842 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.01}. Best is trial 1 with value: 0.5508072174738842.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'eval_loss': 1.4074381589889526, 'eval_accuracy': 0.43779677113010446, 'eval_f1': 0.36720136160028755, 'eval_runtime': 1.7319, 'eval_samples_per_second': 608.006, 'eval_steps_per_second': 76.217, 'epoch': 1.0}


  return forward_call(*args, **kwargs)
[I 2025-08-03 12:09:43,246] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.4502358436584473, 'eval_accuracy': 0.42830009496676164, 'eval_f1': 0.41078691892543734, 'eval_runtime': 1.7357, 'eval_samples_per_second': 606.663, 'eval_steps_per_second': 76.049, 'epoch': 2.0}


  return forward_call(*args, **kwargs)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.6258, 'grad_norm': 27.65797233581543, 'learning_rate': 1.5832285115303986e-05, 'epoch': 0.6289308176100629}
{'eval_loss': 1.3615940809249878, 'eval_accuracy': 0.46533713200379867, 'eval_f1': 0.4054517187074629, 'eval_runtime': 1.7152, 'eval_samples_per_second': 613.925, 'eval_steps_per_second': 76.959, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3828, 'grad_norm': 17.681236267089844, 'learning_rate': 1.1647798742138366e-05, 'epoch': 1.2578616352201257}
{'loss': 1.2085, 'grad_norm': 14.948044776916504, 'learning_rate': 7.454926624737946e-06, 'epoch': 1.8867924528301887}


[I 2025-08-03 12:11:22,604] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.2720001935958862, 'eval_accuracy': 0.4881291547958215, 'eval_f1': 0.4759100069542206, 'eval_runtime': 2.0566, 'eval_samples_per_second': 512.0, 'eval_steps_per_second': 64.182, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.4376544952392578, 'eval_accuracy': 0.43209876543209874, 'eval_f1': 0.35875045974538583, 'eval_runtime': 1.7648, 'eval_samples_per_second': 596.681, 'eval_steps_per_second': 74.798, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 1.3476829528808594, 'eval_accuracy': 0.4691358024691358, 'eval_f1': 0.43956214354946677, 'eval_runtime': 2.7349, 'eval_samples_per_second': 385.019, 'eval_steps_per_second': 48.265, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3811, 'grad_norm': 7.51597785949707, 'learning_rate': 3.3165829145728647e-06, 'epoch': 2.5125628140703515}


[I 2025-08-03 12:12:43,819] Trial 9 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.2790534496307373, 'eval_accuracy': 0.5052231718898386, 'eval_f1': 0.48145738843078467, 'eval_runtime': 1.9298, 'eval_samples_per_second': 545.643, 'eval_steps_per_second': 68.4, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 1.6094, 'grad_norm': 18.808290481567383, 'learning_rate': 1.6861635220125788e-05, 'epoch': 0.6289308176100629}


[I 2025-08-03 12:13:20,932] Trial 10 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.3904134035110474, 'eval_accuracy': 0.4482431149097816, 'eval_f1': 0.3881618233944481, 'eval_runtime': 1.7211, 'eval_samples_per_second': 611.834, 'eval_steps_per_second': 76.697, 'epoch': 1.0}


  return forward_call(*args, **kwargs)
[I 2025-08-03 12:13:51,444] Trial 11 pruned. 


{'eval_loss': 1.3935059309005737, 'eval_accuracy': 0.46153846153846156, 'eval_f1': 0.4066054912988464, 'eval_runtime': 1.7277, 'eval_samples_per_second': 609.492, 'eval_steps_per_second': 76.404, 'epoch': 1.0}
Best hyperparameters:
{'learning_rate': 2e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.05}


In [None]:
# For safety, persist the best run as a single labelled row: the run id, the
# objective value and one column per tuned hyperparameter. Building the row
# explicitly keeps the field names in the CSV instead of relying on how pandas
# unpacks the BestRun tuple.
best_run_A_df = pd.DataFrame([{
    "run_id": best_run.run_id,
    "objective": best_run.objective,
    **best_run.hyperparameters,
}])
best_run_A_df.to_csv("hyperpara_B.csv", index=False)

## Training with best Tuning Parameters


In [12]:
# Final training run on the full train / validation data with the tuned values.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_run.hyperparameters["num_train_epochs"],
    weight_decay=best_run.hyperparameters["weight_decay"],
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the same criterion the hyperparameter search
    # optimised (validation accuracy). Without this the checkpoint with the lowest
    # validation loss is restored, which need not be the most accurate epoch.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1322,1.012778,0.602913,0.599061
2,0.8055,0.953831,0.634848,0.62913
3,0.5721,0.958742,0.658712,0.66009


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=9999, training_loss=0.9272107233916751, metrics={'train_runtime': 473.678, 'train_samples_per_second': 168.843, 'train_steps_per_second': 21.109, 'total_flos': 2.1043588614948864e+16, 'train_loss': 0.9272107233916751, 'epoch': 3.0})

In [13]:
# Write the fine-tuned model and its tokenizer into the same folder
final_model_dir = "classifier_final_B/"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('classifier_final_B/tokenizer_config.json',
 'classifier_final_B/special_tokens_map.json',
 'classifier_final_B/vocab.txt',
 'classifier_final_B/added_tokens.json',
 'classifier_final_B/tokenizer.json')

In [14]:
# The model and tokenizer must already be loaded before this cell runs.
# Switch the model to evaluation mode before running inference.
model.eval()

def tokenize_sliding_windows(example: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize a single example into padded windows for inference.

    Args:
        example: One dataset row; only its "speech_text" entry is read.

    Returns:
        The tokenizer output as PyTorch tensors. Each window is padded or
        truncated to WINDOW_LENGTH tokens, text that does not fit overflows
        into additional windows, and STRIDE is passed as the tokenizer's
        `stride`.
    """
    return tokenizer(
        example["speech_text"],
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        return_tensors="pt"
    )

def predict_proba_for_dataset(dataset: Dataset, label_names) -> List[Dict[str, Any]]:
    """Predict a class distribution and label for every example of a dataset.

    Each example is tokenized into sliding windows, all of its windows are run
    through the model in one forward pass, and the per-window softmax
    probabilities are averaged into a single distribution for the example.

    Args:
        dataset: Iterable of rows that provide "speech_text" and "label".
        label_names: Sequence of class names indexed by class id; position i
            must be the name of the class the model emits at index i.

    Returns:
        One dict per example with the keys "probs" (averaged class
        probabilities as a list), "label" (the true label copied from the
        example) and "prediction_label" (name of the most probable class).
    """
    # Put the model into evaluation mode here rather than relying on the caller
    # having done so; inference should not depend on the model's previous mode.
    model.eval()

    results = []
    for example in tqdm(dataset):
        tokenized = tokenize_sliding_windows(example)
        input_ids = tokenized["input_ids"].to(model.device)
        attention_mask = tokenized["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()

        # Average over the window axis to get one distribution per example
        avg_probs = probs.mean(axis=0)
        pred_idx = int(np.argmax(avg_probs))
        results.append({
            "probs": avg_probs.tolist(),
            "label": example["label"],  # true value, kept for later evaluation
            "prediction_label": label_names[pred_idx],
        })

    return results


### Validation

In [15]:
# Class names are passed in sorted order, matching how label2id assigns the ids.
results_val_B = predict_proba_for_dataset(
    dataset=val_data,
    label_names=sorted(label_names),
)

  return forward_call(*args, **kwargs)
100%|██████████| 5418/5418 [00:35<00:00, 154.78it/s]


In [16]:
# One row per validation example: averaged probabilities, true label, predicted label
results_val_B_df = pd.DataFrame(data=results_val_B)

In [17]:
# Persist the validation predictions without the row index
results_val_B_df.to_csv(
    path_or_buf="classifier_final_B_validation_results.csv",
    index=False,
)

### TEST

In [18]:
# Class names are passed in sorted order, matching how label2id assigns the ids.
results_test_B = predict_proba_for_dataset(
    dataset=test_data,
    label_names=sorted(label_names),
)

100%|██████████| 5418/5418 [00:34<00:00, 155.33it/s]


In [19]:
# One row per test example: averaged probabilities, true label, predicted label
results_test_B_df = pd.DataFrame(data=results_test_B)

In [20]:
# Persist the test predictions without the row index
results_test_B_df.to_csv(
    path_or_buf="classifier_final_B_test_results.csv",
    index=False,
)

In [21]:

def build_results_archive(zip_filename, model_dir, extra_files):
    """Bundle a saved model folder and additional result files into one zip.

    Files below `model_dir` are stored under the folder's own name (e.g.
    "classifier_final_B/config.json"), so unpacking the archive recreates the
    folder instead of scattering its files next to the CSVs at the archive
    root. A trailing path separator on `model_dir` is tolerated.

    Args:
        zip_filename: Path of the zip file to create (overwritten if present).
        model_dir: Folder whose files are added recursively.
        extra_files: Paths of further files to add at the archive root under
            their base name; paths that do not exist are skipped.
    """
    # normpath drops a trailing separator, so basename yields the folder name
    model_dir = os.path.normpath(model_dir)
    folder_name = os.path.basename(model_dir)

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add model/tokenizer folder
        for root, _dirs, files in os.walk(model_dir):
            for file in files:
                filepath = os.path.join(root, file)
                arcname = os.path.join(
                    folder_name, os.path.relpath(filepath, start=model_dir)
                )
                zipf.write(filepath, arcname=arcname)

        # Add any CSVs you want
        for extra_file in extra_files:
            if os.path.exists(extra_file):
                zipf.write(extra_file, arcname=os.path.basename(extra_file))


# Name of the zip file you want to create
zip_filename = "allresultsB.zip"

build_results_archive(
    zip_filename,
    "classifier_final_B/",
    ["hyperpara_B.csv", "classifier_final_B_validation_results.csv", "classifier_final_B_test_results.csv"],
)