# Fine-tuning Classifier LLM


In [2]:
!pip install optuna
!pip install typing
!pip install evaluate
!pip install torch
!pip install transformers
!pip install accelerate>=0.26.0

Collecting optuna
  Using cached optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Using cached colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Using cached optuna-4.4.0-py3-none-any.whl (395 kB)
Using cached colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0
Collecting typing
  Using cached typing-3.7.4.3-py3-none-any.whl
Installing collected packages: typing
Successfully installed typing-3.7.4.3
Collecting evaluate
  Using cached evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Using cached multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting huggingface-hub>=0.7.0 

In [3]:
# setup - load packages
import pandas as pd
from datasets import Dataset, load_dataset
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    balanced_accuracy_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from typing import Union, Mapping, List, Dict, Any
import evaluate
from tqdm import tqdm
import zipfile
import os


# Pick the compute device: use the GPU if one is available, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# Seed passed to the dataset shuffles and splits in the cells below
seed = 13

cuda


## VERSION A

In [None]:
# Read the version-A classifier data and wrap it as a Hugging Face dataset
classifier_data = pd.read_csv("../data/classifier_data_A.csv")
data = Dataset.from_pandas(classifier_data)

# Shuffle once, then split into 70% train, 15% test and 15% validation data
raw_dataset = data.shuffle(seed=seed)

# First cut: 70% for training, 30% held out
split = raw_dataset.train_test_split(test_size=0.3, seed=seed)
train_data, text_and_val_data = split["train"], split["test"]

# Second cut: the held-out 30% is halved; its "train" part becomes the test set
# and its "test" part becomes the validation set
split = text_and_val_data.train_test_split(test_size=0.5, seed=seed)
test_data, val_data = split["train"], split["test"]

print(f"Training samples party: {len(train_data)}")
print(f"Test samples party: {len(test_data)}")
print(f"Validation samples party: {len(val_data)}")


Training samples party: 25281
Test samples party: 5418
Validation samples party: 5418


In [5]:
# Fixed-size subsets of the train and validation data used for hyperparameter tuning
TUNE_TRAIN_SIZE = 6000
TUNE_VAL_SIZE = 1000

# Shuffle first so the subset is a random sample, then cap the requested size at
# the split size so `select` cannot index past the end of a smaller dataset.
train_data_for_tune = train_data.shuffle(seed=seed).select(
    range(min(TUNE_TRAIN_SIZE, len(train_data)))
)
val_data_for_tune = val_data.shuffle(seed=seed).select(
    range(min(TUNE_VAL_SIZE, len(val_data)))
)


In [6]:
# Sliding-window settings: WINDOW_LENGTH is passed to the tokenizer as `max_length`
# (and `model_max_length`), STRIDE as its `stride` argument.
WINDOW_LENGTH = 512
STRIDE = WINDOW_LENGTH // 2  # 256

In [7]:
# Checkpoint name and number of target classes
model_name = "bert-base-german-cased"
num_labels = 6

# Load the tokenizer for that checkpoint, limited to one window of tokens
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=WINDOW_LENGTH)


In [8]:
# Party labels; class ids are assigned in alphabetical (sorted) order of the names
label_names = ['CDU/CSU', 'SPD', 'GRÜNE', 'FDP', 'AfD', 'LINKE']
id2label = dict(enumerate(sorted(label_names)))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# function to tokenize train data
def sliding_window_tokenize(batch):
    """Tokenize a batch of speeches into fixed-length windows and label each window.

    Args:
        batch: mapping with a "speech_text" list and a parallel "label" list
            (label names that are keys of `label2id`).

    Returns:
        The tokenizer output with an added "labels" entry holding one class id
        per produced window.
    """
    speeches = batch["speech_text"]
    party_names = batch["label"]

    encoded = tokenizer(
        speeches,
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        truncation=True,
        padding="max_length",
        return_overflowing_tokens=True,
    )

    # "overflow_to_sample_mapping" gives, for every window, the index of the
    # speech it came from; each window inherits that speech's label.
    window_labels = []
    for source_idx in encoded["overflow_to_sample_mapping"]:
        window_labels.append(label2id[party_names[source_idx]])
    encoded["labels"] = window_labels

    return encoded


In [10]:
def tokenize_split(dataset):
    """Apply `sliding_window_tokenize` to a dataset and drop its original columns.

    The columns to remove are always taken from the dataset being mapped, so a
    split can never be mapped with another split's column list.
    """
    return dataset.map(
        sliding_window_tokenize,
        batched=True,
        remove_columns=dataset.column_names,
    )


# full train / validation splits used for the final training run
tokenized_train_data = tokenize_split(train_data)
tokenized_val_data = tokenize_split(val_data)

# also tokenizing subsets (used for hyperparameter tuning)
tokenized_train_data_subset = tokenize_split(train_data_for_tune)
tokenized_val_data_subset = tokenize_split(val_data_for_tune)

Map:   0%|          | 0/25281 [00:00<?, ? examples/s]

Map:   0%|          | 0/5418 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# load model: the checkpoint named by `model_name` with a classification head
# sized for the party labels and the label <-> id mappings stored in its config
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,  # same checkpoint as the tokenizer, instead of a repeated literal
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id
)

# metrics used by `compute_metrics`
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from model logits.

    Args:
        eval_pred: a `(logits, labels)` pair; the predicted class is the argmax
            of the logits along axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    shared = {"predictions": predicted_ids, "references": labels}

    return {
        "accuracy": accuracy_metric.compute(**shared)["accuracy"],
        "f1": f1_metric.compute(**shared, average="macro")["f1"],
    }


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Hyperparameter Tuning

In [None]:
# training arguments for hyperparameter tuning
def model_init():
    """Create a fresh classifier from the base checkpoint for one tuning trial.

    The model is configured the same way as the final model: the head size is
    derived from `label_names` and the label <-> id mappings are stored in the
    config, so trial models and the final model agree on the class layout.
    """
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )

# Settings shared by every tuning trial; the searched hyperparameters
# (see `hp_space`) are supplied per trial by the search itself.
tuning_settings = dict(
    output_dir="./results",
    logging_dir="./logs",
    eval_strategy="epoch",
    save_strategy="no",  # don't save checkpoints during tuning
    disable_tqdm=True,  # speed up tuning
    report_to="none",
    fp16=torch.cuda.is_available(),
)
training_args = TrainingArguments(**tuning_settings)

def hp_space(trial):
    """Define the hyperparameter search space for one trial.

    Args:
        trial: object offering `suggest_categorical(name, choices)` and
            `suggest_int(name, low, high)`.

    Returns:
        Dict with the sampled "learning_rate", "per_device_train_batch_size",
        "num_train_epochs" and "weight_decay".
    """
    learning_rate = trial.suggest_categorical("learning_rate", [1e-5, 2e-5, 3e-5])
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    epochs = trial.suggest_int("num_train_epochs", 3, 4)
    weight_decay = trial.suggest_categorical("weight_decay", [0.0, 0.01, 0.05])

    return dict(
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=weight_decay,
    )

# Trainer used only for the search: `model_init` gives every trial a fresh model,
# and the reduced tuning subsets keep each trial short.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_data_subset,
    eval_dataset=tokenized_val_data_subset,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is its replacement
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

best_run = trainer.hyperparameter_search(
    direction="maximize",        # because we want to maximize accuracy
    hp_space=hp_space,
    n_trials=12,                 # how many combinations to try
    compute_objective=lambda metrics: metrics["eval_accuracy"],
    backend="optuna"
)

print("Best hyperparameters:")
print(best_run.hyperparameters)


  trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-08-03 09:49:55,213] A new study created in memory with name: no-name-ab040935-930d-492b-980b-51601203f223
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.3588, 'grad_norm': 7.010796070098877, 'learning_rate': 2.300467289719626e-05, 'epoch': 0.9345794392523364}
{'eval_loss': 1.1404218673706055, 'eval_accuracy': 0.5475271834444054, 'eval_f1': 0.5046022190553141, 'eval_runtime': 4.5878, 'eval_samples_per_second': 621.433, 'eval_steps_per_second': 77.815, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8737, 'grad_norm': 12.792377471923828, 'learning_rate': 1.5995327102803738e-05, 'epoch': 1.8691588785046729}
{'eval_loss': 1.0190753936767578, 'eval_accuracy': 0.6085584005612066, 'eval_f1': 0.6097855142082702, 'eval_runtime': 4.8344, 'eval_samples_per_second': 589.737, 'eval_steps_per_second': 73.846, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5107, 'grad_norm': 8.238180160522461, 'learning_rate': 8.985981308411215e-06, 'epoch': 2.803738317757009}
{'eval_loss': 1.1840609312057495, 'eval_accuracy': 0.6047001052262364, 'eval_f1': 0.5965162573726983, 'eval_runtime': 4.5701, 'eval_samples_per_second': 623.844, 'eval_steps_per_second': 78.117, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.2758, 'grad_norm': 10.645442962646484, 'learning_rate': 1.9766355140186916e-06, 'epoch': 3.7383177570093458}
{'eval_loss': 1.2909108400344849, 'eval_accuracy': 0.6096106629252894, 'eval_f1': 0.6066088446739216, 'eval_runtime': 4.6009, 'eval_samples_per_second': 619.659, 'eval_steps_per_second': 77.593, 'epoch': 4.0}
{'train_runtime': 284.7827, 'train_samples_per_second': 240.239, 'train_steps_per_second': 7.515, 'train_loss': 0.7190143166301406, 'epoch': 4.0}


[I 2025-08-03 09:54:41,017] Trial 0 finished with value: 0.6096106629252894 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4, 'weight_decay': 0.01}. Best is trial 0 with value: 0.6096106629252894.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4733, 'grad_norm': 11.476741790771484, 'learning_rate': 1.766604303086997e-05, 'epoch': 0.4677268475210477}
{'loss': 1.1911, 'grad_norm': 14.286605834960938, 'learning_rate': 1.5327408793264736e-05, 'epoch': 0.9354536950420954}
{'eval_loss': 1.1461663246154785, 'eval_accuracy': 0.5524377411434584, 'eval_f1': 0.5248503547735788, 'eval_runtime': 4.577, 'eval_samples_per_second': 622.894, 'eval_steps_per_second': 77.998, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9198, 'grad_norm': 16.522441864013672, 'learning_rate': 1.2993451824134707e-05, 'epoch': 1.4031805425631432}
{'loss': 0.7803, 'grad_norm': 19.53230857849121, 'learning_rate': 1.0654817586529467e-05, 'epoch': 1.8709073900841908}
{'eval_loss': 1.003921627998352, 'eval_accuracy': 0.6117151876534549, 'eval_f1': 0.6084155072034919, 'eval_runtime': 5.3368, 'eval_samples_per_second': 534.219, 'eval_steps_per_second': 66.894, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5533, 'grad_norm': 23.48984718322754, 'learning_rate': 8.31618334892423e-06, 'epoch': 2.3386342376052385}
{'loss': 0.4456, 'grad_norm': 48.369693756103516, 'learning_rate': 5.97754911131899e-06, 'epoch': 2.8063610851262863}
{'eval_loss': 1.1195707321166992, 'eval_accuracy': 0.6264468607506138, 'eval_f1': 0.6260638688261942, 'eval_runtime': 4.5838, 'eval_samples_per_second': 621.972, 'eval_steps_per_second': 77.883, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.3435, 'grad_norm': 35.545108795166016, 'learning_rate': 3.643592142188962e-06, 'epoch': 3.2740879326473338}
{'loss': 0.2267, 'grad_norm': 13.453972816467285, 'learning_rate': 1.3049579045837232e-06, 'epoch': 3.7418147801683816}
{'eval_loss': 1.3165870904922485, 'eval_accuracy': 0.6250438442651701, 'eval_f1': 0.6284779963828627, 'eval_runtime': 4.6092, 'eval_samples_per_second': 618.543, 'eval_steps_per_second': 77.454, 'epoch': 4.0}
{'train_runtime': 322.1801, 'train_samples_per_second': 212.353, 'train_steps_per_second': 13.272, 'train_loss': 0.7079996906562427, 'epoch': 4.0}


[I 2025-08-03 10:00:04,247] Trial 1 finished with value: 0.6250438442651701 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.01}. Best is trial 1 with value: 0.6250438442651701.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4749, 'grad_norm': 12.236943244934082, 'learning_rate': 1.6888057374493297e-05, 'epoch': 0.4677268475210477}
{'loss': 1.1968, 'grad_norm': 14.109500885009766, 'learning_rate': 1.3769878391019646e-05, 'epoch': 0.9354536950420954}
{'eval_loss': 1.142946481704712, 'eval_accuracy': 0.5615573482988425, 'eval_f1': 0.5300731753744207, 'eval_runtime': 4.5673, 'eval_samples_per_second': 624.215, 'eval_steps_per_second': 78.164, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9251, 'grad_norm': 16.836519241333008, 'learning_rate': 1.065793576551294e-05, 'epoch': 1.4031805425631432}
{'loss': 0.7873, 'grad_norm': 15.973337173461914, 'learning_rate': 7.539756782039289e-06, 'epoch': 1.8709073900841908}
{'eval_loss': 1.0207879543304443, 'eval_accuracy': 0.6047001052262364, 'eval_f1': 0.6029038029611193, 'eval_runtime': 5.4689, 'eval_samples_per_second': 521.313, 'eval_steps_per_second': 65.278, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5836, 'grad_norm': 22.121753692626953, 'learning_rate': 4.421577798565638e-06, 'epoch': 2.3386342376052385}
{'loss': 0.4763, 'grad_norm': 30.23177719116211, 'learning_rate': 1.3033988150919862e-06, 'epoch': 2.8063610851262863}
{'eval_loss': 1.1191003322601318, 'eval_accuracy': 0.6096106629252894, 'eval_f1': 0.6159596985461427, 'eval_runtime': 4.5651, 'eval_samples_per_second': 624.528, 'eval_steps_per_second': 78.203, 'epoch': 3.0}
{'train_runtime': 241.1791, 'train_samples_per_second': 212.755, 'train_steps_per_second': 13.297, 'train_loss': 0.8805103307949408, 'epoch': 3.0}


[I 2025-08-03 10:04:06,470] Trial 2 finished with value: 0.6096106629252894 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.05}. Best is trial 1 with value: 0.6250438442651701.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4661, 'grad_norm': 10.92751407623291, 'learning_rate': 2.6499064546304958e-05, 'epoch': 0.4677268475210477}
{'loss': 1.1414, 'grad_norm': 10.876792907714844, 'learning_rate': 2.2991113189897102e-05, 'epoch': 0.9354536950420954}
{'eval_loss': 1.1230506896972656, 'eval_accuracy': 0.5804980708523325, 'eval_f1': 0.5500162740828638, 'eval_runtime': 4.5683, 'eval_samples_per_second': 624.079, 'eval_steps_per_second': 78.147, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8472, 'grad_norm': 25.02106475830078, 'learning_rate': 1.949017773620206e-05, 'epoch': 1.4031805425631432}
{'loss': 0.6993, 'grad_norm': 19.905242919921875, 'learning_rate': 1.59822263797942e-05, 'epoch': 1.8709073900841908}
{'eval_loss': 1.0219697952270508, 'eval_accuracy': 0.6201332865661171, 'eval_f1': 0.6275697024579233, 'eval_runtime': 4.5742, 'eval_samples_per_second': 623.279, 'eval_steps_per_second': 78.047, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.4402, 'grad_norm': 19.926969528198242, 'learning_rate': 1.2474275023386343e-05, 'epoch': 2.3386342376052385}
{'loss': 0.32, 'grad_norm': 56.39204025268555, 'learning_rate': 8.966323666978485e-06, 'epoch': 2.8063610851262863}
{'eval_loss': 1.281119704246521, 'eval_accuracy': 0.6176780077165906, 'eval_f1': 0.6268577525819655, 'eval_runtime': 4.5715, 'eval_samples_per_second': 623.645, 'eval_steps_per_second': 78.092, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'loss': 0.2188, 'grad_norm': 15.006366729736328, 'learning_rate': 5.4583723105706265e-06, 'epoch': 3.2740879326473338}
{'loss': 0.1134, 'grad_norm': 41.337501525878906, 'learning_rate': 1.9574368568755847e-06, 'epoch': 3.7418147801683816}
{'eval_loss': 1.6945178508758545, 'eval_accuracy': 0.6229393195370045, 'eval_f1': 0.6280833753842191, 'eval_runtime': 4.5816, 'eval_samples_per_second': 622.274, 'eval_steps_per_second': 77.921, 'epoch': 4.0}
{'train_runtime': 319.4622, 'train_samples_per_second': 214.16, 'train_steps_per_second': 13.385, 'train_loss': 0.62085386879317, 'epoch': 4.0}


[I 2025-08-03 10:09:26,953] Trial 3 finished with value: 0.6229393195370045 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.01}. Best is trial 1 with value: 0.6250438442651701.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.3858, 'grad_norm': 7.499480247497559, 'learning_rate': 1.3781931464174456e-05, 'epoch': 0.9345794392523364}
{'eval_loss': 1.2219468355178833, 'eval_accuracy': 0.5096457383374254, 'eval_f1': 0.4590046456270773, 'eval_runtime': 4.5653, 'eval_samples_per_second': 624.5, 'eval_steps_per_second': 78.199, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.9961, 'grad_norm': 14.370230674743652, 'learning_rate': 7.551401869158879e-06, 'epoch': 1.8691588785046729}
{'eval_loss': 1.0711692571640015, 'eval_accuracy': 0.5850578744300245, 'eval_f1': 0.5661563203267821, 'eval_runtime': 5.3621, 'eval_samples_per_second': 531.699, 'eval_steps_per_second': 66.579, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.7205, 'grad_norm': 17.958833694458008, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.803738317757009}
{'eval_loss': 1.0904748439788818, 'eval_accuracy': 0.5910206944931603, 'eval_f1': 0.5892366223304414, 'eval_runtime': 4.5785, 'eval_samples_per_second': 622.699, 'eval_steps_per_second': 77.974, 'epoch': 3.0}
{'train_runtime': 213.7504, 'train_samples_per_second': 240.056, 'train_steps_per_second': 7.509, 'train_loss': 1.0114895164038162, 'epoch': 3.0}


[I 2025-08-03 10:13:01,714] Trial 4 finished with value: 0.5910206944931603 and parameters: {'learning_rate': 2e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3, 'weight_decay': 0.01}. Best is trial 1 with value: 0.6250438442651701.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.4654, 'grad_norm': 10.902314186096191, 'learning_rate': 2.5332086061739943e-05, 'epoch': 0.4677268475210477}
{'loss': 1.1472, 'grad_norm': 11.424919128417969, 'learning_rate': 2.065481758652947e-05, 'epoch': 0.9354536950420954}


[I 2025-08-03 10:14:22,325] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.1567834615707397, 'eval_accuracy': 0.5608558400561207, 'eval_f1': 0.5310244693753146, 'eval_runtime': 4.5977, 'eval_samples_per_second': 620.088, 'eval_steps_per_second': 77.647, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.4643, 'grad_norm': 15.260435104370117, 'learning_rate': 2.5332086061739943e-05, 'epoch': 0.4677268475210477}
{'loss': 1.1378, 'grad_norm': 12.612088203430176, 'learning_rate': 2.065481758652947e-05, 'epoch': 0.9354536950420954}
{'eval_loss': 1.1369816064834595, 'eval_accuracy': 0.5682216766047001, 'eval_f1': 0.5314858955619988, 'eval_runtime': 4.6358, 'eval_samples_per_second': 614.995, 'eval_steps_per_second': 77.009, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.8356, 'grad_norm': 17.96656036376953, 'learning_rate': 1.598690364826941e-05, 'epoch': 1.4031805425631432}
{'loss': 0.6895, 'grad_norm': 15.333809852600098, 'learning_rate': 1.1309635173058933e-05, 'epoch': 1.8709073900841908}
{'eval_loss': 1.0303751230239868, 'eval_accuracy': 0.6183795159593125, 'eval_f1': 0.6239881860105169, 'eval_runtime': 4.5879, 'eval_samples_per_second': 621.416, 'eval_steps_per_second': 77.813, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.4445, 'grad_norm': 16.583087921142578, 'learning_rate': 6.632366697848457e-06, 'epoch': 2.3386342376052385}
{'loss': 0.3236, 'grad_norm': 51.34506607055664, 'learning_rate': 1.9550982226379795e-06, 'epoch': 2.8063610851262863}
{'eval_loss': 1.2453782558441162, 'eval_accuracy': 0.6075061381971238, 'eval_f1': 0.6144460669275746, 'eval_runtime': 4.5852, 'eval_samples_per_second': 621.787, 'eval_steps_per_second': 77.86, 'epoch': 3.0}
{'train_runtime': 237.9624, 'train_samples_per_second': 215.631, 'train_steps_per_second': 13.477, 'train_loss': 0.7840054389803738, 'epoch': 3.0}


[I 2025-08-03 10:18:21,236] Trial 6 finished with value: 0.6075061381971238 and parameters: {'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.0}. Best is trial 1 with value: 0.6250438442651701.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)


{'loss': 1.5777, 'grad_norm': 17.556352615356445, 'learning_rate': 2.7675397567820396e-05, 'epoch': 0.23386342376052385}
{'loss': 1.3863, 'grad_norm': 9.491962432861328, 'learning_rate': 2.5336763330215154e-05, 'epoch': 0.4677268475210477}
{'loss': 1.2475, 'grad_norm': 15.030865669250488, 'learning_rate': 2.299812909260992e-05, 'epoch': 0.7015902712815716}
{'loss': 1.0832, 'grad_norm': 11.10668659210205, 'learning_rate': 2.0659494855004677e-05, 'epoch': 0.9354536950420954}


[I 2025-08-03 10:19:57,095] Trial 7 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.132272481918335, 'eval_accuracy': 0.5626096106629253, 'eval_f1': 0.5268475753076346, 'eval_runtime': 4.5649, 'eval_samples_per_second': 624.548, 'eval_steps_per_second': 78.205, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.3846, 'grad_norm': 8.129979133605957, 'learning_rate': 1.533644859813084e-05, 'epoch': 0.9345794392523364}


[I 2025-08-03 10:21:08,874] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.2166361808776855, 'eval_accuracy': 0.5128025254296738, 'eval_f1': 0.46655498497618614, 'eval_runtime': 4.6189, 'eval_samples_per_second': 617.249, 'eval_steps_per_second': 77.291, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.5519, 'grad_norm': 14.927127838134766, 'learning_rate': 1.8447146866230123e-05, 'epoch': 0.23386342376052385}
{'loss': 1.3645, 'grad_norm': 32.44108963012695, 'learning_rate': 1.6888057374493297e-05, 'epoch': 0.4677268475210477}
{'loss': 1.2267, 'grad_norm': 14.992728233337402, 'learning_rate': 1.5328967882756472e-05, 'epoch': 0.7015902712815716}
{'loss': 1.0705, 'grad_norm': 10.32685661315918, 'learning_rate': 1.3769878391019646e-05, 'epoch': 0.9354536950420954}


[I 2025-08-03 10:22:44,723] Trial 9 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.1250839233398438, 'eval_accuracy': 0.5657663977551737, 'eval_f1': 0.5255007397498338, 'eval_runtime': 4.5676, 'eval_samples_per_second': 624.177, 'eval_steps_per_second': 78.159, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.5036, 'grad_norm': 13.761590957641602, 'learning_rate': 8.833021515434985e-06, 'epoch': 0.4677268475210477}
{'loss': 1.2903, 'grad_norm': 9.131773948669434, 'learning_rate': 7.666043030869974e-06, 'epoch': 0.9354536950420954}


[I 2025-08-03 10:24:04,622] Trial 10 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'eval_loss': 1.3069183826446533, 'eval_accuracy': 0.4949140652402666, 'eval_f1': 0.43732700491042986, 'eval_runtime': 4.5966, 'eval_samples_per_second': 620.247, 'eval_steps_per_second': 77.667, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 1.5036, 'grad_norm': 13.79149341583252, 'learning_rate': 8.833021515434985e-06, 'epoch': 0.4677268475210477}
{'loss': 1.2903, 'grad_norm': 9.133344650268555, 'learning_rate': 7.666043030869974e-06, 'epoch': 0.9354536950420954}


[I 2025-08-03 10:25:25,276] Trial 11 pruned. 


{'eval_loss': 1.3068537712097168, 'eval_accuracy': 0.4952648193616275, 'eval_f1': 0.43752644403923574, 'eval_runtime': 4.589, 'eval_samples_per_second': 621.274, 'eval_steps_per_second': 77.795, 'epoch': 1.0}
Best hyperparameters:
{'learning_rate': 2e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.01}


In [None]:
# for safety save best hyperparameters
# `best_run` is a named tuple; passing it to pd.DataFrame directly yields one
# unlabelled column, so write a single row with named columns instead.
best_run_A_df = pd.DataFrame([{
    "run_id": best_run.run_id,
    "objective": best_run.objective,
    **best_run.hyperparameters,
}])
best_run_A_df.to_csv("hyperpara_A.csv", index=False)

## Training with best Tuning Parameters


In [None]:
# use best training args and train with those
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=best_run.hyperparameters["learning_rate"],
    per_device_train_batch_size=best_run.hyperparameters["per_device_train_batch_size"],
    num_train_epochs=best_run.hyperparameters["num_train_epochs"],
    weight_decay=best_run.hyperparameters["weight_decay"],
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_eval_batch_size=16,
    # NOTE(review): no `metric_for_best_model` is set, so per the TrainingArguments
    # docs the checkpoint restored at the end is chosen by validation loss, while
    # the hyperparameter search optimised accuracy - confirm this is intended.
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

# Final run on the full tokenized train / validation splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class` is its replacement
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7825,0.791926,0.696212,0.685893
2,0.5035,0.744961,0.729337,0.726224
3,0.2739,0.901958,0.737407,0.730929
4,0.1238,1.193077,0.737212,0.731952


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=18004, training_loss=0.4964650215155229, metrics={'train_runtime': 1369.2838, 'train_samples_per_second': 210.355, 'train_steps_per_second': 13.148, 'total_flos': 7.578817772978995e+16, 'train_loss': 0.4964650215155229, 'epoch': 4.0})

In [None]:
# Persist the trained model and its tokenizer side by side in one folder
final_model_dir = "classifier_final_A/"
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

('classifier_final_A/tokenizer_config.json',
 'classifier_final_A/special_tokens_map.json',
 'classifier_final_A/vocab.txt',
 'classifier_final_A/added_tokens.json',
 'classifier_final_A/tokenizer.json')

In [None]:
# Tokenization and prediction helpers for the validation and test sets are
# defined below; first switch the trained model to evaluation mode.
model.eval()

def tokenize_sliding_windows(example: Dict[str, Any]) -> Dict[str, Any]:
    """Tokenize one example's "speech_text" into fixed-length windows.

    Uses the same window length, stride and padding as the training
    tokenization, but asks the tokenizer for PyTorch tensors.

    Args:
        example: mapping that contains the key "speech_text".

    Returns:
        Whatever the tokenizer returns for that text with these settings.
    """
    window_options = dict(
        truncation=True,
        padding="max_length",
        max_length=WINDOW_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        return_tensors="pt",
    )
    return tokenizer(example["speech_text"], **window_options)

# function that gives probabilities for all categories and prediction
def predict_proba_for_dataset(dataset: Dataset, label_names) -> List[Dict[str, Any]]:
    """Predict class probabilities and a label for every example of a dataset.

    Each example is split into windows by `tokenize_sliding_windows`, all of its
    windows go through the global `model` in one forward pass, and the softmax
    probabilities are averaged over the windows.

    Args:
        dataset: iterable of examples with the keys "speech_text" and "label".
        label_names: class names indexed by class id; the caller must pass them
            in the same order the model was trained with.

    Returns:
        One dict per example with "probs" (averaged probabilities as a list),
        "label" (the example's own label) and "prediction_label" (the name at
        the index of the highest averaged probability).
    """
    results = []

    for example in tqdm(dataset):
        windows = tokenize_sliding_windows(example)
        ids = windows["input_ids"].to(model.device)
        mask = windows["attention_mask"].to(model.device)

        # inference only: no gradients are needed
        with torch.no_grad():
            logits = model(input_ids=ids, attention_mask=mask).logits
            window_probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # average over the window axis -> one probability vector per example
        speech_probs = window_probs.mean(axis=0).tolist()
        results.append({"probs": speech_probs, "label": example["label"]})

    # second pass: attach the name of the most probable class to every entry
    for entry in results:
        entry["prediction_label"] = label_names[int(np.argmax(entry["probs"]))]
    return results


### Validation

In [17]:
# Predict on the validation split; the class names are passed in sorted order,
# the same order `label2id` was built from.
results_val_A = predict_proba_for_dataset(val_data, sorted(label_names))

  return forward_call(*args, **kwargs)
100%|██████████| 5418/5418 [00:42<00:00, 126.45it/s]


In [18]:
# Collect the per-example validation results (probs, label, prediction_label) in a DataFrame
results_val_A_df = pd.DataFrame(results_val_A)

In [19]:
# Write the validation results to CSV without the index column
results_val_A_df.to_csv("classifier_final_A_validation_results.csv",index=False)

### TEST

In [20]:
# Predict on the test split; the class names are passed in sorted order,
# the same order `label2id` was built from.
results_test_A = predict_proba_for_dataset(test_data, sorted(label_names))

100%|██████████| 5418/5418 [00:42<00:00, 126.78it/s]


In [21]:
# Collect the per-example test results (probs, label, prediction_label) in a DataFrame
results_test_A_df = pd.DataFrame(results_test_A)

In [22]:
# Write the test results to CSV without the index column
results_test_A_df.to_csv("classifier_final_A_test_results.csv", index=False)

In [None]:
# Save all results in zip to easily download from colab
def zip_results(zip_path, model_dir, extra_files):
    """Bundle a model folder and a list of result files into one zip archive.

    Args:
        zip_path: path of the archive to create (an existing file is overwritten).
        model_dir: folder whose files are added recursively. The folder name is
            kept inside the archive (e.g. "classifier_final_A/config.json"),
            with or without a trailing slash on `model_dir`.
        extra_files: further file paths to add; paths that do not exist are skipped.
    """
    # Normalise first: with a trailing slash, os.path.dirname() returns the folder
    # itself instead of its parent, which would flatten the model files into the
    # archive root.
    model_dir = os.path.normpath(model_dir)
    parent_dir = os.path.dirname(model_dir) or os.curdir

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Add model/tokenizer folder
        for root, _dirs, files in os.walk(model_dir):
            for file in files:
                filepath = os.path.join(root, file)
                zipf.write(filepath, arcname=os.path.relpath(filepath, start=parent_dir))

        # Add all CSVs that were actually produced
        for extra_file in extra_files:
            if os.path.exists(extra_file):
                zipf.write(extra_file)


zip_filename = "allresultsA.zip"

zip_results(
    zip_filename,
    "classifier_final_A/",
    ["hyperpara_A.csv", "classifier_final_A_validation_results.csv", "classifier_final_A_test_results.csv"],
)