In [1]:
# ==============================================================================
#  CORE LIBRARIES & INITIAL SETUP
# ==============================================================================

# --- General Purpose Imports ---
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split

# --- PyTorch ---
# Consolidating torch import for deep learning operations.
import torch

# --- spaCy ---
# Load the German spaCy model for NLP tasks like tokenization and POS tagging.
# This 'nlp' object will be used globally in the notebook.
import spacy
nlp = spacy.load("de_core_news_lg")


# ==============================================================================
#  HUGGING FACE LIBRARY IMPORTS
# ==============================================================================

# --- Core Transformers ---
# Import main classes for models, tokenizers, and the training pipeline.
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GenerationConfig
)

# --- TRL (Transformer Reinforcement Learning) ---
# Import specific classes for PPO training and value-head models.
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

# --- Datasets ---
# Import classes for handling and manipulating datasets efficiently.
from datasets import Dataset, DatasetDict


# ==============================================================================
#  ENVIRONMENT & LOGGING CONFIGURATION
# ==============================================================================

# --- Set Log Verbosity ---
# Reduce the log output from the transformers library to keep the console clean.
# This will only show warnings and errors, hiding routine progress info.
import transformers
transformers.logging.set_verbosity_warning()

# --- Disable Tokenizer Parallelism ---
# Set an environment variable to prevent a deadlock warning/error when using
# `datasets.map()` with multiprocessing. The `datasets` library will handle
# the parallelism, so the tokenizer's internal parallelism is disabled.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Configure Hardware Device (CPU vs. Apple Silicon GPU) ---
# Check for the availability of Apple's Metal Performance Shaders (MPS) for GPU
# acceleration and set the global device accordingly.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

W0925 17:34:31.716000 5276 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Using device: mps


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
# This special import uses the 'ipynb' library to load functions directly
# from the `reward_computation.ipynb` file. This helps keep the project organized
# by separating the complex reward logic from the main training pipeline.
from ipynb.fs.full.reward_computation import (
    # The main function that combines all sub-scores into a final weighted reward.
    compute_reward, 
    # Calculates the score based on adherence to simplification rules (e.g., no compounds).
    rule_compliance_score, 
    # Calculates the score for grammatical correctness using language_tool_python.
    calculate_grammar_score,
    # Calculates the meaning preservation score using the SBERT model.
    calculate_semantic_similarity
)

Loading data file - german_dict/german_utf8.dic
1.0
Loading BERT model...


No sentence-transformers model found with name deepset/gbert-large. Creating a new one with mean pooling.


config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Model similarity function set to: 'dot'
Model loaded.


## Choice for the  RewardModel

AutoModelForSequenceClassification
    - a family of AutoModel classes by huggingface
  - > loads a pretrained encoder and attach a classification head on top
That classification head is
 - for classification: outputs logits over N classes (e.g. positive/negative)
 - for regression: if I set num_labels=1, it outputs a single scalar

### BERT base vs DistilBERT

BERT base uncased
 - higher accuracy
 - 12 layers, 110M parameters.
 - High accuracy, but slower and heavier to train/infer.

DistilBERT - _Picking This one_
- faster PPO training
- A distilled (compressed) version of BERT.
- ~40% smaller, ~60% faster, only ~3% loss in accuracy.
- Often preferred as a reward model because PPO will call it a LOT (every generation gets scored).
- Faster inference = cheaper RL training.

In [3]:
# Update your model_init function: Modify your model initialization to move the model to the MPS device right after it's loaded.
def model_init():
    model_name = "distilbert-base-german-cased"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=1, problem_type="regression"
    )
    return model.to(device) 

## Hyperparameter


### How Optuna Smartly Finds Hyperparameters

Optuna doesn't use a simple grid or random search. By default, it uses a smart Bayesian optimization algorithm called **TPE (Tree-structured Parzen Estimator)** that learns from past results to make better guesses over time.

#### The TPE Process
1.  **Explore:** It starts with a few random trials to gather initial data.
2.  **Model:** It divides the results into a "good" group (e.g., top 25% of scores) and a "bad" group.
3.  **Suggest:** For the next trial, it suggests a new set of hyperparameters that are statistically more likely to be in the "good" group than the "bad" group.

#### Analogy: Finding the Best Setting 💡
* **Grid Search:** Checks every single possible setting. Exhaustive but very slow.
* **Random Search:** Tries random settings. Better than grid search, but doesn't learn from its mistakes.
* **Optuna (TPE):** Plays "Hot and Cold." It uses the feedback from previous trials ("warmer" or "colder") to make its next guess much more intelligent, focusing its search on the most promising areas of the hyperparameter space.

In [4]:
# ==============================================================================
#  HYPERPARAMETER SEARCH CONFIGURATION
# ==============================================================================

# Define the number of different hyperparameter combinations for Optuna to try.
N_TRIALS = 5

# Define the search space for each hyperparameter in a centralized dictionary.
HP_SEARCH_SPACE = {
    "learning_rate": {
        "type": "float",
        "low": 1e-5,
        "high": 5e-5,
        "log": True,
        # What it is: Controls how much the model's weights are updated during training.
        # Why it's important: It's the most critical hyperparameter. Too high, and the model
        # won't converge; too low, and training will be extremely slow. A log scale
        # is used because the optimal value can vary by orders of magnitude.
    },
    "num_train_epochs": {
        "type": "int",
        "low": 6,
        "high": 15,
        # What it is: The total number of times the training algorithm will pass
        # through the entire training dataset.
        # Why it's important: Balances underfitting and overfitting. Too few epochs,
        # and the model won't learn enough; too many, and it may memorize the
        # training data and perform poorly on new data.
    },
    "per_device_train_batch_size": {
        "type": "categorical",
        "choices": [32, 64],
        # What it is: The number of training examples used in a single forward/backward pass.
        # Why it's important: Affects training stability and memory usage. Larger
        # batches provide more stable gradient estimates but require more GPU RAM.
    },
    "weight_decay": {
        "type": "float",
        "low": 0.0,
        "high": 0.1,
        # What it is: A regularization technique that penalizes large weights in the model.
        # Why it's important: Helps prevent overfitting by keeping the model's weights small
        # and simple, improving its ability to generalize to new data.
    },
    "lr_scheduler_type": {
        "type": "categorical",
        "choices": ["linear", "cosine"],
        # What it is: The strategy for changing the learning rate during training.
        # Why it's important: A good schedule (like a linear decay) helps the model
        # converge faster and more reliably than a constant learning rate.
    },
    "warmup_ratio": {
        "type": "float",
        "low": 0.0,
        "high": 0.1,
        # What it is: The fraction of total training steps used for a "warmup" phase.
        # Why it's important: The learning rate starts at 0 and slowly increases to its
        # target value. This prevents large, unstable updates at the beginning of
        # training, allowing the model to stabilize first.
    },
    "gradient_accumulation_steps": {
        "type": "categorical",
        "choices": [1, 2],
        # What it is: The number of smaller batches to process before performing a
        # single model weight update.
        # Why it's important: It's a trick to simulate a larger batch size without
        # using more memory. An effective batch size of 32 (16 * 2) can lead
        # to more stable training.
    }
}

In [5]:

"""
def model_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True), # typical 1e-5 to 3e-5 WHAT TO ADD
        "num_train_epochs": trial.suggest_int("num_train_epochs", 10, 10), # Kept it small for speed, HOW TO SET??
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.0),
    }
"""

def model_hp_space(trial):
    """
    Dynamically creates the hyperparameter search space for an Optuna trial by
    reading from the global HP_SEARCH_SPACE configuration dictionary.
    """
    params = {}
    # Loop through each parameter defined in the configuration.
    for name, config in HP_SEARCH_SPACE.items():
        param_type = config["type"]
        
        # Call the appropriate Optuna `suggest` method based on the parameter's type.
        if param_type == "float":
            params[name] = trial.suggest_float(
                name, config["low"], config["high"], log=config.get("log", False)
            )
        elif param_type == "int":
            params[name] = trial.suggest_int(
                name, config["low"], config["high"]
            )
        elif param_type == "categorical":
            params[name] = trial.suggest_categorical(
                name, config["choices"]
            )
            
    return params

## CANDIDATE_WEIGHTS

In [6]:
CANDIDATE_WEIGHTS = [
    {"name": "balanced", "weights": {"rules_score": 0.5, "meaning_score": 0.25, "grammar_score": 0.25}}, #baseline
    {"name": "rules_heavy", "weights": {"rules_score": 0.7, "meaning_score": 0.2, "grammar_score": 0.1}},
    {"name": "meaning_heavy", "weights": {"rules_score": 0.2, "meaning_score": 0.7, "grammar_score": 0.1}},
    {"name": "grammar_focused", "weights": {"rules_score": 0.2, "meaning_score": 0.1, "grammar_score": 0.7}},
]

## Data Load & Preprocessing

In [7]:
# Predefine the preprocessing function
def preprocess_function(examples, tokenizer):
    """Tokenizes the 'simplified' text column."""
    # Important: Tokenization is applied on the 'simplified' column now, which becomes the 'text' for the RM
    return tokenizer(examples["simplified"], truncation=True, padding="max_length", max_length=128)

In [8]:
# --- 1. Load, Split, and Tokenize Data ONCE ---
print("--- Loading and tokenizing data once ---")
df = pd.read_csv("data/ordered_simplifications_with_rules.csv", index_col=0)
df.info()

--- Loading and tokenizing data once ---
<class 'pandas.core.frame.DataFrame'>
Index: 11107 entries, 1 to 16546
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   original_sentence     11107 non-null  object
 1   final_simplification  11107 non-null  object
 2   applied_rules         11107 non-null  object
dtypes: object(3)
memory usage: 347.1+ KB


In [9]:
df.head()

Unnamed: 0_level_0,original_sentence,final_simplification,applied_rules
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Der Iran wird teilweise aus dem Atom-Abkommen ...,Der Iran wird teilweise aus dem Atom-Abkommen ...,['normalize_verb_tense']
2,Brüssel Ursula von der Leyen ist die Präsi·Den...,Brüssel Ursula von der Leyen ist die Präsi·Den...,['split_compound']
3,Am Mittwoch hat sie ihre 1. Rede zur Lage der ...,Am Mittwoch hat sie ihre 1. Rede zur Lage der ...,['convert_word_to_number']
4,Bis zum Jahr 2030 soll es in der Europäische U...,Bis zum Jahr 2030 soll es in der Europäische U...,['normalize_verb_tense']
5,"Das ist sehr viel, denn in den letzten 29 Jahr...","Das hat ist sehr viel, denn in den letzten 29 ...",['normalize_verb_tense']


In [10]:
df.rename(columns={"original_sentence": "original", "final_simplification": "simplified"}, inplace=True)

In [11]:
#use for Smaller Dataset
#df = df.sample(n=50, random_state=42) #reduce size for testing

In [12]:
# Create the initial dataset from the full DataFrame
full_dataset = Dataset.from_pandas(df)
# Split it into train and test sets
split_dataset = full_dataset.train_test_split(test_size=0.15, seed=42)

In [13]:
# Load the tokenizer and apply tokenization on both split sets
tokenizer_rm = AutoTokenizer.from_pretrained("distilbert-base-german-cased")

# Tokenize the base train and test sets without the labels
# We keep the original text columns to calculate rewards later
tokenized_train_base = split_dataset["train"].map(
    lambda examples: preprocess_function(examples, tokenizer_rm),
    batched=True
)
tokenized_test_base = split_dataset["test"].map(
    lambda examples: preprocess_function(examples, tokenizer_rm),
    batched=True
)
print("--- Data tokenized successfully. Starting grid search... ---")

Map:   0%|          | 0/9440 [00:00<?, ? examples/s]

Map:   0%|          | 0/1667 [00:00<?, ? examples/s]

--- Data tokenized successfully. Starting grid search... ---


## Eval Custom Metrics for regression RM
- for the chosen regression RM model MSE loss is chosen
- the following metrics are also loggeed
  - MSE (Mean Squared Error) → matches your training loss, so you can track consistency.
  - MAE (Mean Absolute Error) → more interpretable (average absolute difference).
  - R² (Coefficient of Determination) → tells you how well your model explains variance (1 = perfect, 0 = baseline).

In [14]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.squeeze()   # shape: (batch,)
    labels = labels.squeeze()

    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)
    r2  = r2_score(labels, preds)

    return {"mse": mse, "mae": mae, "r2": r2}

## Redirecting Standard Output 

In [15]:
"""
rrect: the print statements are coming directly from the german-compound-splitter library itself, which makes them impossible to remove by editing your own code.

This is a common issue with older or research-focused libraries. The definitive way to solve this is to temporarily redirect all output from the subprocesses to a "black hole" so the print statements are silenced.

The Solution: Redirecting Standard Output
We will create a small helper class that redirects all system output (like print) to /dev/null (an operating system-level trash can) just for the duration of the .map() call. This is an advanced but very effective technique.

Step 1: Add the Helper Class
In a new cell in your supervised_fine_tuningSFT.ipynb notebook, before your main training loop, copy and paste this helper class.
Step 2: Update Your .map() Call
Now, in your main training loop, you will wrap your .map() call inside this new context manager.

"""

import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr():
    """A context manager that redirects stdout and stderr to devnull"""
    # This works on Mac/Linux to silence all output, including from underlying libraries
    with open(os.devnull, 'w') as fnull:
        old_stdout, old_stderr = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = fnull, fnull
        try:
            yield
        finally:
            sys.stdout, sys.stderr = old_stdout, old_stderr

## Helper Functions for Reward Calculation

In [16]:
# ==============================================================================
#  HELPER FUNCTIONS FOR REWARD CALCULATION
# ==============================================================================

def compute_stable_parts_batched(examples):
    """
    Calculates the reward components that are stable during multiprocessing.
    
    This function processes a BATCH of text examples for high efficiency. It's
    specifically designed to be the target function for `datasets.map()` when
    running in parallel. It handles the spaCy and language_tool calculations.
    
    Args:
        examples (dict): A dictionary from `datasets.map` where each key
                         (e.g., 'simplified') maps to a list of values.
                         
    Returns:
        dict: A dictionary containing lists of the calculated scores, which
              will be added as new columns to the dataset.
    """
    # Extract the list of sentences for this batch.
    simplified_batch = examples['simplified']
    
    # Use spaCy's nlp.pipe() for highly optimized, batched NLP processing.
    # It's significantly faster than calling nlp() on each sentence individually.
    # Note: nlp.pipe() returns a generator. Since generators are consumed after one
    # iteration, we must create two separate ones for the two list comprehensions below.
    docs_for_rules = nlp.pipe(simplified_batch)
    docs_for_grammar = nlp.pipe(simplified_batch)
    
    # Calculate the scores for each document in the batch.
    rules_scores = [rule_compliance_score(doc) for doc in docs_for_rules]
    grammar_scores = [calculate_grammar_score(doc.text) for doc in docs_for_grammar]

    # The returned dictionary's keys will become the new column names in the dataset.
    return {"rules_score": rules_scores, "grammar_score": grammar_scores}


def calculate_rewards_for_split(dataset_split, weights, num_proc, desc_prefix=""):
    """
    Orchestrates the full two-stage reward calculation for a given dataset split.
    
    This function encapsulates the entire complex reward logic:
    - Stage 1: Computes stable scores (rules, grammar) in parallel for speed.
    - Stage 2: Computes the SBERT meaning score sequentially to avoid crashes.
    - Stage 3: Combines the scores from both stages using the provided weights.
    
    Args:
        dataset_split (Dataset): The dataset slice (e.g., train or test) to process.
        weights (dict): The dictionary of weights for the current grid search configuration.
        num_proc (int): The number of CPU cores to use for Stage 1.
        desc_prefix (str): A string (like "train" or "test") to label the progress bars.
        
    Returns:
        list: A list of the final, weighted reward scores for the dataset split.
    """
    # --- STAGE 1: Parallel calculation of stable scores ---
    # We silence all stdout/stderr from subprocesses to prevent the noisy
    # `german-compound-splitter` library from creating an I/O bottleneck.
    with suppress_stdout_stderr():
        stable_scores_ds = dataset_split.map(
            compute_stable_parts_batched,
            batched=True,
            batch_size=100,
            num_proc=num_proc,
            desc=f"Calculating stable scores ({desc_prefix})"
        )

    # --- STAGE 2: Sequential calculation of the SBERT meaning score ---
    # This part is run on a single core because the SentenceTransformer model
    # was found to be incompatible with multiprocessing, causing crashes.
    meaning_scores = [
        calculate_semantic_similarity(ex['original'], ex['simplified']) 
        for ex in tqdm(dataset_split, desc=f"Calculating meaning scores ({desc_prefix})")
    ]
    
    # --- STAGE 3: Combine all calculated scores ---
    # This loop combines the results from the parallel and sequential stages
    # into a final, weighted reward score for each example.
    final_rewards = []
    for i in range(len(dataset_split)):
        r_s = stable_scores_ds[i]["rules_score"]  # From Stage 1
        g_s = stable_scores_ds[i]["grammar_score"] # From Stage 1
        m_s = meaning_scores[i]                   # From Stage 2
        
        # Apply the weights for the current experiment configuration.
        reward = (weights["rules_score"] * r_s +
                  weights["grammar_score"] * g_s +
                  weights["meaning_score"] * m_s)
        final_rewards.append(reward)
        
    return final_rewards

## Helper Functions for Training Pipeline 

In [17]:

# ==============================================================================
#  HELPER FUNCTIONS FOR THE TRAINING PIPELINE
# ==============================================================================

def prepare_datasets_for_run(tokenized_train_base, tokenized_test_base, train_rewards, test_rewards):
    """Adds reward labels to the tokenized datasets and sets the final PyTorch format."""
    # Add the calculated rewards as the 'labels' column for this training run.
    train_dataset = tokenized_train_base.add_column("labels", train_rewards)
    test_dataset = tokenized_test_base.add_column("labels", test_rewards)
    
    # Set the dataset format to PyTorch tensors for compatibility with the Trainer.
    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    
    return train_dataset, test_dataset


def run_hyperparameter_search(trainer, config_name):
    """Runs the hyperparameter search for a given trainer and returns the best run."""
    print(f"--- Running hyperparameter search for {config_name} ---")
    
    # This call now uses the N_TRIALS variable defined in your configuration.
    best_run = trainer.hyperparameter_search(
        direction="minimize",      # We want to minimize the objective (MSE).
        hp_space=model_hp_space,   # The function that builds the search space.
        n_trials=N_TRIALS          # The number of trials, controlled from the top of the notebook.
    )
    print(f"Best run for {config_name}: {best_run}")
    return best_run


def train_and_save_final_model(trainer, best_run, output_dir_base, config_name):
    """Trains the final model using the best hyperparameters and saves it."""
    print(f"--- Training final model for {config_name} with best hyperparameters ---")
    
    # Apply the best parameters found during the search to the trainer's arguments.
    for k, v in best_run.hyperparameters.items():
        setattr(trainer.args, k, v)
    
    # Start the final training run.
    trainer.train()

    # Save the final, optimized model to a unique directory.
    final_output_dir = f"{output_dir_base}_final"
    trainer.save_model(final_output_dir)
    print(f"--- Saved final optimized model to {final_output_dir} ---\n")

"""
Reusable Summary Function:  It takes a single result dictionary and prints it in a clean format.
"""
def print_run_summary(result):
    """Prints a formatted summary for a single training run."""
    print(f"\n--- Summary for Configuration: {result['config_name']} ---")
    print("  Best Hyperparameters Found:")
    for param, value in result['best_hyperparameters'].items():
        print(f"    - {param}: {value}")
    
    print("  Final Evaluation Metrics:")
    for metric, value in result['final_metrics'].items():
        # We only print metrics that start with 'eval_' and format them nicely
        if 'eval_' in metric:
            print(f"    - {metric}: {value:.4f}")
    print("-" * 50)


## Trainig Loop with Grid Search for all 4 CANDIDATE_WEIGHTS

In [18]:
# ==============================================================================
#  MAIN GRID SEARCH AND TRAINING LOOP 
# ==============================================================================

# --- Initialize a list to store results ---
all_results = []

# This main loop iterates through each set of candidate weights, calculates the
# corresponding rewards, and then (in subsequent code) trains a reward model.
for config in CANDIDATE_WEIGHTS:
    # --- SETUP FOR THE CURRENT LOOP ITERATION ---
    config_name = config["name"]
    weights = config["weights"]
    output_dir_base = f"rm_out_{config_name}"

    print(f"\n--- Processing configuration: {config_name} ---")
    print(f"Weights: {weights}")

    # Set the number of CPU cores for parallel processing.
    num_cores_to_use = 12

    # --- REWARD CALCULATION ---
    # Call reusable orchestration function to get the rewards for both the
    # training and test sets for this specific weight configuration.
    
    print("--- Calculating rewards for the TRAINING set ---")
    train_rewards = calculate_rewards_for_split(
        split_dataset["train"], 
        weights, 
        num_cores_to_use, 
        desc_prefix="train"
    )

    print("--- Calculating rewards for the TEST set ---")
    test_rewards = calculate_rewards_for_split(
        split_dataset["test"], 
        weights, 
        num_cores_to_use, 
        desc_prefix="test"
    )
    
    print(f"--- Finished calculating rewards for {config_name} ---")


    # The 'train_rewards' and 'test_rewards' variables are now ready to be
    # used to train the reward model for this specific 'config'.

    # ==============================================================================
    #  POST-REWARD CALCULATION: DATASET PREP & MODEL TRAINING
    # ==============================================================================
    # This code block is inside the `for config in CANDIDATE_WEIGHTS:` loop.
    # The 'train_rewards' and 'test_rewards' variables are now ready.

    # --- b. Dataset Preparation ---
    # Call our helper function to create the final datasets for this specific training run.
    # This function takes the pre-tokenized text and merges it with the `train_rewards`
    # and `test_rewards` lists that were just calculated using the current `weights`.
    train_ds, test_ds = prepare_datasets_for_run(
        tokenized_train_base, tokenized_test_base, train_rewards, test_rewards
    )

    # --- c. Trainer Setup ---
    # First, we configure the training process using the `TrainingArguments` class.
    # This object holds all the settings for how the model will be trained and evaluated.
    training_args = TrainingArguments(
        output_dir=output_dir_base,      # Directory to save model checkpoints and outputs.
        use_mps_device=True,             # Explicitly leverage the Apple Silicon GPU (Metal Performance Shaders).
        evaluation_strategy="epoch",     # Run an evaluation on the test set at the end of each training epoch.
        save_strategy="epoch",           # Save a model checkpoint at the end of each epoch.
        load_best_model_at_end=True,     # After training, automatically load the checkpoint with the best performance.
        metric_for_best_model="mse",     # Use Mean Squared Error (MSE) as the key metric to determine the "best" model.
        greater_is_better=False,         # Specifies that a lower value for `metric_for_best_model` (MSE) is better.
        logging_strategy="epoch",        # How often to log training metrics. "epoch" is less noisy than "steps".,
        dataloader_pin_memory=False,     # Disable the feature and silence the warning since not applicable for Mac with an M-series chip uses a Unified Memory Architecture 
        save_total_limit=1, # Only keep the single best checkpoint on disk

    )

    # Next, we initialize the `Trainer` object.
    # This is the main orchestrator from the Hugging Face library that brings together
    # the model, datasets, and training configuration.
    trainer = Trainer(
        # `model_init` is a function that returns a fresh, untrained model.
        # This is crucial for hyperparameter search to ensure each trial starts from scratch.
        model_init=model_init,
        args=training_args,                 # The training configuration we just defined.
        train_dataset=train_ds,             # The prepared training dataset.
        eval_dataset=test_ds,               # The prepared test dataset.
        compute_metrics=compute_metrics,    # The function to calculate metrics like MSE, MAE, and R².
        tokenizer=tokenizer_rm,             # The tokenizer, used for data collation.
    )

    # --- d. Hyperparameter Search and Final Training ---
    # Now we execute the main logic using our custom helper functions for clarity.
    # First, find the best set of hyperparameters for the current reward model configuration.
    best_run = run_hyperparameter_search(trainer, config_name)
    # Then, use those best hyperparameters to train a final, optimized model.
    train_and_save_final_model(trainer, best_run, output_dir_base, config_name)

    # --- e. Evaluate the Final Model and Store Results for Summary ---
    # After the final model for this configuration has been trained, we run a final
    # evaluation on the test set to get its performance metrics.
    print(f"--- Evaluating final model for {config_name} ---")
    final_metrics = trainer.evaluate()

    # We append the results of this entire run into a list. This allows us to
    # print a full summary of all configurations after the main loop has finished.
    current_result = {
        "config_name": config_name,
        "best_hyperparameters": best_run.hyperparameters,
        "final_metrics": final_metrics
    }
    all_results.append(current_result)
    print_run_summary(current_result)


--- Processing configuration: balanced ---
Weights: {'rules_score': 0.5, 'meaning_score': 0.25, 'grammar_score': 0.25}
--- Calculating rewards for the TRAINING set ---


Calculating stable scores (train) (num_proc=12):   0%|          | 0/9440 [00:00<?, ? examples/s]

Calculating meaning scores (train):   0%|          | 0/9440 [00:00<?, ?it/s]

--- Calculating rewards for the TEST set ---


Calculating stable scores (test) (num_proc=12):   0%|          | 0/1667 [00:00<?, ? examples/s]

Calculating meaning scores (test):   0%|          | 0/1667 [00:00<?, ?it/s]

--- Finished calculating rewards for balanced ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-09-25 17:42:49,547] A new study created in memory with name: no-name-9e81224d-b3f9-490d-b1ef-5d47e12afa1c


--- Running hyperparameter search for balanced ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3835 [00:00<?, ?it/s]

{'loss': 0.0889, 'grad_norm': 0.6149807572364807, 'learning_rate': 1.1738029500704622e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0017206856282427907, 'eval_mse': 0.0017206858610734344, 'eval_mae': 0.031871020793914795, 'eval_r2': 0.4916765093803406, 'eval_runtime': 5.3921, 'eval_samples_per_second': 309.154, 'eval_steps_per_second': 38.76, 'epoch': 1.0}
{'loss': 0.0035, 'grad_norm': 0.6118635535240173, 'learning_rate': 1.0759860375645902e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011559044942259789, 'eval_mse': 0.0011559044942259789, 'eval_mae': 0.022543255239725113, 'eval_r2': 0.6585237383842468, 'eval_runtime': 5.3066, 'eval_samples_per_second': 314.138, 'eval_steps_per_second': 39.385, 'epoch': 2.0}
{'loss': 0.0025, 'grad_norm': 0.1685061901807785, 'learning_rate': 9.781691250587186e-06, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008749928092584014, 'eval_mse': 0.0008749927510507405, 'eval_mae': 0.01980210281908512, 'eval_r2': 0.7415103912353516, 'eval_runtime': 5.3696, 'eval_samples_per_second': 310.453, 'eval_steps_per_second': 38.923, 'epoch': 3.0}
{'loss': 0.0022, 'grad_norm': 0.2651714086532593, 'learning_rate': 8.803522125528467e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007781535969115794, 'eval_mse': 0.0007781537133269012, 'eval_mae': 0.017810635268688202, 'eval_r2': 0.7701185345649719, 'eval_runtime': 5.3561, 'eval_samples_per_second': 311.235, 'eval_steps_per_second': 39.021, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.10033570975065231, 'learning_rate': 7.825353000469749e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008618143037892878, 'eval_mse': 0.0008618143037892878, 'eval_mae': 0.019128547981381416, 'eval_r2': 0.7454035878181458, 'eval_runtime': 5.3581, 'eval_samples_per_second': 311.115, 'eval_steps_per_second': 39.006, 'epoch': 5.0}
{'loss': 0.0018, 'grad_norm': 0.0912465825676918, 'learning_rate': 6.84718387541103e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007076786132529378, 'eval_mse': 0.0007076786132529378, 'eval_mae': 0.015969645231962204, 'eval_r2': 0.7909382581710815, 'eval_runtime': 5.3369, 'eval_samples_per_second': 312.354, 'eval_steps_per_second': 39.161, 'epoch': 6.0}
{'loss': 0.0017, 'grad_norm': 0.09280157834291458, 'learning_rate': 5.869014750352311e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006936481804586947, 'eval_mse': 0.0006936481222510338, 'eval_mae': 0.015338687226176262, 'eval_r2': 0.7950831055641174, 'eval_runtime': 5.3142, 'eval_samples_per_second': 313.689, 'eval_steps_per_second': 39.329, 'epoch': 7.0}
{'loss': 0.0017, 'grad_norm': 0.2732129693031311, 'learning_rate': 4.890845625293593e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006802345160394907, 'eval_mse': 0.0006802345160394907, 'eval_mae': 0.016920821741223335, 'eval_r2': 0.799045741558075, 'eval_runtime': 5.2949, 'eval_samples_per_second': 314.833, 'eval_steps_per_second': 39.472, 'epoch': 8.0}
{'loss': 0.0016, 'grad_norm': 0.07469335943460464, 'learning_rate': 3.912676500234874e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006230058497749269, 'eval_mse': 0.0006230058497749269, 'eval_mae': 0.014605322852730751, 'eval_r2': 0.8159521818161011, 'eval_runtime': 5.3309, 'eval_samples_per_second': 312.707, 'eval_steps_per_second': 39.206, 'epoch': 9.0}
{'loss': 0.0015, 'grad_norm': 0.1982804387807846, 'learning_rate': 2.9345073751761555e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006190111744217575, 'eval_mse': 0.0006190111744217575, 'eval_mae': 0.01463674008846283, 'eval_r2': 0.817132294178009, 'eval_runtime': 5.329, 'eval_samples_per_second': 312.817, 'eval_steps_per_second': 39.219, 'epoch': 10.0}
{'loss': 0.0015, 'grad_norm': 0.28472477197647095, 'learning_rate': 1.956338250117437e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006351202609948814, 'eval_mse': 0.0006351202609948814, 'eval_mae': 0.01444779708981514, 'eval_r2': 0.8123733401298523, 'eval_runtime': 5.3256, 'eval_samples_per_second': 313.018, 'eval_steps_per_second': 39.245, 'epoch': 11.0}
{'loss': 0.0015, 'grad_norm': 0.08551666140556335, 'learning_rate': 9.781691250587186e-07, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006090293172746897, 'eval_mse': 0.0006090293172746897, 'eval_mae': 0.014034810476005077, 'eval_r2': 0.8200811147689819, 'eval_runtime': 5.3016, 'eval_samples_per_second': 314.433, 'eval_steps_per_second': 39.422, 'epoch': 12.0}
{'loss': 0.0014, 'grad_norm': 0.10726308822631836, 'learning_rate': 0.0, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006042738677933812, 'eval_mse': 0.000604273984208703, 'eval_mae': 0.014085013419389725, 'eval_r2': 0.8214859366416931, 'eval_runtime': 5.3139, 'eval_samples_per_second': 313.706, 'eval_steps_per_second': 39.331, 'epoch': 13.0}


[I 2025-09-25 18:01:11,198] Trial 0 finished with value: 0.8361752240452915 and parameters: {'learning_rate': 1.2192297874036976e-05, 'num_train_epochs': 13, 'per_device_train_batch_size': 32, 'weight_decay': 0.0751880734280757, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.04104434758331391, 'gradient_accumulation_steps': 1}. Best is trial 0 with value: 0.8361752240452915.


{'train_runtime': 1101.3329, 'train_samples_per_second': 111.429, 'train_steps_per_second': 3.482, 'train_loss': 0.008579036037800676, 'epoch': 13.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1184 [00:00<?, ?it/s]

{'loss': 0.1011, 'grad_norm': 0.20040953159332275, 'learning_rate': 2.197706472565551e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0020409133285284042, 'eval_mse': 0.0020409133285284042, 'eval_mae': 0.03243159502744675, 'eval_r2': 0.39707523584365845, 'eval_runtime': 5.3538, 'eval_samples_per_second': 311.367, 'eval_steps_per_second': 39.038, 'epoch': 1.0}
{'loss': 0.0036, 'grad_norm': 0.1362820714712143, 'learning_rate': 2.025971344420903e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010056666797026992, 'eval_mse': 0.0010056666797026992, 'eval_mae': 0.021795034408569336, 'eval_r2': 0.7029068470001221, 'eval_runtime': 5.3374, 'eval_samples_per_second': 312.326, 'eval_steps_per_second': 39.158, 'epoch': 2.0}
{'loss': 0.0025, 'grad_norm': 0.147621288895607, 'learning_rate': 1.6902998607806193e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008282236522063613, 'eval_mse': 0.0008282236522063613, 'eval_mae': 0.01803617551922798, 'eval_r2': 0.7553269267082214, 'eval_runtime': 5.3262, 'eval_samples_per_second': 312.982, 'eval_steps_per_second': 39.24, 'epoch': 3.0}
{'loss': 0.0021, 'grad_norm': 0.11797229945659637, 'learning_rate': 1.250524670645835e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007621173863299191, 'eval_mse': 0.0007621173863299191, 'eval_mae': 0.018159037455916405, 'eval_r2': 0.7748559713363647, 'eval_runtime': 5.326, 'eval_samples_per_second': 312.994, 'eval_steps_per_second': 39.242, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.1349983662366867, 'learning_rate': 7.850346627758324e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007213330245576799, 'eval_mse': 0.0007213330245576799, 'eval_mae': 0.016532139852643013, 'eval_r2': 0.7869044542312622, 'eval_runtime': 5.3435, 'eval_samples_per_second': 311.967, 'eval_steps_per_second': 39.113, 'epoch': 5.0}
{'loss': 0.0018, 'grad_norm': 0.1691134124994278, 'learning_rate': 3.76802331445006e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006965611246414483, 'eval_mse': 0.0006965611246414483, 'eval_mae': 0.01596735417842865, 'eval_r2': 0.7942225337028503, 'eval_runtime': 5.3364, 'eval_samples_per_second': 312.384, 'eval_steps_per_second': 39.165, 'epoch': 6.0}
{'loss': 0.0017, 'grad_norm': 0.11749234050512314, 'learning_rate': 9.859412534462208e-07, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006954515702091157, 'eval_mse': 0.0006954515702091157, 'eval_mae': 0.015524206683039665, 'eval_r2': 0.7945502996444702, 'eval_runtime': 5.3497, 'eval_samples_per_second': 311.605, 'eval_steps_per_second': 39.067, 'epoch': 7.0}
{'loss': 0.0017, 'grad_norm': 0.11794538795948029, 'learning_rate': 0.0, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006860618595965207, 'eval_mse': 0.0006860618595965207, 'eval_mae': 0.015422766096889973, 'eval_r2': 0.7973242402076721, 'eval_runtime': 5.3363, 'eval_samples_per_second': 312.388, 'eval_steps_per_second': 39.166, 'epoch': 8.0}


[I 2025-09-25 18:12:04,483] Trial 1 finished with value: 0.8134330681641586 and parameters: {'learning_rate': 2.21252021330482e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 64, 'weight_decay': 0.08846632434596037, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.07616912324226942, 'gradient_accumulation_steps': 1}. Best is trial 1 with value: 0.8134330681641586.


{'train_runtime': 652.7417, 'train_samples_per_second': 115.697, 'train_steps_per_second': 1.814, 'train_loss': 0.014533578153901003, 'epoch': 8.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2058 [00:00<?, ?it/s]

{'loss': 0.1136, 'grad_norm': 0.44137707352638245, 'learning_rate': 2.376841591046056e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0019201510585844517, 'eval_mse': 0.0019201510585844517, 'eval_mae': 0.030903106555342674, 'eval_r2': 0.4327507019042969, 'eval_runtime': 5.3542, 'eval_samples_per_second': 311.346, 'eval_steps_per_second': 39.035, 'epoch': 1.0}
{'loss': 0.0037, 'grad_norm': 0.15416249632835388, 'learning_rate': 2.1927638540105687e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013897118624299765, 'eval_mse': 0.0013897118624299765, 'eval_mae': 0.029692813754081726, 'eval_r2': 0.5894525647163391, 'eval_runtime': 5.3509, 'eval_samples_per_second': 311.538, 'eval_steps_per_second': 39.059, 'epoch': 2.0}
{'loss': 0.0026, 'grad_norm': 0.19622156023979187, 'learning_rate': 2.0099298854685643e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008721326012164354, 'eval_mse': 0.0008721326012164354, 'eval_mae': 0.02069767750799656, 'eval_r2': 0.7423553466796875, 'eval_runtime': 5.3185, 'eval_samples_per_second': 313.433, 'eval_steps_per_second': 39.297, 'epoch': 3.0}
{'loss': 0.0021, 'grad_norm': 0.08081523329019547, 'learning_rate': 1.825852148433077e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008732465794309974, 'eval_mse': 0.0008732465794309974, 'eval_mae': 0.01893436163663864, 'eval_r2': 0.7420262694358826, 'eval_runtime': 5.3421, 'eval_samples_per_second': 312.05, 'eval_steps_per_second': 39.123, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.34198543429374695, 'learning_rate': 1.6430181798910727e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007269560010172427, 'eval_mse': 0.0007269560010172427, 'eval_mae': 0.016799576580524445, 'eval_r2': 0.7852433323860168, 'eval_runtime': 5.2783, 'eval_samples_per_second': 315.823, 'eval_steps_per_second': 39.596, 'epoch': 5.0}
{'loss': 0.0017, 'grad_norm': 0.2561376094818115, 'learning_rate': 1.4589404428555853e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006541888578794897, 'eval_mse': 0.0006541888578794897, 'eval_mae': 0.01568485051393509, 'eval_r2': 0.8067401051521301, 'eval_runtime': 5.3422, 'eval_samples_per_second': 312.041, 'eval_steps_per_second': 39.122, 'epoch': 6.0}
{'loss': 0.0016, 'grad_norm': 0.06567172706127167, 'learning_rate': 1.276106474313581e-05, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006901092710904777, 'eval_mse': 0.0006901092710904777, 'eval_mae': 0.015422414988279343, 'eval_r2': 0.796128511428833, 'eval_runtime': 5.3016, 'eval_samples_per_second': 314.435, 'eval_steps_per_second': 39.422, 'epoch': 7.0}
{'loss': 0.0016, 'grad_norm': 0.04533021152019501, 'learning_rate': 1.0920287372780937e-05, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006151744746603072, 'eval_mse': 0.0006151744746603072, 'eval_mae': 0.014388353563845158, 'eval_r2': 0.8182656764984131, 'eval_runtime': 5.3353, 'eval_samples_per_second': 312.45, 'eval_steps_per_second': 39.173, 'epoch': 8.0}
{'loss': 0.0015, 'grad_norm': 0.05057739466428757, 'learning_rate': 9.091947687360894e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006102263578213751, 'eval_mse': 0.0006102263578213751, 'eval_mae': 0.013991630636155605, 'eval_r2': 0.8197274804115295, 'eval_runtime': 5.2998, 'eval_samples_per_second': 314.542, 'eval_steps_per_second': 39.436, 'epoch': 9.0}
{'loss': 0.0015, 'grad_norm': 0.05638429895043373, 'learning_rate': 7.2511703170060215e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005901422118768096, 'eval_mse': 0.0005901422118768096, 'eval_mae': 0.013552896678447723, 'eval_r2': 0.8256607055664062, 'eval_runtime': 5.3083, 'eval_samples_per_second': 314.034, 'eval_steps_per_second': 39.372, 'epoch': 10.0}
{'loss': 0.0014, 'grad_norm': 0.14723852276802063, 'learning_rate': 5.422830631585978e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006530344835482538, 'eval_mse': 0.0006530345417559147, 'eval_mae': 0.015092913061380386, 'eval_r2': 0.8070811033248901, 'eval_runtime': 5.2989, 'eval_samples_per_second': 314.592, 'eval_steps_per_second': 39.442, 'epoch': 11.0}
{'loss': 0.0014, 'grad_norm': 0.07500504702329636, 'learning_rate': 3.582053261231105e-06, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005955086089670658, 'eval_mse': 0.0005955085507594049, 'eval_mae': 0.01360662467777729, 'eval_r2': 0.8240754008293152, 'eval_runtime': 5.3138, 'eval_samples_per_second': 313.713, 'eval_steps_per_second': 39.332, 'epoch': 12.0}
{'loss': 0.0014, 'grad_norm': 0.06130663678050041, 'learning_rate': 1.7537135758110616e-06, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005989314522594213, 'eval_mse': 0.0005989314522594213, 'eval_mae': 0.013727019540965557, 'eval_r2': 0.8230642080307007, 'eval_runtime': 5.3065, 'eval_samples_per_second': 314.145, 'eval_steps_per_second': 39.386, 'epoch': 13.0}
{'loss': 0.0013, 'grad_norm': 0.09000684320926666, 'learning_rate': 0.0, 'epoch': 13.95}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005821809754706919, 'eval_mse': 0.0005821809754706919, 'eval_mae': 0.013475881889462471, 'eval_r2': 0.8280125856399536, 'eval_runtime': 5.321, 'eval_samples_per_second': 313.285, 'eval_steps_per_second': 39.278, 'epoch': 13.95}


[I 2025-09-25 18:30:38,263] Trial 2 finished with value: 0.8420706485048868 and parameters: {'learning_rate': 2.4029607294091994e-05, 'num_train_epochs': 14, 'per_device_train_batch_size': 32, 'weight_decay': 0.03407631488054175, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0609905950043846, 'gradient_accumulation_steps': 2}. Best is trial 1 with value: 0.8134330681641586.


{'train_runtime': 1113.2578, 'train_samples_per_second': 118.715, 'train_steps_per_second': 1.849, 'train_loss': 0.009814016940932, 'epoch': 13.95}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/740 [00:00<?, ?it/s]

{'loss': 0.092, 'grad_norm': 0.16427545249462128, 'learning_rate': 4.912435885752183e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0018022367730736732, 'eval_mse': 0.0018022367730736732, 'eval_mae': 0.031252164393663406, 'eval_r2': 0.46758484840393066, 'eval_runtime': 5.2855, 'eval_samples_per_second': 315.392, 'eval_steps_per_second': 39.542, 'epoch': 1.0}
{'loss': 0.0036, 'grad_norm': 0.2049148678779602, 'learning_rate': 4.637029398082373e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012676978949457407, 'eval_mse': 0.0012676978949457407, 'eval_mae': 0.023733077570796013, 'eval_r2': 0.6254978179931641, 'eval_runtime': 5.3168, 'eval_samples_per_second': 313.534, 'eval_steps_per_second': 39.309, 'epoch': 2.0}
{'loss': 0.0024, 'grad_norm': 0.07722193747758865, 'learning_rate': 4.130136718679198e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007911231950856745, 'eval_mse': 0.0007911231950856745, 'eval_mae': 0.016590911895036697, 'eval_r2': 0.766287088394165, 'eval_runtime': 5.2962, 'eval_samples_per_second': 314.753, 'eval_steps_per_second': 39.462, 'epoch': 3.0}
{'loss': 0.0021, 'grad_norm': 0.08152008801698685, 'learning_rate': 3.446073335574084e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007868007523939013, 'eval_mse': 0.0007868007523939013, 'eval_mae': 0.01714220643043518, 'eval_r2': 0.767564058303833, 'eval_runtime': 5.285, 'eval_samples_per_second': 315.422, 'eval_steps_per_second': 39.546, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.05209238827228546, 'learning_rate': 2.658139254877847e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010351147502660751, 'eval_mse': 0.0010351147502660751, 'eval_mae': 0.0233449749648571, 'eval_r2': 0.6942073106765747, 'eval_runtime': 5.3175, 'eval_samples_per_second': 313.493, 'eval_steps_per_second': 39.304, 'epoch': 5.0}
{'loss': 0.0017, 'grad_norm': 0.08460547775030136, 'learning_rate': 1.8507646249603525e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000671639631036669, 'eval_mse': 0.000671639631036669, 'eval_mae': 0.014646892435848713, 'eval_r2': 0.8015848398208618, 'eval_runtime': 5.3109, 'eval_samples_per_second': 313.881, 'eval_steps_per_second': 39.353, 'epoch': 6.0}
{'loss': 0.0016, 'grad_norm': 0.076807901263237, 'learning_rate': 1.1104627233457343e-05, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006948675727471709, 'eval_mse': 0.0006948675727471709, 'eval_mae': 0.015281766653060913, 'eval_r2': 0.794722855091095, 'eval_runtime': 5.2961, 'eval_samples_per_second': 314.76, 'eval_steps_per_second': 39.463, 'epoch': 7.0}
{'loss': 0.0015, 'grad_norm': 0.0845348984003067, 'learning_rate': 5.165597283566585e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00061286601703614, 'eval_mse': 0.00061286601703614, 'eval_mae': 0.013627169653773308, 'eval_r2': 0.8189476728439331, 'eval_runtime': 5.2935, 'eval_samples_per_second': 314.912, 'eval_steps_per_second': 39.482, 'epoch': 8.0}
{'loss': 0.0015, 'grad_norm': 0.08036831766366959, 'learning_rate': 1.3269461590837658e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006432142690755427, 'eval_mse': 0.0006432142690755427, 'eval_mae': 0.014238669537007809, 'eval_r2': 0.809982180595398, 'eval_runtime': 5.3113, 'eval_samples_per_second': 313.86, 'eval_steps_per_second': 39.35, 'epoch': 9.0}
{'loss': 0.0015, 'grad_norm': 0.25329896807670593, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006354464567266405, 'eval_mse': 0.0006354464567266405, 'eval_mae': 0.014034212566912174, 'eval_r2': 0.8122769594192505, 'eval_runtime': 5.3332, 'eval_samples_per_second': 312.57, 'eval_steps_per_second': 39.188, 'epoch': 10.0}


[I 2025-09-25 18:43:39,931] Trial 3 finished with value: 0.8269466184428893 and parameters: {'learning_rate': 4.953425392128116e-05, 'num_train_epochs': 10, 'per_device_train_batch_size': 64, 'weight_decay': 0.05713568976823746, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.04324611708596774, 'gradient_accumulation_steps': 2}. Best is trial 1 with value: 0.8134330681641586.


{'train_runtime': 781.2659, 'train_samples_per_second': 120.83, 'train_steps_per_second': 0.947, 'train_loss': 0.010990966705454363, 'epoch': 10.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/518 [00:00<?, ?it/s]

{'loss': 0.1094, 'grad_norm': 0.2614833116531372, 'learning_rate': 2.777416410183834e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0020683202892541885, 'eval_mse': 0.0020683202892541885, 'eval_mae': 0.03537445887923241, 'eval_r2': 0.3889787197113037, 'eval_runtime': 5.3016, 'eval_samples_per_second': 314.431, 'eval_steps_per_second': 39.422, 'epoch': 1.0}
{'loss': 0.0049, 'grad_norm': 0.14552251994609833, 'learning_rate': 2.441624744299029e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013242079876363277, 'eval_mse': 0.0013242079876363277, 'eval_mae': 0.02525743842124939, 'eval_r2': 0.6088036894798279, 'eval_runtime': 5.31, 'eval_samples_per_second': 313.935, 'eval_steps_per_second': 39.36, 'epoch': 2.0}
{'loss': 0.0032, 'grad_norm': 0.17495253682136536, 'learning_rate': 1.878828955103441e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010209426982328296, 'eval_mse': 0.0010209426982328296, 'eval_mae': 0.021718353033065796, 'eval_r2': 0.6983940005302429, 'eval_runtime': 5.2976, 'eval_samples_per_second': 314.67, 'eval_steps_per_second': 39.452, 'epoch': 3.0}
{'loss': 0.0027, 'grad_norm': 0.11364882439374924, 'learning_rate': 1.2138538583157173e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00108220090623945, 'eval_mse': 0.0010822007898241282, 'eval_mae': 0.02178477868437767, 'eval_r2': 0.6802972555160522, 'eval_runtime': 5.3299, 'eval_samples_per_second': 312.764, 'eval_steps_per_second': 39.213, 'epoch': 4.0}
{'loss': 0.0023, 'grad_norm': 0.15699665248394012, 'learning_rate': 5.9418704269812715e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008606641204096377, 'eval_mse': 0.0008606641204096377, 'eval_mae': 0.01792788878083229, 'eval_r2': 0.7457433938980103, 'eval_runtime': 5.3296, 'eval_samples_per_second': 312.784, 'eval_steps_per_second': 39.215, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.051980145275592804, 'learning_rate': 1.5726698513939095e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008984950254671276, 'eval_mse': 0.0008984950254671276, 'eval_mae': 0.018573857843875885, 'eval_r2': 0.734567403793335, 'eval_runtime': 5.3165, 'eval_samples_per_second': 313.555, 'eval_steps_per_second': 39.312, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.1477523148059845, 'learning_rate': 0.0, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008423598483204842, 'eval_mse': 0.0008423598483204842, 'eval_mae': 0.017602935433387756, 'eval_r2': 0.7511507868766785, 'eval_runtime': 5.3259, 'eval_samples_per_second': 312.999, 'eval_steps_per_second': 39.242, 'epoch': 7.0}


[I 2025-09-25 18:52:47,808] Trial 4 finished with value: 0.7695960821583867 and parameters: {'learning_rate': 2.836269262850202e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 64, 'weight_decay': 0.027205054637907, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.05439758905893752, 'gradient_accumulation_steps': 2}. Best is trial 4 with value: 0.7695960821583867.


{'train_runtime': 547.3564, 'train_samples_per_second': 120.726, 'train_steps_per_second': 0.946, 'train_loss': 0.01813567671421412, 'epoch': 7.0}
Best run for balanced: BestRun(run_id='4', objective=0.7695960821583867, hyperparameters={'learning_rate': 2.836269262850202e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 64, 'weight_decay': 0.027205054637907, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.05439758905893752, 'gradient_accumulation_steps': 2}, run_summary=None)
--- Training final model for balanced with best hyperparameters ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/518 [00:00<?, ?it/s]

{'loss': 0.1094, 'grad_norm': 0.2632204294204712, 'learning_rate': 2.777416410183834e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.002069980837404728, 'eval_mse': 0.002069980837404728, 'eval_mae': 0.03537990152835846, 'eval_r2': 0.38848811388015747, 'eval_runtime': 5.3065, 'eval_samples_per_second': 314.144, 'eval_steps_per_second': 39.386, 'epoch': 1.0}
{'loss': 0.0049, 'grad_norm': 0.14478053152561188, 'learning_rate': 2.441624744299029e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013237829552963376, 'eval_mse': 0.001323782722465694, 'eval_mae': 0.02522066794335842, 'eval_r2': 0.6089292764663696, 'eval_runtime': 5.2981, 'eval_samples_per_second': 314.638, 'eval_steps_per_second': 39.448, 'epoch': 2.0}
{'loss': 0.0033, 'grad_norm': 0.17443734407424927, 'learning_rate': 1.878828955103441e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010195095092058182, 'eval_mse': 0.0010195095092058182, 'eval_mae': 0.021587034687399864, 'eval_r2': 0.6988174319267273, 'eval_runtime': 5.308, 'eval_samples_per_second': 314.054, 'eval_steps_per_second': 39.374, 'epoch': 3.0}
{'loss': 0.0027, 'grad_norm': 0.11671292781829834, 'learning_rate': 1.2138538583157173e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011133585358038545, 'eval_mse': 0.0011133585358038545, 'eval_mae': 0.022300541400909424, 'eval_r2': 0.6710926294326782, 'eval_runtime': 5.3004, 'eval_samples_per_second': 314.505, 'eval_steps_per_second': 39.431, 'epoch': 4.0}
{'loss': 0.0023, 'grad_norm': 0.15322938561439514, 'learning_rate': 5.9418704269812715e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008635074482299387, 'eval_mse': 0.0008635074482299387, 'eval_mae': 0.017966674640774727, 'eval_r2': 0.7449034452438354, 'eval_runtime': 5.2919, 'eval_samples_per_second': 315.012, 'eval_steps_per_second': 39.495, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.05130746215581894, 'learning_rate': 1.5726698513939095e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000896819110494107, 'eval_mse': 0.000896819110494107, 'eval_mae': 0.01853838376700878, 'eval_r2': 0.7350624799728394, 'eval_runtime': 5.2963, 'eval_samples_per_second': 314.748, 'eval_steps_per_second': 39.461, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.15996386110782623, 'learning_rate': 0.0, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008399349171668291, 'eval_mse': 0.0008399349171668291, 'eval_mae': 0.01758568175137043, 'eval_r2': 0.7518671751022339, 'eval_runtime': 5.344, 'eval_samples_per_second': 311.937, 'eval_steps_per_second': 39.109, 'epoch': 7.0}
{'train_runtime': 547.8809, 'train_samples_per_second': 120.61, 'train_steps_per_second': 0.945, 'train_loss': 0.01814060698489885, 'epoch': 7.0}
--- Saved final optimized model to rm_out_balanced_final ---

--- Evaluating final model for balanced ---


  0%|          | 0/209 [00:00<?, ?it/s]

Calculating stable scores (train) (num_proc=12):   0%|          | 0/9440 [00:00<?, ? examples/s]


--- Summary for Configuration: balanced ---
  Best Hyperparameters Found:
    - learning_rate: 2.836269262850202e-05
    - num_train_epochs: 7
    - per_device_train_batch_size: 64
    - weight_decay: 0.027205054637907
    - lr_scheduler_type: cosine
    - warmup_ratio: 0.05439758905893752
    - gradient_accumulation_steps: 2
  Final Evaluation Metrics:
    - eval_loss: 0.0008
    - eval_mse: 0.0008
    - eval_mae: 0.0176
    - eval_r2: 0.7519
    - eval_runtime: 5.3628
    - eval_samples_per_second: 310.8460
    - eval_steps_per_second: 38.9720
--------------------------------------------------

--- Processing configuration: rules_heavy ---
Weights: {'rules_score': 0.7, 'meaning_score': 0.2, 'grammar_score': 0.1}
--- Calculating rewards for the TRAINING set ---


Calculating meaning scores (train):   0%|          | 0/9440 [00:00<?, ?it/s]

--- Calculating rewards for the TEST set ---


Calculating stable scores (test) (num_proc=12):   0%|          | 0/1667 [00:00<?, ? examples/s]

Calculating meaning scores (test):   0%|          | 0/1667 [00:00<?, ?it/s]

--- Finished calculating rewards for rules_heavy ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-09-25 19:08:21,385] A new study created in memory with name: no-name-81515389-26c7-487f-9b6a-eebd7a9a500e
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Running hyperparameter search for rules_heavy ---


  0%|          | 0/814 [00:00<?, ?it/s]

{'loss': 0.1536, 'grad_norm': 0.28579503297805786, 'learning_rate': 2.20719094085351e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.003751960815861821, 'eval_mse': 0.003751960350200534, 'eval_mae': 0.046032268553972244, 'eval_r2': 0.43932241201400757, 'eval_runtime': 5.3002, 'eval_samples_per_second': 314.518, 'eval_steps_per_second': 39.433, 'epoch': 1.0}
{'loss': 0.0066, 'grad_norm': 0.10331618040800095, 'learning_rate': 2.13929012879357e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.002174922963604331, 'eval_mse': 0.002174922963604331, 'eval_mae': 0.03283417597413063, 'eval_r2': 0.6749884486198425, 'eval_runtime': 5.305, 'eval_samples_per_second': 314.234, 'eval_steps_per_second': 39.397, 'epoch': 2.0}
{'loss': 0.0041, 'grad_norm': 0.20010650157928467, 'learning_rate': 1.972988411741449e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0015696798218414187, 'eval_mse': 0.001569679589010775, 'eval_mae': 0.02414356917142868, 'eval_r2': 0.7654335498809814, 'eval_runtime': 5.295, 'eval_samples_per_second': 314.825, 'eval_steps_per_second': 39.471, 'epoch': 3.0}
{'loss': 0.0032, 'grad_norm': 0.15967579185962677, 'learning_rate': 1.7240949648935617e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013602880062535405, 'eval_mse': 0.0013602880062535405, 'eval_mae': 0.02142287790775299, 'eval_r2': 0.7967241406440735, 'eval_runtime': 5.2959, 'eval_samples_per_second': 314.77, 'eval_steps_per_second': 39.464, 'epoch': 4.0}
{'loss': 0.0027, 'grad_norm': 0.1269015520811081, 'learning_rate': 1.4162703984721415e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013624172424897552, 'eval_mse': 0.0013624172424897552, 'eval_mae': 0.021831372752785683, 'eval_r2': 0.7964059710502625, 'eval_runtime': 5.283, 'eval_samples_per_second': 315.538, 'eval_steps_per_second': 39.561, 'epoch': 5.0}
{'loss': 0.0025, 'grad_norm': 0.07788901776075363, 'learning_rate': 1.0787775041463305e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011732089333236217, 'eval_mse': 0.0011732089333236217, 'eval_mae': 0.018943114206194878, 'eval_r2': 0.8246804475784302, 'eval_runtime': 5.3015, 'eval_samples_per_second': 314.439, 'eval_steps_per_second': 39.423, 'epoch': 6.0}
{'loss': 0.0023, 'grad_norm': 0.24970075488090515, 'learning_rate': 7.436994400970515e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011458260705694556, 'eval_mse': 0.0011458260705694556, 'eval_mae': 0.017831217497587204, 'eval_r2': 0.8287724256515503, 'eval_runtime': 5.2694, 'eval_samples_per_second': 316.357, 'eval_steps_per_second': 39.663, 'epoch': 7.0}
{'loss': 0.0022, 'grad_norm': 0.07331538945436478, 'learning_rate': 4.428898029854315e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010993588948622346, 'eval_mse': 0.0010993588948622346, 'eval_mae': 0.017586683854460716, 'eval_r2': 0.8357163071632385, 'eval_runtime': 5.2807, 'eval_samples_per_second': 315.676, 'eval_steps_per_second': 39.578, 'epoch': 8.0}
{'loss': 0.0022, 'grad_norm': 0.06638889759778976, 'learning_rate': 2.0494452277638688e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010941608343273401, 'eval_mse': 0.001094160950742662, 'eval_mae': 0.01716623082756996, 'eval_r2': 0.8364930748939514, 'eval_runtime': 5.2866, 'eval_samples_per_second': 315.326, 'eval_steps_per_second': 39.534, 'epoch': 9.0}
{'loss': 0.0022, 'grad_norm': 0.29027488827705383, 'learning_rate': 5.248344181830554e-07, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010961894877254963, 'eval_mse': 0.0010961894877254963, 'eval_mae': 0.017196431756019592, 'eval_r2': 0.8361899256706238, 'eval_runtime': 5.2786, 'eval_samples_per_second': 315.805, 'eval_steps_per_second': 39.594, 'epoch': 10.0}
{'loss': 0.0021, 'grad_norm': 0.2116180807352066, 'learning_rate': 0.0, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00109425641130656, 'eval_mse': 0.00109425641130656, 'eval_mae': 0.017202870920300484, 'eval_r2': 0.8364788293838501, 'eval_runtime': 5.2607, 'eval_samples_per_second': 316.877, 'eval_steps_per_second': 39.728, 'epoch': 11.0}


[I 2025-09-25 19:22:55,294] Trial 0 finished with value: 0.8547759567154571 and parameters: {'learning_rate': 2.2083597361026016e-05, 'num_train_epochs': 11, 'per_device_train_batch_size': 64, 'weight_decay': 0.06649463153110868, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.07706148991353866, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 0.8547759567154571.


{'train_runtime': 873.52, 'train_samples_per_second': 118.875, 'train_steps_per_second': 0.932, 'train_loss': 0.016693900401088473, 'epoch': 11.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2220 [00:00<?, ?it/s]

{'loss': 0.0548, 'grad_norm': 0.24694494903087616, 'learning_rate': 3.394208670974157e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0020158174447715282, 'eval_mse': 0.0020158172119408846, 'eval_mae': 0.028002778068184853, 'eval_r2': 0.6987645626068115, 'eval_runtime': 5.3036, 'eval_samples_per_second': 314.315, 'eval_steps_per_second': 39.407, 'epoch': 1.0}
{'loss': 0.0034, 'grad_norm': 0.0988537147641182, 'learning_rate': 3.29867779784276e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013074293965473771, 'eval_mse': 0.0013074293965473771, 'eval_mae': 0.022129349410533905, 'eval_r2': 0.8046231269836426, 'eval_runtime': 5.3158, 'eval_samples_per_second': 313.595, 'eval_steps_per_second': 39.317, 'epoch': 2.0}
{'loss': 0.0027, 'grad_norm': 0.20726193487644196, 'learning_rate': 3.131258033664018e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012801132397726178, 'eval_mse': 0.0012801132397726178, 'eval_mae': 0.020918749272823334, 'eval_r2': 0.8087050914764404, 'eval_runtime': 5.2954, 'eval_samples_per_second': 314.804, 'eval_steps_per_second': 39.468, 'epoch': 3.0}
{'loss': 0.0022, 'grad_norm': 0.10984402894973755, 'learning_rate': 2.8995156167337945e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.001025106175802648, 'eval_mse': 0.001025106175802648, 'eval_mae': 0.01612180285155773, 'eval_r2': 0.84681236743927, 'eval_runtime': 5.2924, 'eval_samples_per_second': 314.98, 'eval_steps_per_second': 39.491, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.07070111483335495, 'learning_rate': 2.6139237331681643e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011382934171706438, 'eval_mse': 0.0011382934171706438, 'eval_mae': 0.01711261458694935, 'eval_r2': 0.8298981189727783, 'eval_runtime': 5.2833, 'eval_samples_per_second': 315.523, 'eval_steps_per_second': 39.559, 'epoch': 5.0}
{'loss': 0.0017, 'grad_norm': 0.20923982560634613, 'learning_rate': 2.287389199895355e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010137328645214438, 'eval_mse': 0.0010137328645214438, 'eval_mae': 0.01490267738699913, 'eval_r2': 0.8485119342803955, 'eval_runtime': 5.2976, 'eval_samples_per_second': 314.67, 'eval_steps_per_second': 39.452, 'epoch': 6.0}
{'loss': 0.0016, 'grad_norm': 0.3302299976348877, 'learning_rate': 1.934669164034109e-05, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010298023698851466, 'eval_mse': 0.0010298023698851466, 'eval_mae': 0.014413995668292046, 'eval_r2': 0.8461105823516846, 'eval_runtime': 5.3183, 'eval_samples_per_second': 313.446, 'eval_steps_per_second': 39.298, 'epoch': 7.0}
{'loss': 0.0014, 'grad_norm': 0.2813292145729065, 'learning_rate': 1.5717041798919935e-05, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011354273883625865, 'eval_mse': 0.0011354273883625865, 'eval_mae': 0.02126746065914631, 'eval_r2': 0.8303263783454895, 'eval_runtime': 5.3083, 'eval_samples_per_second': 314.036, 'eval_steps_per_second': 39.372, 'epoch': 8.0}
{'loss': 0.0014, 'grad_norm': 0.08501565456390381, 'learning_rate': 1.2148978039818578e-05, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010492784203961492, 'eval_mse': 0.0010492784203961492, 'eval_mae': 0.015937773510813713, 'eval_r2': 0.843200147151947, 'eval_runtime': 5.298, 'eval_samples_per_second': 314.647, 'eval_steps_per_second': 39.449, 'epoch': 9.0}
{'loss': 0.0013, 'grad_norm': 0.10136645287275314, 'learning_rate': 8.80375265477582e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010258038528263569, 'eval_mse': 0.001025803736411035, 'eval_mae': 0.014520353637635708, 'eval_r2': 0.8467081189155579, 'eval_runtime': 5.2992, 'eval_samples_per_second': 314.574, 'eval_steps_per_second': 39.44, 'epoch': 10.0}
{'loss': 0.0012, 'grad_norm': 0.2243981957435608, 'learning_rate': 5.832547151786052e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009919609874486923, 'eval_mse': 0.0009919608710333705, 'eval_mae': 0.013633211143314838, 'eval_r2': 0.8517654538154602, 'eval_runtime': 5.3239, 'eval_samples_per_second': 313.118, 'eval_steps_per_second': 39.257, 'epoch': 11.0}
{'loss': 0.0012, 'grad_norm': 0.10110381245613098, 'learning_rate': 3.3696398758934074e-06, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009926415514200926, 'eval_mse': 0.0009926415514200926, 'eval_mae': 0.0136531637981534, 'eval_r2': 0.8516637086868286, 'eval_runtime': 5.2827, 'eval_samples_per_second': 315.558, 'eval_steps_per_second': 39.563, 'epoch': 12.0}
{'loss': 0.0012, 'grad_norm': 0.17854227125644684, 'learning_rate': 1.526337538354053e-06, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009944376070052385, 'eval_mse': 0.0009944376070052385, 'eval_mae': 0.013609201647341251, 'eval_r2': 0.851395308971405, 'eval_runtime': 5.3213, 'eval_samples_per_second': 313.266, 'eval_steps_per_second': 39.276, 'epoch': 13.0}
{'loss': 0.0011, 'grad_norm': 0.08029761165380478, 'learning_rate': 3.8594490790638294e-07, 'epoch': 14.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009834358934313059, 'eval_mse': 0.0009834358934313059, 'eval_mae': 0.013395581394433975, 'eval_r2': 0.8530393838882446, 'eval_runtime': 5.2911, 'eval_samples_per_second': 315.057, 'eval_steps_per_second': 39.5, 'epoch': 14.0}
{'loss': 0.0011, 'grad_norm': 0.059889305382966995, 'learning_rate': 0.0, 'epoch': 15.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009861935395747423, 'eval_mse': 0.0009861935395747423, 'eval_mae': 0.013454502448439598, 'eval_r2': 0.8526272773742676, 'eval_runtime': 5.3317, 'eval_samples_per_second': 312.66, 'eval_steps_per_second': 39.2, 'epoch': 15.0}


[I 2025-09-25 19:42:54,140] Trial 1 finished with value: 0.8670679733622819 and parameters: {'learning_rate': 3.415954028463452e-05, 'num_train_epochs': 15, 'per_device_train_batch_size': 64, 'weight_decay': 0.08147242520333642, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.016219180833170754, 'gradient_accumulation_steps': 1}. Best is trial 0 with value: 0.8547759567154571.


{'train_runtime': 1198.4541, 'train_samples_per_second': 118.152, 'train_steps_per_second': 1.852, 'train_loss': 0.005203298788081418, 'epoch': 15.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1029 [00:00<?, ?it/s]

{'loss': 0.0582, 'grad_norm': 0.11034420877695084, 'learning_rate': 1.7382345472754486e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0026010912843048573, 'eval_mse': 0.0026010912843048573, 'eval_mae': 0.03536590188741684, 'eval_r2': 0.6113035678863525, 'eval_runtime': 5.2775, 'eval_samples_per_second': 315.87, 'eval_steps_per_second': 39.602, 'epoch': 1.0}
{'loss': 0.0044, 'grad_norm': 0.37429913878440857, 'learning_rate': 1.446558001927641e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0016729847993701696, 'eval_mse': 0.0016729847993701696, 'eval_mae': 0.024409500882029533, 'eval_r2': 0.7499960660934448, 'eval_runtime': 5.2893, 'eval_samples_per_second': 315.167, 'eval_steps_per_second': 39.514, 'epoch': 2.0}
{'loss': 0.0032, 'grad_norm': 0.16739600896835327, 'learning_rate': 1.1568522440483995e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013387924991548061, 'eval_mse': 0.0013387924991548061, 'eval_mae': 0.020135287195444107, 'eval_r2': 0.7999363541603088, 'eval_runtime': 5.2624, 'eval_samples_per_second': 316.776, 'eval_steps_per_second': 39.716, 'epoch': 3.0}
{'loss': 0.0027, 'grad_norm': 0.11872798949480057, 'learning_rate': 8.651756987005918e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011947947787120938, 'eval_mse': 0.0011947947787120938, 'eval_mae': 0.01845451258122921, 'eval_r2': 0.8214547634124756, 'eval_runtime': 5.2687, 'eval_samples_per_second': 316.398, 'eval_steps_per_second': 39.668, 'epoch': 4.0}
{'loss': 0.0024, 'grad_norm': 0.1032513827085495, 'learning_rate': 5.754699408213504e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012483294121921062, 'eval_mse': 0.0012483294121921062, 'eval_mae': 0.01945149526000023, 'eval_r2': 0.8134547472000122, 'eval_runtime': 5.2698, 'eval_samples_per_second': 316.332, 'eval_steps_per_second': 39.66, 'epoch': 5.0}
{'loss': 0.0023, 'grad_norm': 0.1887279748916626, 'learning_rate': 2.8379339547354263e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011191894300282001, 'eval_mse': 0.001119189546443522, 'eval_mae': 0.017621541395783424, 'eval_r2': 0.8327529430389404, 'eval_runtime': 5.2763, 'eval_samples_per_second': 315.94, 'eval_steps_per_second': 39.611, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.05920027568936348, 'learning_rate': 0.0, 'epoch': 6.98}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011238922597840428, 'eval_mse': 0.0011238922597840428, 'eval_mae': 0.016997355967760086, 'eval_r2': 0.8320501446723938, 'eval_runtime': 5.2905, 'eval_samples_per_second': 315.09, 'eval_steps_per_second': 39.504, 'epoch': 6.98}


[I 2025-09-25 19:52:08,977] Trial 2 finished with value: 0.8501713928999379 and parameters: {'learning_rate': 1.982612193377666e-05, 'num_train_epochs': 7, 'per_device_train_batch_size': 32, 'weight_decay': 0.07215714672134213, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.022077796328367674, 'gradient_accumulation_steps': 2}. Best is trial 2 with value: 0.8501713928999379.


{'train_runtime': 554.4361, 'train_samples_per_second': 119.184, 'train_steps_per_second': 1.856, 'train_loss': 0.010759930047627441, 'epoch': 6.98}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1176 [00:00<?, ?it/s]

{'loss': 0.0884, 'grad_norm': 0.13559874892234802, 'learning_rate': 1.4680448191119189e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.002950018970295787, 'eval_mse': 0.002950018737465143, 'eval_mae': 0.040197160094976425, 'eval_r2': 0.5591613054275513, 'eval_runtime': 5.2542, 'eval_samples_per_second': 317.268, 'eval_steps_per_second': 39.777, 'epoch': 1.0}
{'loss': 0.005, 'grad_norm': 0.45815736055374146, 'learning_rate': 1.3284306223109533e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0018223695224151015, 'eval_mse': 0.001822369173169136, 'eval_mae': 0.026543010026216507, 'eval_r2': 0.7276726365089417, 'eval_runtime': 5.2848, 'eval_samples_per_second': 315.43, 'eval_steps_per_second': 39.547, 'epoch': 2.0}
{'loss': 0.0034, 'grad_norm': 0.1431235373020172, 'learning_rate': 1.092607069041651e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0014210528461262584, 'eval_mse': 0.0014210528461262584, 'eval_mae': 0.022134067490696907, 'eval_r2': 0.7876436710357666, 'eval_runtime': 5.2682, 'eval_samples_per_second': 316.43, 'eval_steps_per_second': 39.672, 'epoch': 3.0}
{'loss': 0.0028, 'grad_norm': 0.12393006682395935, 'learning_rate': 7.971928904397898e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012786713195964694, 'eval_mse': 0.0012786713195964694, 'eval_mae': 0.02028556726872921, 'eval_r2': 0.8089206218719482, 'eval_runtime': 5.2664, 'eval_samples_per_second': 316.536, 'eval_steps_per_second': 39.686, 'epoch': 4.0}
{'loss': 0.0026, 'grad_norm': 0.18403507769107819, 'learning_rate': 4.952959071863967e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.001276837196201086, 'eval_mse': 0.001276837196201086, 'eval_mae': 0.019741931930184364, 'eval_r2': 0.8091946840286255, 'eval_runtime': 5.2629, 'eval_samples_per_second': 316.745, 'eval_steps_per_second': 39.712, 'epoch': 5.0}
{'loss': 0.0024, 'grad_norm': 0.20920684933662415, 'learning_rate': 2.3379474657377515e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011848650174215436, 'eval_mse': 0.0011848650174215436, 'eval_mae': 0.019294779747724533, 'eval_r2': 0.8229386210441589, 'eval_runtime': 5.2748, 'eval_samples_per_second': 316.029, 'eval_steps_per_second': 39.622, 'epoch': 6.0}
{'loss': 0.0023, 'grad_norm': 0.14138393104076385, 'learning_rate': 5.970054669767861e-07, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011694986606016755, 'eval_mse': 0.0011694986606016755, 'eval_mae': 0.018167169764637947, 'eval_r2': 0.8252348899841309, 'eval_runtime': 5.2682, 'eval_samples_per_second': 316.429, 'eval_steps_per_second': 39.672, 'epoch': 7.0}
{'loss': 0.0024, 'grad_norm': 0.10306233167648315, 'learning_rate': 0.0, 'epoch': 7.97}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.001172416377812624, 'eval_mse': 0.001172416377812624, 'eval_mae': 0.01818801276385784, 'eval_r2': 0.8247988820075989, 'eval_runtime': 5.2986, 'eval_samples_per_second': 314.612, 'eval_steps_per_second': 39.444, 'epoch': 7.97}


[I 2025-09-25 20:02:42,143] Trial 3 finished with value: 0.8441593111492693 and parameters: {'learning_rate': 1.4942277347418218e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 32, 'weight_decay': 0.05285585376949559, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.04364105576407306, 'gradient_accumulation_steps': 2}. Best is trial 3 with value: 0.8441593111492693.


{'train_runtime': 632.7806, 'train_samples_per_second': 119.346, 'train_steps_per_second': 1.858, 'train_loss': 0.013662861672793927, 'epoch': 7.97}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1770 [00:00<?, ?it/s]

{'loss': 0.0349, 'grad_norm': 0.1251106709241867, 'learning_rate': 3.901836671573788e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0014011216117069125, 'eval_mse': 0.0014011216117069125, 'eval_mae': 0.020261388272047043, 'eval_r2': 0.7906221151351929, 'eval_runtime': 5.2607, 'eval_samples_per_second': 316.875, 'eval_steps_per_second': 39.728, 'epoch': 1.0}
{'loss': 0.0027, 'grad_norm': 0.17823658883571625, 'learning_rate': 3.1214693372590304e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0014773097354918718, 'eval_mse': 0.0014773097354918718, 'eval_mae': 0.02693716250360012, 'eval_r2': 0.779236912727356, 'eval_runtime': 5.2747, 'eval_samples_per_second': 316.04, 'eval_steps_per_second': 39.623, 'epoch': 2.0}
{'loss': 0.0022, 'grad_norm': 0.16598300635814667, 'learning_rate': 2.3411020029442728e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011089897016063333, 'eval_mse': 0.0011089895851910114, 'eval_mae': 0.020937480032444, 'eval_r2': 0.8342771530151367, 'eval_runtime': 5.2758, 'eval_samples_per_second': 315.97, 'eval_steps_per_second': 39.615, 'epoch': 3.0}
{'loss': 0.0019, 'grad_norm': 0.24510711431503296, 'learning_rate': 1.5607346686295152e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00105118821375072, 'eval_mse': 0.00105118821375072, 'eval_mae': 0.016077468171715736, 'eval_r2': 0.8429147601127625, 'eval_runtime': 5.2979, 'eval_samples_per_second': 314.652, 'eval_steps_per_second': 39.449, 'epoch': 4.0}
{'loss': 0.0016, 'grad_norm': 0.06832709908485413, 'learning_rate': 7.803673343147576e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010528945131227374, 'eval_mse': 0.0010528945131227374, 'eval_mae': 0.01635374315083027, 'eval_r2': 0.8426597714424133, 'eval_runtime': 5.2918, 'eval_samples_per_second': 315.016, 'eval_steps_per_second': 39.495, 'epoch': 5.0}
{'loss': 0.0015, 'grad_norm': 0.06749086081981659, 'learning_rate': 0.0, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010023904033005238, 'eval_mse': 0.0010023904033005238, 'eval_mae': 0.014997968450188637, 'eval_r2': 0.8502069115638733, 'eval_runtime': 5.2783, 'eval_samples_per_second': 315.82, 'eval_steps_per_second': 39.596, 'epoch': 6.0}


[I 2025-09-25 20:11:02,542] Trial 4 finished with value: 0.8662072704173625 and parameters: {'learning_rate': 4.489096157058114e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 32, 'weight_decay': 0.05643683973851996, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.04117425767057934, 'gradient_accumulation_steps': 1}. Best is trial 3 with value: 0.8441593111492693.


{'train_runtime': 499.9864, 'train_samples_per_second': 113.283, 'train_steps_per_second': 3.54, 'train_loss': 0.007459368493597386, 'epoch': 6.0}
Best run for rules_heavy: BestRun(run_id='3', objective=0.8441593111492693, hyperparameters={'learning_rate': 1.4942277347418218e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 32, 'weight_decay': 0.05285585376949559, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.04364105576407306, 'gradient_accumulation_steps': 2}, run_summary=None)
--- Training final model for rules_heavy with best hyperparameters ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1176 [00:00<?, ?it/s]

{'loss': 0.0879, 'grad_norm': 0.5165202021598816, 'learning_rate': 1.4680448191119189e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.002643800340592861, 'eval_mse': 0.002643800340592861, 'eval_mae': 0.03370248153805733, 'eval_r2': 0.6049213409423828, 'eval_runtime': 5.3425, 'eval_samples_per_second': 312.026, 'eval_steps_per_second': 39.12, 'epoch': 1.0}
{'loss': 0.0043, 'grad_norm': 0.7089717984199524, 'learning_rate': 1.3284306223109533e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0016049211844801903, 'eval_mse': 0.0016049211844801903, 'eval_mae': 0.024388771504163742, 'eval_r2': 0.7601671814918518, 'eval_runtime': 5.3008, 'eval_samples_per_second': 314.48, 'eval_steps_per_second': 39.428, 'epoch': 2.0}
{'loss': 0.0031, 'grad_norm': 0.19994042813777924, 'learning_rate': 1.092607069041651e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0013208257732912898, 'eval_mse': 0.0013208257732912898, 'eval_mae': 0.020841119810938835, 'eval_r2': 0.8026212453842163, 'eval_runtime': 5.3136, 'eval_samples_per_second': 313.722, 'eval_steps_per_second': 39.333, 'epoch': 3.0}
{'loss': 0.0026, 'grad_norm': 0.24008867144584656, 'learning_rate': 7.971928904397898e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012167225359007716, 'eval_mse': 0.0012167225359007716, 'eval_mae': 0.019266735762357712, 'eval_r2': 0.8181779980659485, 'eval_runtime': 5.3023, 'eval_samples_per_second': 314.391, 'eval_steps_per_second': 39.417, 'epoch': 4.0}
{'loss': 0.0024, 'grad_norm': 0.26535564661026, 'learning_rate': 4.952959071863967e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0012195168528705835, 'eval_mse': 0.0012195168528705835, 'eval_mae': 0.018907934427261353, 'eval_r2': 0.8177604079246521, 'eval_runtime': 5.2759, 'eval_samples_per_second': 315.963, 'eval_steps_per_second': 39.614, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.4213685393333435, 'learning_rate': 2.3379474657377515e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.001136871986091137, 'eval_mse': 0.001136871986091137, 'eval_mae': 0.018654359504580498, 'eval_r2': 0.830110490322113, 'eval_runtime': 5.2929, 'eval_samples_per_second': 314.95, 'eval_steps_per_second': 39.487, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.2464292198419571, 'learning_rate': 5.970054669767861e-07, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011179663706570864, 'eval_mse': 0.0011179664870724082, 'eval_mae': 0.017293641343712807, 'eval_r2': 0.8329356908798218, 'eval_runtime': 5.244, 'eval_samples_per_second': 317.886, 'eval_steps_per_second': 39.855, 'epoch': 7.0}
{'loss': 0.0022, 'grad_norm': 0.19494372606277466, 'learning_rate': 0.0, 'epoch': 7.97}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011215497506782413, 'eval_mse': 0.0011215497506782413, 'eval_mae': 0.017303425818681717, 'eval_r2': 0.8324002027511597, 'eval_runtime': 5.2763, 'eval_samples_per_second': 315.94, 'eval_steps_per_second': 39.611, 'epoch': 7.97}
{'train_runtime': 649.4566, 'train_samples_per_second': 116.282, 'train_steps_per_second': 1.811, 'train_loss': 0.013361282317208595, 'epoch': 7.97}
--- Saved final optimized model to rm_out_rules_heavy_final ---

--- Evaluating final model for rules_heavy ---


  0%|          | 0/209 [00:00<?, ?it/s]


--- Summary for Configuration: rules_heavy ---
  Best Hyperparameters Found:
    - learning_rate: 1.4942277347418218e-05
    - num_train_epochs: 8
    - per_device_train_batch_size: 32
    - weight_decay: 0.05285585376949559
    - lr_scheduler_type: cosine
    - warmup_ratio: 0.04364105576407306
    - gradient_accumulation_steps: 2
  Final Evaluation Metrics:
    - eval_loss: 0.0011
    - eval_mse: 0.0011
    - eval_mae: 0.0173
    - eval_r2: 0.8329
    - eval_runtime: 5.2897
    - eval_samples_per_second: 315.1380
    - eval_steps_per_second: 39.5100
--------------------------------------------------

--- Processing configuration: meaning_heavy ---
Weights: {'rules_score': 0.2, 'meaning_score': 0.7, 'grammar_score': 0.1}
--- Calculating rewards for the TRAINING set ---


Calculating stable scores (train) (num_proc=12):   0%|          | 0/9440 [00:00<?, ? examples/s]

Calculating meaning scores (train):   0%|          | 0/9440 [00:00<?, ?it/s]

--- Calculating rewards for the TEST set ---


Calculating stable scores (test) (num_proc=12):   0%|          | 0/1667 [00:00<?, ? examples/s]

Calculating meaning scores (test):   0%|          | 0/1667 [00:00<?, ?it/s]

--- Finished calculating rewards for meaning_heavy ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-09-25 20:28:18,284] A new study created in memory with name: no-name-3526fefc-ba53-47ea-b078-42f7d9b9eafd
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Running hyperparameter search for meaning_heavy ---


  0%|          | 0/1036 [00:00<?, ?it/s]

{'loss': 0.2159, 'grad_norm': 0.5377567410469055, 'learning_rate': 1.665325476992091e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0017088181339204311, 'eval_mse': 0.0017088181339204311, 'eval_mae': 0.028786707669496536, 'eval_r2': -0.13376319408416748, 'eval_runtime': 5.331, 'eval_samples_per_second': 312.698, 'eval_steps_per_second': 39.205, 'epoch': 1.0}
{'loss': 0.0057, 'grad_norm': 0.1065053790807724, 'learning_rate': 1.6364077587323845e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010533261811360717, 'eval_mse': 0.0010533261811360717, 'eval_mae': 0.021835265681147575, 'eval_r2': 0.30114132165908813, 'eval_runtime': 5.3109, 'eval_samples_per_second': 313.885, 'eval_steps_per_second': 39.353, 'epoch': 2.0}
{'loss': 0.0037, 'grad_norm': 0.1738842874765396, 'learning_rate': 1.5615509237119018e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008844033582136035, 'eval_mse': 0.0008844033000059426, 'eval_mae': 0.020397860556840897, 'eval_r2': 0.4132179021835327, 'eval_runtime': 5.3054, 'eval_samples_per_second': 314.209, 'eval_steps_per_second': 39.394, 'epoch': 3.0}
{'loss': 0.0028, 'grad_norm': 0.0840199738740921, 'learning_rate': 1.4450342581297061e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007936206529848278, 'eval_mse': 0.0007936206529848278, 'eval_mae': 0.019125068560242653, 'eval_r2': 0.4734501838684082, 'eval_runtime': 5.2868, 'eval_samples_per_second': 315.313, 'eval_steps_per_second': 39.532, 'epoch': 4.0}
{'loss': 0.0024, 'grad_norm': 0.2632709741592407, 'learning_rate': 1.2935185853942451e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007783371838741004, 'eval_mse': 0.0007783371838741004, 'eval_mae': 0.01764771342277527, 'eval_r2': 0.48359042406082153, 'eval_runtime': 5.3045, 'eval_samples_per_second': 314.261, 'eval_steps_per_second': 39.4, 'epoch': 5.0}
{'loss': 0.0021, 'grad_norm': 0.11598999798297882, 'learning_rate': 1.115665491679997e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007224938017316163, 'eval_mse': 0.0007224938017316163, 'eval_mae': 0.01632089912891388, 'eval_r2': 0.5206412672996521, 'eval_runtime': 5.2788, 'eval_samples_per_second': 315.789, 'eval_steps_per_second': 39.592, 'epoch': 6.0}
{'loss': 0.0021, 'grad_norm': 0.23771485686302185, 'learning_rate': 9.216421753354333e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006902972236275673, 'eval_mse': 0.0006902972818352282, 'eval_mae': 0.016589635983109474, 'eval_r2': 0.5420029163360596, 'eval_runtime': 5.2911, 'eval_samples_per_second': 315.057, 'eval_steps_per_second': 39.5, 'epoch': 7.0}
{'loss': 0.0019, 'grad_norm': 0.07760289311408997, 'learning_rate': 7.225402260557717e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000697655079420656, 'eval_mse': 0.000697655079420656, 'eval_mae': 0.01597188599407673, 'eval_r2': 0.5371211767196655, 'eval_runtime': 5.3148, 'eval_samples_per_second': 313.652, 'eval_steps_per_second': 39.324, 'epoch': 8.0}
{'loss': 0.0019, 'grad_norm': 0.12413498759269714, 'learning_rate': 5.297415600484737e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008102816063910723, 'eval_mse': 0.0008102816063910723, 'eval_mae': 0.01922612451016903, 'eval_r2': 0.46239596605300903, 'eval_runtime': 5.3222, 'eval_samples_per_second': 313.214, 'eval_steps_per_second': 39.269, 'epoch': 9.0}
{'loss': 0.0018, 'grad_norm': 0.2685449421405792, 'learning_rate': 3.5426775831435425e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006463936879299581, 'eval_mse': 0.0006463936879299581, 'eval_mae': 0.015619346871972084, 'eval_r2': 0.5711320638656616, 'eval_runtime': 5.3086, 'eval_samples_per_second': 314.017, 'eval_steps_per_second': 39.37, 'epoch': 10.0}
{'loss': 0.0018, 'grad_norm': 0.18375660479068756, 'learning_rate': 2.061500039491513e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00062382040778175, 'eval_mse': 0.00062382040778175, 'eval_mae': 0.015169139951467514, 'eval_r2': 0.586108922958374, 'eval_runtime': 5.2825, 'eval_samples_per_second': 315.573, 'eval_steps_per_second': 39.565, 'epoch': 11.0}
{'loss': 0.0017, 'grad_norm': 0.0785273090004921, 'learning_rate': 9.385563680122772e-07, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006357321981340647, 'eval_mse': 0.0006357322563417256, 'eval_mae': 0.015367044135928154, 'eval_r2': 0.5782056450843811, 'eval_runtime': 5.2908, 'eval_samples_per_second': 315.077, 'eval_steps_per_second': 39.503, 'epoch': 12.0}
{'loss': 0.0017, 'grad_norm': 0.1009344831109047, 'learning_rate': 2.380410721786453e-07, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000632758135907352, 'eval_mse': 0.000632758135907352, 'eval_mae': 0.01530089508742094, 'eval_r2': 0.5801789164543152, 'eval_runtime': 5.2758, 'eval_samples_per_second': 315.968, 'eval_steps_per_second': 39.614, 'epoch': 13.0}
{'loss': 0.0017, 'grad_norm': 0.06797780096530914, 'learning_rate': 0.0, 'epoch': 14.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006386126042343676, 'eval_mse': 0.0006386126042343676, 'eval_mae': 0.015408344566822052, 'eval_r2': 0.5762946009635925, 'eval_runtime': 5.3007, 'eval_samples_per_second': 314.486, 'eval_steps_per_second': 39.429, 'epoch': 14.0}


[I 2025-09-25 20:46:41,442] Trial 0 finished with value: 0.592341558134649 and parameters: {'learning_rate': 1.6656050041314846e-05, 'num_train_epochs': 14, 'per_device_train_batch_size': 64, 'weight_decay': 0.04666852806200035, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.06304990010176506, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 0.592341558134649.


{'train_runtime': 1102.8386, 'train_samples_per_second': 119.836, 'train_steps_per_second': 0.939, 'train_loss': 0.017661199858232356, 'epoch': 14.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1776 [00:00<?, ?it/s]

{'loss': 0.0456, 'grad_norm': 0.34583210945129395, 'learning_rate': 1.3210972634003978e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00103335315361619, 'eval_mse': 0.00103335315361619, 'eval_mae': 0.02343824692070484, 'eval_r2': 0.31439298391342163, 'eval_runtime': 5.2855, 'eval_samples_per_second': 315.393, 'eval_steps_per_second': 39.542, 'epoch': 1.0}
{'loss': 0.0035, 'grad_norm': 0.10034699738025665, 'learning_rate': 1.2009975121821798e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0011027920991182327, 'eval_mse': 0.0011027920991182327, 'eval_mae': 0.021827643737196922, 'eval_r2': 0.2683217525482178, 'eval_runtime': 5.2854, 'eval_samples_per_second': 315.398, 'eval_steps_per_second': 39.543, 'epoch': 2.0}
{'loss': 0.0026, 'grad_norm': 0.12179628014564514, 'learning_rate': 1.0808977609639617e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008011030149646103, 'eval_mse': 0.0008011029567569494, 'eval_mae': 0.019268034026026726, 'eval_r2': 0.46848583221435547, 'eval_runtime': 5.2787, 'eval_samples_per_second': 315.797, 'eval_steps_per_second': 39.593, 'epoch': 3.0}
{'loss': 0.0022, 'grad_norm': 0.09790074080228806, 'learning_rate': 9.607980097457438e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007327792700380087, 'eval_mse': 0.0007327792700380087, 'eval_mae': 0.017196955159306526, 'eval_r2': 0.5138170719146729, 'eval_runtime': 5.291, 'eval_samples_per_second': 315.062, 'eval_steps_per_second': 39.501, 'epoch': 4.0}
{'loss': 0.002, 'grad_norm': 0.27068471908569336, 'learning_rate': 8.406982585275257e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007204783032648265, 'eval_mse': 0.0007204783032648265, 'eval_mae': 0.018412040546536446, 'eval_r2': 0.521978497505188, 'eval_runtime': 5.2769, 'eval_samples_per_second': 315.902, 'eval_steps_per_second': 39.606, 'epoch': 5.0}
{'loss': 0.0018, 'grad_norm': 0.3903214931488037, 'learning_rate': 7.205985073093078e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006572849815711379, 'eval_mse': 0.0006572849815711379, 'eval_mae': 0.01583607867360115, 'eval_r2': 0.5639058947563171, 'eval_runtime': 5.2943, 'eval_samples_per_second': 314.869, 'eval_steps_per_second': 39.477, 'epoch': 6.0}
{'loss': 0.0018, 'grad_norm': 0.07800478488206863, 'learning_rate': 6.004987560910899e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000739723676815629, 'eval_mse': 0.0007397236186079681, 'eval_mae': 0.017164727672934532, 'eval_r2': 0.5092096924781799, 'eval_runtime': 5.2857, 'eval_samples_per_second': 315.382, 'eval_steps_per_second': 39.541, 'epoch': 7.0}
{'loss': 0.0017, 'grad_norm': 0.090549997985363, 'learning_rate': 4.803990048728719e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006524591590277851, 'eval_mse': 0.0006524591590277851, 'eval_mae': 0.015377243049442768, 'eval_r2': 0.5671076774597168, 'eval_runtime': 5.2961, 'eval_samples_per_second': 314.76, 'eval_steps_per_second': 39.463, 'epoch': 8.0}
{'loss': 0.0017, 'grad_norm': 0.14385519921779633, 'learning_rate': 3.602992536546539e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006447204505093396, 'eval_mse': 0.0006447204505093396, 'eval_mae': 0.015717316418886185, 'eval_r2': 0.5722421407699585, 'eval_runtime': 5.2717, 'eval_samples_per_second': 316.218, 'eval_steps_per_second': 39.646, 'epoch': 9.0}
{'loss': 0.0016, 'grad_norm': 0.0884338840842247, 'learning_rate': 2.4019950243643595e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006966283544898033, 'eval_mse': 0.0006966283544898033, 'eval_mae': 0.017511708661913872, 'eval_r2': 0.5378024578094482, 'eval_runtime': 5.2787, 'eval_samples_per_second': 315.795, 'eval_steps_per_second': 39.593, 'epoch': 10.0}
{'loss': 0.0016, 'grad_norm': 0.21684174239635468, 'learning_rate': 1.2009975121821797e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005908847087994218, 'eval_mse': 0.0005908847087994218, 'eval_mae': 0.01452929899096489, 'eval_r2': 0.6079609990119934, 'eval_runtime': 5.2655, 'eval_samples_per_second': 316.59, 'eval_steps_per_second': 39.692, 'epoch': 11.0}
{'loss': 0.0015, 'grad_norm': 0.12100034952163696, 'learning_rate': 0.0, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006039853906258941, 'eval_mse': 0.0006039853324182332, 'eval_mae': 0.014920707792043686, 'eval_r2': 0.5992690324783325, 'eval_runtime': 5.2762, 'eval_samples_per_second': 315.945, 'eval_steps_per_second': 39.612, 'epoch': 12.0}


[I 2025-09-25 21:02:38,372] Trial 1 finished with value: 0.6147937256027944 and parameters: {'learning_rate': 1.4371395905909732e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 64, 'weight_decay': 0.038740667770757256, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.002652803013689209, 'gradient_accumulation_steps': 1}. Best is trial 0 with value: 0.592341558134649.


{'train_runtime': 956.5322, 'train_samples_per_second': 118.428, 'train_steps_per_second': 1.857, 'train_loss': 0.005645358028846818, 'epoch': 12.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1176 [00:00<?, ?it/s]

{'loss': 0.1024, 'grad_norm': 0.11736889183521271, 'learning_rate': 2.8589098845480426e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009823221480473876, 'eval_mse': 0.0009823220316320658, 'eval_mae': 0.022250119596719742, 'eval_r2': 0.34825098514556885, 'eval_runtime': 5.2866, 'eval_samples_per_second': 315.328, 'eval_steps_per_second': 39.534, 'epoch': 1.0}
{'loss': 0.0029, 'grad_norm': 0.07929808646440506, 'learning_rate': 2.447715848675244e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008251303806900978, 'eval_mse': 0.0008251304388977587, 'eval_mae': 0.018168097361922264, 'eval_r2': 0.45254409313201904, 'eval_runtime': 5.2815, 'eval_samples_per_second': 315.627, 'eval_steps_per_second': 39.572, 'epoch': 2.0}
{'loss': 0.0022, 'grad_norm': 0.29012531042099, 'learning_rate': 2.0393001508826663e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007480623899027705, 'eval_mse': 0.0007480623899027705, 'eval_mae': 0.01904672011733055, 'eval_r2': 0.5036770701408386, 'eval_runtime': 5.2867, 'eval_samples_per_second': 315.32, 'eval_steps_per_second': 39.533, 'epoch': 3.0}
{'loss': 0.0019, 'grad_norm': 0.06937593966722488, 'learning_rate': 1.628106115009867e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007212135824374855, 'eval_mse': 0.0007212134660221636, 'eval_mae': 0.017420358955860138, 'eval_r2': 0.5214906930923462, 'eval_runtime': 5.3173, 'eval_samples_per_second': 313.507, 'eval_steps_per_second': 39.306, 'epoch': 4.0}
{'loss': 0.0018, 'grad_norm': 0.3480001389980316, 'learning_rate': 1.2196904172172893e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007212128839455545, 'eval_mse': 0.0007212130003608763, 'eval_mae': 0.01921602338552475, 'eval_r2': 0.5214910507202148, 'eval_runtime': 5.2707, 'eval_samples_per_second': 316.274, 'eval_steps_per_second': 39.653, 'epoch': 5.0}
{'loss': 0.0016, 'grad_norm': 0.2391441911458969, 'learning_rate': 8.084963813444903e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005982514703646302, 'eval_mse': 0.0005982514703646302, 'eval_mae': 0.01449004840105772, 'eval_r2': 0.6030732989311218, 'eval_runtime': 5.318, 'eval_samples_per_second': 313.462, 'eval_steps_per_second': 39.3, 'epoch': 6.0}
{'loss': 0.0016, 'grad_norm': 0.3112633526325226, 'learning_rate': 4.000806835519127e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005884912679903209, 'eval_mse': 0.0005884912679903209, 'eval_mae': 0.015750432386994362, 'eval_r2': 0.6095489859580994, 'eval_runtime': 5.2932, 'eval_samples_per_second': 314.93, 'eval_steps_per_second': 39.484, 'epoch': 7.0}
{'loss': 0.0015, 'grad_norm': 0.07148171961307526, 'learning_rate': 0.0, 'epoch': 7.97}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005732090212404728, 'eval_mse': 0.0005732090794481337, 'eval_mae': 0.014092964120209217, 'eval_r2': 0.6196883916854858, 'eval_runtime': 5.2985, 'eval_samples_per_second': 314.617, 'eval_steps_per_second': 39.445, 'epoch': 7.97}


[I 2025-09-25 21:13:11,255] Trial 2 finished with value: 0.6343545648851432 and parameters: {'learning_rate': 2.9978267885591236e-05, 'num_train_epochs': 8, 'per_device_train_batch_size': 32, 'weight_decay': 0.05787060956468202, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.08243506441787984, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 0.592341558134649.


{'train_runtime': 632.4723, 'train_samples_per_second': 119.404, 'train_steps_per_second': 1.859, 'train_loss': 0.0144814418210667, 'epoch': 7.97}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/814 [00:00<?, ?it/s]

{'loss': 0.1447, 'grad_norm': 0.24585410952568054, 'learning_rate': 1.6183725977793632e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0014969747280701995, 'eval_mse': 0.0014969747280701995, 'eval_mae': 0.02644188329577446, 'eval_r2': 0.006790220737457275, 'eval_runtime': 5.2936, 'eval_samples_per_second': 314.909, 'eval_steps_per_second': 39.482, 'epoch': 1.0}
{'loss': 0.0053, 'grad_norm': 0.10241752117872238, 'learning_rate': 1.5378401371655036e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.001051203696988523, 'eval_mse': 0.0010512035805732012, 'eval_mae': 0.021736405789852142, 'eval_r2': 0.3025495409965515, 'eval_runtime': 5.2776, 'eval_samples_per_second': 315.866, 'eval_steps_per_second': 39.602, 'epoch': 2.0}
{'loss': 0.0036, 'grad_norm': 0.2351228892803192, 'learning_rate': 1.3951718121092636e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008951796917244792, 'eval_mse': 0.0008951797499321401, 'eval_mae': 0.019700845703482628, 'eval_r2': 0.40606796741485596, 'eval_runtime': 5.2894, 'eval_samples_per_second': 315.159, 'eval_steps_per_second': 39.513, 'epoch': 3.0}
{'loss': 0.0029, 'grad_norm': 0.14941474795341492, 'learning_rate': 1.2026644314238088e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000874845078215003, 'eval_mse': 0.0008748450200073421, 'eval_mae': 0.018912145867943764, 'eval_r2': 0.4195595979690552, 'eval_runtime': 5.2926, 'eval_samples_per_second': 314.965, 'eval_steps_per_second': 39.489, 'epoch': 4.0}
{'loss': 0.0025, 'grad_norm': 0.2376932054758072, 'learning_rate': 9.769105110336017e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007513191667385399, 'eval_mse': 0.0007513191667385399, 'eval_mae': 0.017810391262173653, 'eval_r2': 0.5015162825584412, 'eval_runtime': 5.3149, 'eval_samples_per_second': 313.649, 'eval_steps_per_second': 39.324, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.08217911422252655, 'learning_rate': 7.3736813875713014e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008040419197641313, 'eval_mse': 0.0008040419197641313, 'eval_mae': 0.01735563762485981, 'eval_r2': 0.46653586626052856, 'eval_runtime': 5.2947, 'eval_samples_per_second': 314.842, 'eval_steps_per_second': 39.473, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.15299008786678314, 'learning_rate': 5.0468385091122864e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007233298383653164, 'eval_mse': 0.0007233297801576555, 'eval_mae': 0.016657058149576187, 'eval_r2': 0.5200866460800171, 'eval_runtime': 5.2903, 'eval_samples_per_second': 315.105, 'eval_steps_per_second': 39.506, 'epoch': 7.0}
{'loss': 0.002, 'grad_norm': 0.0635455772280693, 'learning_rate': 2.989130746577467e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006880748551338911, 'eval_mse': 0.0006880748551338911, 'eval_mae': 0.01664305292069912, 'eval_r2': 0.543477475643158, 'eval_runtime': 5.29, 'eval_samples_per_second': 315.122, 'eval_steps_per_second': 39.508, 'epoch': 8.0}
{'loss': 0.002, 'grad_norm': 0.08041484653949738, 'learning_rate': 1.3779151898414261e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006904239999130368, 'eval_mse': 0.0006904239999130368, 'eval_mae': 0.016102802008390427, 'eval_r2': 0.5419188737869263, 'eval_runtime': 5.2878, 'eval_samples_per_second': 315.255, 'eval_steps_per_second': 39.525, 'epoch': 9.0}
{'loss': 0.002, 'grad_norm': 0.23011310398578644, 'learning_rate': 3.520650586617344e-07, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007165506831370294, 'eval_mse': 0.0007165506831370294, 'eval_mae': 0.01647321693599224, 'eval_r2': 0.5245844125747681, 'eval_runtime': 5.2884, 'eval_samples_per_second': 315.219, 'eval_steps_per_second': 39.521, 'epoch': 10.0}
{'loss': 0.002, 'grad_norm': 0.15215785801410675, 'learning_rate': 0.0, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006937851430848241, 'eval_mse': 0.000693785201292485, 'eval_mae': 0.01615152694284916, 'eval_r2': 0.5396888256072998, 'eval_runtime': 5.308, 'eval_samples_per_second': 314.056, 'eval_steps_per_second': 39.375, 'epoch': 11.0}


[I 2025-09-25 21:27:38,207] Trial 3 finished with value: 0.5565341377514414 and parameters: {'learning_rate': 1.6338721043242632e-05, 'num_train_epochs': 11, 'per_device_train_batch_size': 64, 'weight_decay': 0.08801855121750472, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.02977271097100294, 'gradient_accumulation_steps': 2}. Best is trial 3 with value: 0.5565341377514414.


{'train_runtime': 866.5524, 'train_samples_per_second': 119.831, 'train_steps_per_second': 0.939, 'train_loss': 0.015577820909989847, 'epoch': 11.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1036 [00:00<?, ?it/s]

{'loss': 0.2435, 'grad_norm': 0.6104529500007629, 'learning_rate': 1.4260803851810857e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0018239114433526993, 'eval_mse': 0.0018239114433526993, 'eval_mae': 0.03427232429385185, 'eval_r2': -0.21012508869171143, 'eval_runtime': 5.2752, 'eval_samples_per_second': 316.009, 'eval_steps_per_second': 39.62, 'epoch': 1.0}
{'loss': 0.0061, 'grad_norm': 0.10924327373504639, 'learning_rate': 1.502992712193464e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010536789195612073, 'eval_mse': 0.0010536789195612073, 'eval_mae': 0.02197161689400673, 'eval_r2': 0.3009072542190552, 'eval_runtime': 5.3208, 'eval_samples_per_second': 313.3, 'eval_steps_per_second': 39.28, 'epoch': 2.0}
{'loss': 0.0039, 'grad_norm': 0.14125442504882812, 'learning_rate': 1.4400921628291976e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009236696059815586, 'eval_mse': 0.0009236696059815586, 'eval_mae': 0.020925156772136688, 'eval_r2': 0.38716554641723633, 'eval_runtime': 5.3336, 'eval_samples_per_second': 312.545, 'eval_steps_per_second': 39.185, 'epoch': 3.0}
{'loss': 0.003, 'grad_norm': 0.06960843503475189, 'learning_rate': 1.3373266423057122e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008198122377507389, 'eval_mse': 0.0008198122959583998, 'eval_mae': 0.019419582560658455, 'eval_r2': 0.4560725688934326, 'eval_runtime': 5.3381, 'eval_samples_per_second': 312.283, 'eval_steps_per_second': 39.153, 'epoch': 4.0}
{'loss': 0.0025, 'grad_norm': 0.30841243267059326, 'learning_rate': 1.2007307712644326e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008001170353963971, 'eval_mse': 0.000800117093604058, 'eval_mae': 0.017830222845077515, 'eval_r2': 0.4691399335861206, 'eval_runtime': 5.3232, 'eval_samples_per_second': 313.156, 'eval_steps_per_second': 39.262, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.08770128339529037, 'learning_rate': 1.038325764006812e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007708287448622286, 'eval_mse': 0.0007708287448622286, 'eval_mae': 0.01694766990840435, 'eval_r2': 0.4885721206665039, 'eval_runtime': 5.3194, 'eval_samples_per_second': 313.382, 'eval_steps_per_second': 39.29, 'epoch': 6.0}
{'loss': 0.0021, 'grad_norm': 0.20684203505516052, 'learning_rate': 8.596484048947186e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007194025092758238, 'eval_mse': 0.0007194025092758238, 'eval_mae': 0.01674339361488819, 'eval_r2': 0.5226922631263733, 'eval_runtime': 5.3294, 'eval_samples_per_second': 312.792, 'eval_steps_per_second': 39.216, 'epoch': 7.0}
{'loss': 0.002, 'grad_norm': 0.06894320994615555, 'learning_rate': 6.751910270955482e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007371223764494061, 'eval_mse': 0.0007371223764494061, 'eval_mae': 0.016462275758385658, 'eval_r2': 0.5109355449676514, 'eval_runtime': 5.3471, 'eval_samples_per_second': 311.756, 'eval_steps_per_second': 39.086, 'epoch': 8.0}
{'loss': 0.002, 'grad_norm': 0.09670048207044601, 'learning_rate': 4.957853793695838e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008338867337442935, 'eval_mse': 0.0008338867919519544, 'eval_mae': 0.01878943480551243, 'eval_r2': 0.4467344880104065, 'eval_runtime': 5.3236, 'eval_samples_per_second': 313.132, 'eval_steps_per_second': 39.259, 'epoch': 9.0}
{'loss': 0.0019, 'grad_norm': 0.26625606417655945, 'learning_rate': 3.319665616164848e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006984657375141978, 'eval_mse': 0.0006984657375141978, 'eval_mae': 0.0163236316293478, 'eval_r2': 0.5365833640098572, 'eval_runtime': 5.3351, 'eval_samples_per_second': 312.46, 'eval_steps_per_second': 39.175, 'epoch': 10.0}
{'loss': 0.0019, 'grad_norm': 0.18761703372001648, 'learning_rate': 1.9335438030471378e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006622860673815012, 'eval_mse': 0.0006622860673815012, 'eval_mae': 0.015582353807985783, 'eval_r2': 0.5605877637863159, 'eval_runtime': 5.3293, 'eval_samples_per_second': 312.797, 'eval_steps_per_second': 39.217, 'epoch': 11.0}
{'loss': 0.0018, 'grad_norm': 0.08588498830795288, 'learning_rate': 8.808845197332063e-07, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006769195315428078, 'eval_mse': 0.0006769194733351469, 'eval_mae': 0.0158335343003273, 'eval_r2': 0.5508788228034973, 'eval_runtime': 5.33, 'eval_samples_per_second': 312.755, 'eval_steps_per_second': 39.212, 'epoch': 12.0}
{'loss': 0.0018, 'grad_norm': 0.11293109506368637, 'learning_rate': 2.235022678877695e-07, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006716595962643623, 'eval_mse': 0.0006716595962643623, 'eval_mae': 0.015750721096992493, 'eval_r2': 0.5543686151504517, 'eval_runtime': 5.3314, 'eval_samples_per_second': 312.676, 'eval_steps_per_second': 39.202, 'epoch': 13.0}
{'loss': 0.0018, 'grad_norm': 0.06456534564495087, 'learning_rate': 0.0, 'epoch': 14.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006798297399654984, 'eval_mse': 0.0006798297399654984, 'eval_mae': 0.015883460640907288, 'eval_r2': 0.5489479303359985, 'eval_runtime': 5.3378, 'eval_samples_per_second': 312.298, 'eval_steps_per_second': 39.154, 'epoch': 14.0}


[I 2025-09-25 21:45:58,609] Trial 4 finished with value: 0.5655112207168713 and parameters: {'learning_rate': 1.5224371679635915e-05, 'num_train_epochs': 14, 'per_device_train_batch_size': 64, 'weight_decay': 0.062231633350477246, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.0756123696990563, 'gradient_accumulation_steps': 2}. Best is trial 3 with value: 0.5565341377514414.


{'train_runtime': 1099.9714, 'train_samples_per_second': 120.149, 'train_steps_per_second': 0.942, 'train_loss': 0.01974265634093045, 'epoch': 14.0}
Best run for meaning_heavy: BestRun(run_id='3', objective=0.5565341377514414, hyperparameters={'learning_rate': 1.6338721043242632e-05, 'num_train_epochs': 11, 'per_device_train_batch_size': 64, 'weight_decay': 0.08801855121750472, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.02977271097100294, 'gradient_accumulation_steps': 2}, run_summary=None)
--- Training final model for meaning_heavy with best hyperparameters ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/814 [00:00<?, ?it/s]

{'loss': 0.1447, 'grad_norm': 0.24585440754890442, 'learning_rate': 1.6183725977793632e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0014969733310863376, 'eval_mse': 0.0014969734475016594, 'eval_mae': 0.026441868394613266, 'eval_r2': 0.006791055202484131, 'eval_runtime': 5.3044, 'eval_samples_per_second': 314.267, 'eval_steps_per_second': 39.401, 'epoch': 1.0}
{'loss': 0.0053, 'grad_norm': 0.10230160504579544, 'learning_rate': 1.5378401371655036e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0010520577197894454, 'eval_mse': 0.0010520577197894454, 'eval_mae': 0.02174557000398636, 'eval_r2': 0.3019828796386719, 'eval_runtime': 5.2789, 'eval_samples_per_second': 315.787, 'eval_steps_per_second': 39.592, 'epoch': 2.0}
{'loss': 0.0036, 'grad_norm': 0.2331494241952896, 'learning_rate': 1.3951718121092636e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008945229928940535, 'eval_mse': 0.0008945229346863925, 'eval_mae': 0.019711242988705635, 'eval_r2': 0.40650373697280884, 'eval_runtime': 5.2744, 'eval_samples_per_second': 316.055, 'eval_steps_per_second': 39.625, 'epoch': 3.0}
{'loss': 0.0029, 'grad_norm': 0.1529507339000702, 'learning_rate': 1.2026644314238088e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008730589179322124, 'eval_mse': 0.0008730590343475342, 'eval_mae': 0.018893679603934288, 'eval_r2': 0.4207445979118347, 'eval_runtime': 5.2847, 'eval_samples_per_second': 315.438, 'eval_steps_per_second': 39.548, 'epoch': 4.0}
{'loss': 0.0025, 'grad_norm': 0.24003976583480835, 'learning_rate': 9.769105110336017e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007517145713791251, 'eval_mse': 0.0007517145713791251, 'eval_mae': 0.017847534269094467, 'eval_r2': 0.5012539625167847, 'eval_runtime': 5.2961, 'eval_samples_per_second': 314.759, 'eval_steps_per_second': 39.463, 'epoch': 5.0}
{'loss': 0.0022, 'grad_norm': 0.08287274837493896, 'learning_rate': 7.3736813875713014e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008049083990044892, 'eval_mse': 0.0008049083990044892, 'eval_mae': 0.017375566065311432, 'eval_r2': 0.4659609794616699, 'eval_runtime': 5.2883, 'eval_samples_per_second': 315.222, 'eval_steps_per_second': 39.521, 'epoch': 6.0}
{'loss': 0.0022, 'grad_norm': 0.15347732603549957, 'learning_rate': 5.0468385091122864e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007234062068164349, 'eval_mse': 0.0007234062068164349, 'eval_mae': 0.0166764035820961, 'eval_r2': 0.5200358629226685, 'eval_runtime': 5.2901, 'eval_samples_per_second': 315.116, 'eval_steps_per_second': 39.508, 'epoch': 7.0}
{'loss': 0.002, 'grad_norm': 0.06295347958803177, 'learning_rate': 2.989130746577467e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006873023812659085, 'eval_mse': 0.0006873023812659085, 'eval_mae': 0.016611412167549133, 'eval_r2': 0.5439900159835815, 'eval_runtime': 5.297, 'eval_samples_per_second': 314.706, 'eval_steps_per_second': 39.456, 'epoch': 8.0}
{'loss': 0.002, 'grad_norm': 0.08089703321456909, 'learning_rate': 1.3779151898414261e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006899556028656662, 'eval_mse': 0.0006899556028656662, 'eval_mae': 0.01609998568892479, 'eval_r2': 0.5422296524047852, 'eval_runtime': 5.2794, 'eval_samples_per_second': 315.756, 'eval_steps_per_second': 39.588, 'epoch': 9.0}
{'loss': 0.002, 'grad_norm': 0.2318420112133026, 'learning_rate': 3.520650586617344e-07, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007164354319684207, 'eval_mse': 0.0007164354319684207, 'eval_mae': 0.016473818570375443, 'eval_r2': 0.5246608257293701, 'eval_runtime': 5.2864, 'eval_samples_per_second': 315.335, 'eval_steps_per_second': 39.535, 'epoch': 10.0}
{'loss': 0.002, 'grad_norm': 0.15020182728767395, 'learning_rate': 0.0, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006938438164070249, 'eval_mse': 0.0006938438164070249, 'eval_mae': 0.016158640384674072, 'eval_r2': 0.5396499037742615, 'eval_runtime': 5.2731, 'eval_samples_per_second': 316.132, 'eval_steps_per_second': 39.635, 'epoch': 11.0}
{'train_runtime': 860.217, 'train_samples_per_second': 120.714, 'train_steps_per_second': 0.946, 'train_loss': 0.01557725159044055, 'epoch': 11.0}
--- Saved final optimized model to rm_out_meaning_heavy_final ---

--- Evaluating final model for meaning_heavy ---


  0%|          | 0/209 [00:00<?, ?it/s]

Calculating stable scores (train) (num_proc=12):   0%|          | 0/9440 [00:00<?, ? examples/s]


--- Summary for Configuration: meaning_heavy ---
  Best Hyperparameters Found:
    - learning_rate: 1.6338721043242632e-05
    - num_train_epochs: 11
    - per_device_train_batch_size: 64
    - weight_decay: 0.08801855121750472
    - lr_scheduler_type: cosine
    - warmup_ratio: 0.02977271097100294
    - gradient_accumulation_steps: 2
  Final Evaluation Metrics:
    - eval_loss: 0.0007
    - eval_mse: 0.0007
    - eval_mae: 0.0166
    - eval_r2: 0.5440
    - eval_runtime: 5.2713
    - eval_samples_per_second: 316.2390
    - eval_steps_per_second: 39.6480
--------------------------------------------------

--- Processing configuration: grammar_focused ---
Weights: {'rules_score': 0.2, 'meaning_score': 0.1, 'grammar_score': 0.7}
--- Calculating rewards for the TRAINING set ---


Calculating meaning scores (train):   0%|          | 0/9440 [00:00<?, ?it/s]

--- Calculating rewards for the TEST set ---


Calculating stable scores (test) (num_proc=12):   0%|          | 0/1667 [00:00<?, ? examples/s]

Calculating meaning scores (test):   0%|          | 0/1667 [00:00<?, ?it/s]

--- Finished calculating rewards for grammar_focused ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-09-25 22:06:42,865] A new study created in memory with name: no-name-454813b3-5323-4a2e-9bba-ac263e72bc90
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Running hyperparameter search for grammar_focused ---


  0%|          | 0/1323 [00:00<?, ?it/s]

{'loss': 0.1281, 'grad_norm': 0.2834423780441284, 'learning_rate': 2.4255943871754864e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0009492376120761037, 'eval_mse': 0.0009492376120761037, 'eval_mae': 0.021873291581869125, 'eval_r2': -0.2046501636505127, 'eval_runtime': 5.2386, 'eval_samples_per_second': 318.215, 'eval_steps_per_second': 39.896, 'epoch': 1.0}
{'loss': 0.0031, 'grad_norm': 0.21327441930770874, 'learning_rate': 2.1203325085173468e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005915566580370069, 'eval_mse': 0.0005915566580370069, 'eval_mae': 0.017430711537599564, 'eval_r2': 0.2492724061012268, 'eval_runtime': 5.2575, 'eval_samples_per_second': 317.073, 'eval_steps_per_second': 39.753, 'epoch': 2.0}
{'loss': 0.0022, 'grad_norm': 0.07098811864852905, 'learning_rate': 1.817133210120411e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005202414467930794, 'eval_mse': 0.0005202414467930794, 'eval_mae': 0.015820929780602455, 'eval_r2': 0.33977657556533813, 'eval_runtime': 5.2838, 'eval_samples_per_second': 315.494, 'eval_steps_per_second': 39.555, 'epoch': 3.0}
{'loss': 0.0019, 'grad_norm': 0.09079191088676453, 'learning_rate': 1.5118713314622718e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000540912093129009, 'eval_mse': 0.000540912093129009, 'eval_mae': 0.014851117506623268, 'eval_r2': 0.31354397535324097, 'eval_runtime': 5.3009, 'eval_samples_per_second': 314.476, 'eval_steps_per_second': 39.427, 'epoch': 4.0}
{'loss': 0.0017, 'grad_norm': 0.3213924169540405, 'learning_rate': 1.2086720330653359e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00048598338617011905, 'eval_mse': 0.00048598338617011905, 'eval_mae': 0.016088005155324936, 'eval_r2': 0.38325250148773193, 'eval_runtime': 5.2818, 'eval_samples_per_second': 315.614, 'eval_steps_per_second': 39.57, 'epoch': 5.0}
{'loss': 0.0016, 'grad_norm': 0.09743226319551468, 'learning_rate': 9.034101544071964e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004693698137998581, 'eval_mse': 0.0004693698137998581, 'eval_mae': 0.013321101665496826, 'eval_r2': 0.40433627367019653, 'eval_runtime': 5.279, 'eval_samples_per_second': 315.779, 'eval_steps_per_second': 39.591, 'epoch': 6.0}
{'loss': 0.0016, 'grad_norm': 0.19821617007255554, 'learning_rate': 6.002108560102606e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004398328601382673, 'eval_mse': 0.0004398328601382673, 'eval_mae': 0.012852697633206844, 'eval_r2': 0.44182074069976807, 'eval_runtime': 5.2746, 'eval_samples_per_second': 316.041, 'eval_steps_per_second': 39.624, 'epoch': 7.0}
{'loss': 0.0015, 'grad_norm': 0.048551470041275024, 'learning_rate': 2.949489773521212e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00048291755956597626, 'eval_mse': 0.00048291755956597626, 'eval_mae': 0.01385778933763504, 'eval_r2': 0.38714325428009033, 'eval_runtime': 5.2838, 'eval_samples_per_second': 315.492, 'eval_steps_per_second': 39.555, 'epoch': 8.0}
{'loss': 0.0015, 'grad_norm': 0.12565472722053528, 'learning_rate': 0.0, 'epoch': 8.97}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004600746906362474, 'eval_mse': 0.0004600746906362474, 'eval_mae': 0.013298746198415756, 'eval_r2': 0.41613245010375977, 'eval_runtime': 5.2875, 'eval_samples_per_second': 315.273, 'eval_steps_per_second': 39.527, 'epoch': 8.97}


[I 2025-09-25 22:18:41,934] Trial 0 finished with value: 0.42989127099281177 and parameters: {'learning_rate': 2.468908572660763e-05, 'num_train_epochs': 9, 'per_device_train_batch_size': 32, 'weight_decay': 0.04456622391436104, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.09480543661713012, 'gradient_accumulation_steps': 2}. Best is trial 0 with value: 0.42989127099281177.


{'train_runtime': 718.6455, 'train_samples_per_second': 118.222, 'train_steps_per_second': 1.841, 'train_loss': 0.015911165278514768, 'epoch': 8.97}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/882 [00:00<?, ?it/s]

{'loss': 0.0285, 'grad_norm': 0.2782436013221741, 'learning_rate': 3.0355269065062348e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006454706308431923, 'eval_mse': 0.0006454706308431923, 'eval_mae': 0.018441341817378998, 'eval_r2': 0.18085181713104248, 'eval_runtime': 5.2863, 'eval_samples_per_second': 315.346, 'eval_steps_per_second': 39.537, 'epoch': 1.0}
{'loss': 0.0024, 'grad_norm': 0.11786005645990372, 'learning_rate': 2.4393359491879297e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005533770890906453, 'eval_mse': 0.0005533770890906453, 'eval_mae': 0.016748081892728806, 'eval_r2': 0.2977250814437866, 'eval_runtime': 5.3257, 'eval_samples_per_second': 313.013, 'eval_steps_per_second': 39.244, 'epoch': 2.0}
{'loss': 0.0019, 'grad_norm': 0.17561596632003784, 'learning_rate': 1.6257787755354905e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005416633794084191, 'eval_mse': 0.0005416633212007582, 'eval_mae': 0.014453272335231304, 'eval_r2': 0.31259065866470337, 'eval_runtime': 5.2577, 'eval_samples_per_second': 317.058, 'eval_steps_per_second': 39.751, 'epoch': 3.0}
{'loss': 0.0017, 'grad_norm': 0.11137939244508743, 'learning_rate': 8.064213505400748e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00046451526577584445, 'eval_mse': 0.00046451526577584445, 'eval_mae': 0.013428816571831703, 'eval_r2': 0.4104970693588257, 'eval_runtime': 5.3043, 'eval_samples_per_second': 314.274, 'eval_steps_per_second': 39.402, 'epoch': 4.0}
{'loss': 0.0016, 'grad_norm': 0.11991415917873383, 'learning_rate': 2.1309453899105613e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00044276786502450705, 'eval_mse': 0.00044276786502450705, 'eval_mae': 0.013258546590805054, 'eval_r2': 0.4380960464477539, 'eval_runtime': 5.2508, 'eval_samples_per_second': 317.477, 'eval_steps_per_second': 39.804, 'epoch': 5.0}
{'loss': 0.0016, 'grad_norm': 0.05650680512189865, 'learning_rate': 0.0, 'epoch': 5.98}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00047708823694847524, 'eval_mse': 0.00047708823694847524, 'eval_mae': 0.01339559443295002, 'eval_r2': 0.39454102516174316, 'eval_runtime': 5.3259, 'eval_samples_per_second': 312.996, 'eval_steps_per_second': 39.242, 'epoch': 5.98}


[I 2025-09-25 22:26:37,042] Trial 1 finished with value: 0.40841370783164166 and parameters: {'learning_rate': 3.245757299728005e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 32, 'weight_decay': 0.011463245625413577, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.0028667669066542835, 'gradient_accumulation_steps': 2}. Best is trial 1 with value: 0.40841370783164166.


{'train_runtime': 474.7123, 'train_samples_per_second': 119.314, 'train_steps_per_second': 1.858, 'train_loss': 0.00629830892596926, 'epoch': 5.98}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/444 [00:00<?, ?it/s]

{'loss': 0.1897, 'grad_norm': 0.1952834278345108, 'learning_rate': 1.1866620135446003e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000928518536966294, 'eval_mse': 0.000928518536966294, 'eval_mae': 0.024562038481235504, 'eval_r2': -0.17835617065429688, 'eval_runtime': 5.2768, 'eval_samples_per_second': 315.91, 'eval_steps_per_second': 39.607, 'epoch': 1.0}
{'loss': 0.0057, 'grad_norm': 0.11159450560808182, 'learning_rate': 9.493296108356803e-06, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006814906955696642, 'eval_mse': 0.0006814906955696642, 'eval_mae': 0.019402919337153435, 'eval_r2': 0.13513976335525513, 'eval_runtime': 5.2805, 'eval_samples_per_second': 315.689, 'eval_steps_per_second': 39.58, 'epoch': 2.0}
{'loss': 0.0041, 'grad_norm': 0.22053664922714233, 'learning_rate': 7.1199720812676026e-06, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000710626074578613, 'eval_mse': 0.000710626074578613, 'eval_mae': 0.018915941938757896, 'eval_r2': 0.09816485643386841, 'eval_runtime': 5.2841, 'eval_samples_per_second': 315.477, 'eval_steps_per_second': 39.553, 'epoch': 3.0}
{'loss': 0.0034, 'grad_norm': 0.12846308946609497, 'learning_rate': 4.746648054178402e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006070351810194552, 'eval_mse': 0.0006070351810194552, 'eval_mae': 0.017647089436650276, 'eval_r2': 0.22962915897369385, 'eval_runtime': 5.278, 'eval_samples_per_second': 315.839, 'eval_steps_per_second': 39.598, 'epoch': 4.0}
{'loss': 0.003, 'grad_norm': 0.2504008412361145, 'learning_rate': 2.373324027089201e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006072453106753528, 'eval_mse': 0.0006072453106753528, 'eval_mae': 0.017275260761380196, 'eval_r2': 0.22936242818832397, 'eval_runtime': 5.2736, 'eval_samples_per_second': 316.103, 'eval_steps_per_second': 39.631, 'epoch': 5.0}
{'loss': 0.0028, 'grad_norm': 0.08503253012895584, 'learning_rate': 0.0, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005925585282966495, 'eval_mse': 0.0005925585865043104, 'eval_mae': 0.017202531918883324, 'eval_r2': 0.24800091981887817, 'eval_runtime': 5.2862, 'eval_samples_per_second': 315.351, 'eval_steps_per_second': 39.537, 'epoch': 6.0}


[I 2025-09-25 22:34:29,082] Trial 2 finished with value: 0.2657960103242658 and parameters: {'learning_rate': 1.3053282148990605e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 64, 'weight_decay': 0.002884554211301771, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.08135044384600738, 'gradient_accumulation_steps': 2}. Best is trial 2 with value: 0.2657960103242658.


{'train_runtime': 471.638, 'train_samples_per_second': 120.092, 'train_steps_per_second': 0.941, 'train_loss': 0.03479731173531429, 'epoch': 6.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/3540 [00:00<?, ?it/s]

{'loss': 0.0421, 'grad_norm': 0.07500358670949936, 'learning_rate': 3.703590166239635e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007559121586382389, 'eval_mse': 0.0007559121586382389, 'eval_mae': 0.018345266580581665, 'eval_r2': 0.040693581104278564, 'eval_runtime': 5.2564, 'eval_samples_per_second': 317.136, 'eval_steps_per_second': 39.761, 'epoch': 1.0}
{'loss': 0.002, 'grad_norm': 0.35544246435165405, 'learning_rate': 3.3669001511269414e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005305277300067246, 'eval_mse': 0.0005305277300067246, 'eval_mae': 0.0142644252628088, 'eval_r2': 0.32672256231307983, 'eval_runtime': 5.2493, 'eval_samples_per_second': 317.564, 'eval_steps_per_second': 39.815, 'epoch': 2.0}
{'loss': 0.0017, 'grad_norm': 0.21545910835266113, 'learning_rate': 3.030210136014247e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004288855125196278, 'eval_mse': 0.0004288855125196278, 'eval_mae': 0.015427189879119396, 'eval_r2': 0.45571374893188477, 'eval_runtime': 5.2772, 'eval_samples_per_second': 315.888, 'eval_steps_per_second': 39.604, 'epoch': 3.0}
{'loss': 0.0015, 'grad_norm': 0.3104972541332245, 'learning_rate': 2.6935201209015533e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006076836143620312, 'eval_mse': 0.0006076836143620312, 'eval_mae': 0.017796823754906654, 'eval_r2': 0.22880619764328003, 'eval_runtime': 5.2467, 'eval_samples_per_second': 317.726, 'eval_steps_per_second': 39.835, 'epoch': 4.0}
{'loss': 0.0015, 'grad_norm': 0.11271736025810242, 'learning_rate': 2.356830105788859e-05, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005126804462634027, 'eval_mse': 0.0005126803880557418, 'eval_mae': 0.01627757027745247, 'eval_r2': 0.34937208890914917, 'eval_runtime': 5.2896, 'eval_samples_per_second': 315.149, 'eval_steps_per_second': 39.512, 'epoch': 5.0}
{'loss': 0.0014, 'grad_norm': 0.11721663177013397, 'learning_rate': 2.0201400906761644e-05, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00032231604564003646, 'eval_mse': 0.00032231604564003646, 'eval_mae': 0.0118898656219244, 'eval_r2': 0.5909579992294312, 'eval_runtime': 5.2919, 'eval_samples_per_second': 315.011, 'eval_steps_per_second': 39.495, 'epoch': 6.0}
{'loss': 0.0013, 'grad_norm': 0.33185625076293945, 'learning_rate': 1.6834500755634707e-05, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00034015843993984163, 'eval_mse': 0.00034015843993984163, 'eval_mae': 0.014074373990297318, 'eval_r2': 0.5683146715164185, 'eval_runtime': 5.2923, 'eval_samples_per_second': 314.984, 'eval_steps_per_second': 39.491, 'epoch': 7.0}
{'loss': 0.0013, 'grad_norm': 0.18355946242809296, 'learning_rate': 1.3467600604507766e-05, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0002672937116585672, 'eval_mse': 0.0002672937116585672, 'eval_mae': 0.009720370173454285, 'eval_r2': 0.6607852578163147, 'eval_runtime': 5.27, 'eval_samples_per_second': 316.318, 'eval_steps_per_second': 39.658, 'epoch': 8.0}
{'loss': 0.0012, 'grad_norm': 0.07794883102178574, 'learning_rate': 1.0100700453380822e-05, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0002487821038812399, 'eval_mse': 0.0002487821038812399, 'eval_mae': 0.009445684030652046, 'eval_r2': 0.6842777729034424, 'eval_runtime': 5.2724, 'eval_samples_per_second': 316.175, 'eval_steps_per_second': 39.64, 'epoch': 9.0}
{'loss': 0.0012, 'grad_norm': 0.05428371950984001, 'learning_rate': 6.733800302253883e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00023596313258167356, 'eval_mse': 0.00023596313258167356, 'eval_mae': 0.008966119959950447, 'eval_r2': 0.7005459666252136, 'eval_runtime': 5.2861, 'eval_samples_per_second': 315.355, 'eval_steps_per_second': 39.538, 'epoch': 10.0}
{'loss': 0.0011, 'grad_norm': 0.15250056982040405, 'learning_rate': 3.3669001511269416e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00022804512991569936, 'eval_mse': 0.00022804512991569936, 'eval_mae': 0.008910531178116798, 'eval_r2': 0.7105944752693176, 'eval_runtime': 5.2871, 'eval_samples_per_second': 315.296, 'eval_steps_per_second': 39.53, 'epoch': 11.0}
{'loss': 0.0011, 'grad_norm': 0.05238274484872818, 'learning_rate': 0.0, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0002280049811815843, 'eval_mse': 0.0002280049811815843, 'eval_mae': 0.008853941224515438, 'eval_r2': 0.7106454372406006, 'eval_runtime': 5.2777, 'eval_samples_per_second': 315.86, 'eval_steps_per_second': 39.601, 'epoch': 12.0}


[I 2025-09-25 22:51:23,146] Trial 3 finished with value: 0.7197273834462976 and parameters: {'learning_rate': 3.960387635393385e-05, 'num_train_epochs': 12, 'per_device_train_batch_size': 32, 'weight_decay': 0.06389093720719712, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.01975699245800593, 'gradient_accumulation_steps': 1}. Best is trial 2 with value: 0.2657960103242658.


{'train_runtime': 1013.6402, 'train_samples_per_second': 111.756, 'train_steps_per_second': 3.492, 'train_loss': 0.004769368191896859, 'epoch': 12.0}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/2220 [00:00<?, ?it/s]

{'loss': 0.1384, 'grad_norm': 0.2732158899307251, 'learning_rate': 1.2164527634527193e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0007308580679818988, 'eval_mse': 0.0007308580679818988, 'eval_mae': 0.021007269620895386, 'eval_r2': 0.07248908281326294, 'eval_runtime': 5.2613, 'eval_samples_per_second': 316.844, 'eval_steps_per_second': 39.724, 'epoch': 1.0}
{'loss': 0.0039, 'grad_norm': 0.1020132377743721, 'learning_rate': 1.1882611298909567e-05, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006080757593736053, 'eval_mse': 0.0006080757593736053, 'eval_mae': 0.017325541004538536, 'eval_r2': 0.22830849885940552, 'eval_runtime': 5.2777, 'eval_samples_per_second': 315.859, 'eval_steps_per_second': 39.601, 'epoch': 2.0}
{'loss': 0.0026, 'grad_norm': 0.2794414758682251, 'learning_rate': 1.1330040273843767e-05, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005378508358262479, 'eval_mse': 0.0005378508358262479, 'eval_mae': 0.01636325754225254, 'eval_r2': 0.31742894649505615, 'eval_runtime': 5.2722, 'eval_samples_per_second': 316.185, 'eval_steps_per_second': 39.642, 'epoch': 3.0}
{'loss': 0.0021, 'grad_norm': 0.09532219171524048, 'learning_rate': 1.0532676168523642e-05, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00049490702804178, 'eval_mse': 0.00049490702804178, 'eval_mae': 0.015474433079361916, 'eval_r2': 0.37192773818969727, 'eval_runtime': 5.2905, 'eval_samples_per_second': 315.095, 'eval_steps_per_second': 39.505, 'epoch': 4.0}
{'loss': 0.0019, 'grad_norm': 0.3968140482902527, 'learning_rate': 9.527837477661307e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005045324214734137, 'eval_mse': 0.0005045324796810746, 'eval_mae': 0.01716572977602482, 'eval_r2': 0.3597123622894287, 'eval_runtime': 5.2772, 'eval_samples_per_second': 315.886, 'eval_steps_per_second': 39.604, 'epoch': 5.0}
{'loss': 0.0018, 'grad_norm': 0.283194363117218, 'learning_rate': 8.362552989135832e-06, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004933997406624258, 'eval_mse': 0.0004933997406624258, 'eval_mae': 0.01698208972811699, 'eval_r2': 0.3738405704498291, 'eval_runtime': 5.2784, 'eval_samples_per_second': 315.815, 'eval_steps_per_second': 39.595, 'epoch': 6.0}
{'loss': 0.0017, 'grad_norm': 0.09016034752130508, 'learning_rate': 7.091360727336582e-06, 'epoch': 7.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004486095567699522, 'eval_mse': 0.00044860958587378263, 'eval_mae': 0.013476352207362652, 'eval_r2': 0.4306824803352356, 'eval_runtime': 5.2898, 'eval_samples_per_second': 315.137, 'eval_steps_per_second': 39.51, 'epoch': 7.0}
{'loss': 0.0016, 'grad_norm': 0.08366722613573074, 'learning_rate': 5.773755446772226e-06, 'epoch': 8.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004135398776270449, 'eval_mse': 0.00041353993583470583, 'eval_mae': 0.013932320289313793, 'eval_r2': 0.47518837451934814, 'eval_runtime': 5.2807, 'eval_samples_per_second': 315.675, 'eval_steps_per_second': 39.578, 'epoch': 8.0}
{'loss': 0.0016, 'grad_norm': 0.09015115350484848, 'learning_rate': 4.471404139181325e-06, 'epoch': 9.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004067040572408587, 'eval_mse': 0.0004067040572408587, 'eval_mae': 0.01284914743155241, 'eval_r2': 0.48386359214782715, 'eval_runtime': 5.2823, 'eval_samples_per_second': 315.582, 'eval_steps_per_second': 39.566, 'epoch': 9.0}
{'loss': 0.0015, 'grad_norm': 0.22585731744766235, 'learning_rate': 3.2452598748881848e-06, 'epoch': 10.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0004263071750756353, 'eval_mse': 0.0004263071750756353, 'eval_mae': 0.013038797304034233, 'eval_r2': 0.4589858055114746, 'eval_runtime': 5.2735, 'eval_samples_per_second': 316.107, 'eval_steps_per_second': 39.632, 'epoch': 10.0}
{'loss': 0.0015, 'grad_norm': 0.1740007996559143, 'learning_rate': 2.152709057341395e-06, 'epoch': 11.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0003896373964380473, 'eval_mse': 0.0003896373964380473, 'eval_mae': 0.01265018992125988, 'eval_r2': 0.5055223703384399, 'eval_runtime': 5.2741, 'eval_samples_per_second': 316.073, 'eval_steps_per_second': 39.628, 'epoch': 11.0}
{'loss': 0.0015, 'grad_norm': 0.16248703002929688, 'learning_rate': 1.244885605957821e-06, 'epoch': 12.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0003989820834249258, 'eval_mse': 0.0003989820834249258, 'eval_mae': 0.012605050578713417, 'eval_r2': 0.49366331100463867, 'eval_runtime': 5.2881, 'eval_samples_per_second': 315.234, 'eval_steps_per_second': 39.522, 'epoch': 12.0}
{'loss': 0.0014, 'grad_norm': 0.25069528818130493, 'learning_rate': 5.642777697619121e-07, 'epoch': 13.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00040273231570608914, 'eval_mse': 0.00040273231570608914, 'eval_mae': 0.012671108357608318, 'eval_r2': 0.4889039993286133, 'eval_runtime': 5.2702, 'eval_samples_per_second': 316.309, 'eval_steps_per_second': 39.657, 'epoch': 13.0}
{'loss': 0.0014, 'grad_norm': 0.0823645070195198, 'learning_rate': 1.4273957850825097e-07, 'epoch': 14.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00038717189454473555, 'eval_mse': 0.00038717189454473555, 'eval_mae': 0.012480287812650204, 'eval_r2': 0.5086512565612793, 'eval_runtime': 5.2748, 'eval_samples_per_second': 316.03, 'eval_steps_per_second': 39.622, 'epoch': 14.0}
{'loss': 0.0015, 'grad_norm': 0.07303909212350845, 'learning_rate': 0.0, 'epoch': 15.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.00039220356848090887, 'eval_mse': 0.00039220356848090887, 'eval_mae': 0.012518585659563541, 'eval_r2': 0.5022657513618469, 'eval_runtime': 5.2895, 'eval_samples_per_second': 315.155, 'eval_steps_per_second': 39.513, 'epoch': 15.0}


[I 2025-09-25 23:11:22,002] Trial 4 finished with value: 0.5151765405898914 and parameters: {'learning_rate': 1.2199357684599883e-05, 'num_train_epochs': 15, 'per_device_train_batch_size': 64, 'weight_decay': 0.029804225317284428, 'lr_scheduler_type': 'cosine', 'warmup_ratio': 0.03360743684008022, 'gradient_accumulation_steps': 1}. Best is trial 2 with value: 0.2657960103242658.


{'train_runtime': 1198.4392, 'train_samples_per_second': 118.154, 'train_steps_per_second': 1.852, 'train_loss': 0.010953260803813333, 'epoch': 15.0}
Best run for grammar_focused: BestRun(run_id='2', objective=0.2657960103242658, hyperparameters={'learning_rate': 1.3053282148990605e-05, 'num_train_epochs': 6, 'per_device_train_batch_size': 64, 'weight_decay': 0.002884554211301771, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.08135044384600738, 'gradient_accumulation_steps': 2}, run_summary=None)
--- Training final model for grammar_focused with best hyperparameters ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/444 [00:00<?, ?it/s]

{'loss': 0.1891, 'grad_norm': 0.2747263014316559, 'learning_rate': 1.1866620135446003e-05, 'epoch': 1.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0008462203550152481, 'eval_mse': 0.0008462203550152481, 'eval_mae': 0.022172078490257263, 'eval_r2': -0.07391393184661865, 'eval_runtime': 5.3206, 'eval_samples_per_second': 313.312, 'eval_steps_per_second': 39.281, 'epoch': 1.0}
{'loss': 0.0049, 'grad_norm': 0.18352562189102173, 'learning_rate': 9.493296108356803e-06, 'epoch': 2.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006517216679640114, 'eval_mse': 0.0006517216679640114, 'eval_mae': 0.01842045783996582, 'eval_r2': 0.17291873693466187, 'eval_runtime': 5.3044, 'eval_samples_per_second': 314.268, 'eval_steps_per_second': 39.401, 'epoch': 2.0}
{'loss': 0.0034, 'grad_norm': 0.7312589883804321, 'learning_rate': 7.1199720812676026e-06, 'epoch': 3.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0006927505019120872, 'eval_mse': 0.0006927505019120872, 'eval_mae': 0.0181000754237175, 'eval_r2': 0.12085026502609253, 'eval_runtime': 5.3139, 'eval_samples_per_second': 313.703, 'eval_steps_per_second': 39.33, 'epoch': 3.0}
{'loss': 0.0028, 'grad_norm': 0.16988013684749603, 'learning_rate': 4.746648054178402e-06, 'epoch': 4.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005767805268988013, 'eval_mse': 0.0005767805268988013, 'eval_mae': 0.016463691368699074, 'eval_r2': 0.2680244445800781, 'eval_runtime': 5.3177, 'eval_samples_per_second': 313.481, 'eval_steps_per_second': 39.303, 'epoch': 4.0}
{'loss': 0.0025, 'grad_norm': 0.4116610288619995, 'learning_rate': 2.373324027089201e-06, 'epoch': 5.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.000575135403778404, 'eval_mse': 0.000575135403778404, 'eval_mae': 0.016122279688715935, 'eval_r2': 0.27011221647262573, 'eval_runtime': 5.3146, 'eval_samples_per_second': 313.663, 'eval_steps_per_second': 39.325, 'epoch': 5.0}
{'loss': 0.0024, 'grad_norm': 0.13217014074325562, 'learning_rate': 0.0, 'epoch': 6.0}


  0%|          | 0/209 [00:00<?, ?it/s]

{'eval_loss': 0.0005668141529895365, 'eval_mse': 0.0005668141529895365, 'eval_mae': 0.016070080921053886, 'eval_r2': 0.28067249059677124, 'eval_runtime': 5.3092, 'eval_samples_per_second': 313.984, 'eval_steps_per_second': 39.366, 'epoch': 6.0}
{'train_runtime': 468.9679, 'train_samples_per_second': 120.776, 'train_steps_per_second': 0.947, 'train_loss': 0.03418458206159575, 'epoch': 6.0}
--- Saved final optimized model to rm_out_grammar_focused_final ---

--- Evaluating final model for grammar_focused ---


  0%|          | 0/209 [00:00<?, ?it/s]


--- Summary for Configuration: grammar_focused ---
  Best Hyperparameters Found:
    - learning_rate: 1.3053282148990605e-05
    - num_train_epochs: 6
    - per_device_train_batch_size: 64
    - weight_decay: 0.002884554211301771
    - lr_scheduler_type: linear
    - warmup_ratio: 0.08135044384600738
    - gradient_accumulation_steps: 2
  Final Evaluation Metrics:
    - eval_loss: 0.0006
    - eval_mse: 0.0006
    - eval_mae: 0.0161
    - eval_r2: 0.2807
    - eval_runtime: 5.2948
    - eval_samples_per_second: 314.8390
    - eval_steps_per_second: 39.4730
--------------------------------------------------


In [19]:
# ==============================================================================
#  FINAL SUMMARY OF ALL RUNS (NOW USING THE HELPER FUNCTION)
# ==============================================================================

print("\n\n=================================================")
print("           GRID SEARCH FINAL SUMMARY")
print("=================================================\n")

for result in all_results:
    # Call the same helper function to print each result
    print_run_summary(result)



           GRID SEARCH FINAL SUMMARY


--- Summary for Configuration: balanced ---
  Best Hyperparameters Found:
    - learning_rate: 2.836269262850202e-05
    - num_train_epochs: 7
    - per_device_train_batch_size: 64
    - weight_decay: 0.027205054637907
    - lr_scheduler_type: cosine
    - warmup_ratio: 0.05439758905893752
    - gradient_accumulation_steps: 2
  Final Evaluation Metrics:
    - eval_loss: 0.0008
    - eval_mse: 0.0008
    - eval_mae: 0.0176
    - eval_r2: 0.7519
    - eval_runtime: 5.3628
    - eval_samples_per_second: 310.8460
    - eval_steps_per_second: 38.9720
--------------------------------------------------

--- Summary for Configuration: rules_heavy ---
  Best Hyperparameters Found:
    - learning_rate: 1.4942277347418218e-05
    - num_train_epochs: 8
    - per_device_train_batch_size: 32
    - weight_decay: 0.05285585376949559
    - lr_scheduler_type: cosine
    - warmup_ratio: 0.04364105576407306
    - gradient_accumulation_steps: 2
  Final Evaluation Me

## Test Code till here 

In [20]:
# class StopExecution(Exception):
#     def _render_traceback_(self):
#         # This special method prevents the ugly traceback from appearing
#         pass

In [21]:
# raise StopExecution

In [22]:
# # Save the final, trained reward model
# trainer.save_model("model_rm")
# tokenizer_rm.save_pretrained("model_rm")

## Model_rm has been trained with the whole dataset now