In [None]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
from datasets import Dataset, DatasetDict
import torch
import spacy
nlp = spacy.load("de_core_news_lg")
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
# Logging: keep the transformers library at WARNING level so the notebook
# output is not flooded with informational messages.
transformers.logging.set_verbosity_warning()

# Tokenizers: switch off tokenizer parallelism via its environment variable.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Hardware: use Apple's Metal backend (MPS) when PyTorch reports it as
# available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
# Import core reward computation functions
from ipynb.fs.full.reward_computation import (
    compute_reward,
    rule_compliance_score,
    calculate_grammar_score,
    calculate_semantic_similarity
)

In [None]:
# Modify the model initialization to move the model to the MPS device right after it's loaded.
def model_init():
    """Create a fresh regression reward model and place it on ``device``.

    Loads the "distilbert-base-german-cased" checkpoint with a single-output
    sequence-classification head configured for regression, then moves the
    model to the module-level ``device``.

    Returns:
        The model returned by ``.to(device)``.
    """
    checkpoint = "distilbert-base-german-cased"
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        problem_type="regression",
        num_labels=1,
    )
    reward_model = reward_model.to(device)
    return reward_model

## Hyperparameter


### How Optuna Smartly Finds Hyperparameters

Optuna doesn't use a simple grid or random search. By default, it uses a smart Bayesian optimization algorithm called **TPE (Tree-structured Parzen Estimator)** that learns from past results to make better guesses over time.

#### The TPE Process
1.  **Explore:** It starts with a few random trials to gather initial data.
2.  **Model:** It divides the results into a "good" group (e.g., top 25% of scores) and a "bad" group.
3.  **Suggest:** For the next trial, it suggests a new set of hyperparameters that are statistically more likely to be in the "good" group than the "bad" group.

#### Finding the Best Setting
* **Grid Search:** Checks every single possible setting. Exhaustive but very slow.
* **Random Search:** Tries random settings. Better than grid search, but doesn't learn from its mistakes.
* **Optuna (TPE):** Plays "Hot and Cold." It uses the feedback from previous trials ("warmer" or "colder") to make its next guess much more intelligent, focusing its search on the most promising areas of the hyperparameter space.

In [None]:
# ==============================================================================
#  HYPERPARAMETER SEARCH CONFIGURATION
# ==============================================================================

# Number of hyperparameter combinations Optuna evaluates per search.
N_TRIALS = 5

# Central definition of the search space. Every entry maps a hyperparameter
# name to a spec with a "type" ("float", "int" or "categorical") plus either
# "low"/"high" bounds (and optionally "log") or a list of "choices".
HP_SEARCH_SPACE = dict(
    # learning_rate: how strongly the weights are updated at each step.
    # The most critical hyperparameter: too high and training does not
    # converge, too low and it becomes extremely slow. Sampled on a log scale
    # because good values can differ by orders of magnitude.
    learning_rate=dict(type="float", low=1e-5, high=5e-5, log=True),
    # num_train_epochs: number of full passes over the training set.
    # Balances underfitting (too few passes) against overfitting, i.e.
    # memorising the training data and generalising poorly (too many).
    num_train_epochs=dict(type="int", low=6, high=15),
    # per_device_train_batch_size: examples per forward/backward pass.
    # Larger batches give more stable gradient estimates but need more memory.
    per_device_train_batch_size=dict(type="categorical", choices=[32, 64]),
    # weight_decay: regularisation that penalises large weights, which helps
    # against overfitting and improves generalisation to new data.
    weight_decay=dict(type="float", low=0.0, high=0.1),
    # lr_scheduler_type: how the learning rate changes during training.
    # A decaying schedule usually converges faster and more reliably than a
    # constant learning rate.
    lr_scheduler_type=dict(type="categorical", choices=["linear", "cosine"]),
    # warmup_ratio: fraction of all training steps spent ramping the learning
    # rate up from 0 to its target value, avoiding large unstable updates at
    # the very start of training.
    warmup_ratio=dict(type="float", low=0.0, high=0.1),
    # gradient_accumulation_steps: number of batches whose gradients are
    # accumulated before one weight update. Simulates a larger batch without
    # extra memory: with a per-device batch of 32 and 2 accumulation steps the
    # effective batch size is 64.
    gradient_accumulation_steps=dict(type="categorical", choices=[1, 2]),
)

In [None]:
def model_hp_space(trial):
    """Build the hyperparameter suggestions for one Optuna trial.

    Reads the module-level ``HP_SEARCH_SPACE`` dictionary and calls the
    matching ``trial.suggest_*`` method for every entry.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (an Optuna trial).

    Returns:
        dict: Maps each hyperparameter name to the value suggested for this trial.

    Raises:
        ValueError: If an entry declares a "type" other than "float", "int"
            or "categorical". Such entries used to be skipped silently, which
            hid typos in the configuration and dropped the parameter from the
            search without any warning.
    """
    params = {}
    for name, config in HP_SEARCH_SPACE.items():
        param_type = config["type"]

        if param_type == "float":
            # "log" is optional in the spec; sample linearly unless requested.
            params[name] = trial.suggest_float(
                name, config["low"], config["high"], log=config.get("log", False)
            )
        elif param_type == "int":
            params[name] = trial.suggest_int(name, config["low"], config["high"])
        elif param_type == "categorical":
            params[name] = trial.suggest_categorical(name, config["choices"])
        else:
            raise ValueError(
                f"Unsupported search-space type {param_type!r} for hyperparameter "
                f"{name!r}; expected 'float', 'int' or 'categorical'."
            )

    return params

## CANDIDATE_WEIGHTS

In [None]:
# Reward-weight configurations compared in the grid search. Each entry has a
# human-readable "name" and the "weights" applied to the rule-compliance,
# meaning (semantic similarity) and grammar scores.
CANDIDATE_WEIGHTS = [
    # Baseline configuration.
    dict(
        name="balanced",
        weights=dict(rules_score=0.5, meaning_score=0.25, grammar_score=0.25),
    ),
    dict(
        name="rules_heavy",
        weights=dict(rules_score=0.7, meaning_score=0.2, grammar_score=0.1),
    ),
    dict(
        name="meaning_heavy",
        weights=dict(rules_score=0.2, meaning_score=0.7, grammar_score=0.1),
    ),
    dict(
        name="grammar_focused",
        weights=dict(rules_score=0.2, meaning_score=0.1, grammar_score=0.7),
    ),
]

## Data Load & Preprocessing

In [None]:
# Predefine the preprocessing function
def preprocess_function(examples, tokenizer):
    """Tokenize the 'simplified' column of a batch.

    The simplified sentence is the text the reward model scores, so it is the
    only column passed to the tokenizer. Sequences are truncated and padded to
    a fixed length of 128 tokens.

    Args:
        examples: Mapping with a "simplified" entry holding the text(s).
        tokenizer: Callable tokenizer applied to the texts.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    simplified_texts = examples["simplified"]
    return tokenizer(simplified_texts, **tokenizer_options)

In [None]:
# --- 1. Load, Split, and Tokenize Data ONCE ---
# The data is loaded and tokenized a single time here so the grid search
# further down can reuse it for every weight configuration.
print("--- Loading and tokenizing data once ---")
# Read the CSV; its first column is used as the DataFrame index.
df = pd.read_csv("data/ordered_simplifications_with_rules.csv", index_col=0)
# Print a summary of the DataFrame as a quick sanity check.
df.info()

In [None]:
# Preview the first rows to verify that the file was parsed as expected.
df.head()

In [None]:
# Shorten the column names used throughout the notebook. Reassigning (instead
# of renaming in place) keeps the cell safe to re-run.
df = df.rename(
    columns={
        "original_sentence": "original",
        "final_simplification": "simplified",
    }
)

In [None]:
#use for Smaller Dataset
#df = df.sample(n=50, random_state=42) #reduce size for testing

In [None]:
# Convert the full pandas DataFrame into a `Dataset` object.
full_dataset = Dataset.from_pandas(df)
# Hold out 15% of the rows as the test split; the fixed seed keeps the split
# reproducible across runs.
split_dataset = full_dataset.train_test_split(test_size=0.15, seed=42)

In [None]:
# Tokenizer matching the reward model's base checkpoint.
tokenizer_rm = AutoTokenizer.from_pretrained("distilbert-base-german-cased")

# Tokenize both splits with the same settings. No labels are attached yet, and
# the original text columns are kept because the rewards are computed from
# them later on.
tokenized_train_base, tokenized_test_base = (
    split_dataset[split_name].map(
        lambda batch: preprocess_function(batch, tokenizer_rm),
        batched=True,
    )
    for split_name in ("train", "test")
)
print("--- Data tokenized successfully. Starting grid search... ---")

## Eval Custom Metrics for regression RM
- The regression reward model (RM) is trained with an MSE loss.
- The following metrics are also logged:
  - MSE (Mean Squared Error) → matches your training loss, so you can track consistency.
  - MAE (Mean Absolute Error) → more interpretable (average absolute difference).
  - R² (Coefficient of Determination) → tells you how well your model explains variance (1 = perfect, 0 = baseline).

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def compute_metrics(eval_pred):
    """Compute regression metrics for the reward model.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may also be a tuple
            of arrays, in which case the first element is used.

    Returns:
        dict: ``{"mse": ..., "mae": ..., "r2": ...}`` as plain Python floats.
    """
    logits, labels = eval_pred
    # Some models return several outputs; the predictions come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Flatten with reshape(-1) rather than squeeze(): squeeze() collapses a
    # one-example evaluation to a 0-d array, which the metric functions reject.
    preds = np.asarray(logits).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    # Cast to built-in floats so the values log and serialise cleanly.
    mse = float(mean_squared_error(labels, preds))
    mae = float(mean_absolute_error(labels, preds))
    r2 = float(r2_score(labels, preds))

    return {"mse": mse, "mae": mae, "r2": r2}

## Redirecting Standard Output 

In [None]:
#Implementing a fix to remove unnecessary output originating from compound-splitter library

import os
import sys
from contextlib import contextmanager, redirect_stderr, redirect_stdout

@contextmanager
def suppress_stdout_stderr():
    """Temporarily send Python-level stdout and stderr to ``os.devnull``.

    While the context is active, ``sys.stdout`` and ``sys.stderr`` are replaced
    by a handle on the null device. Both streams are restored and the handle is
    closed on exit, including when the body raises.

    Note: only output written through ``sys.stdout`` / ``sys.stderr`` is
    silenced. Output written straight to the operating-system file descriptors
    (for example by native code) is not affected.
    """
    # The standard-library redirect helpers save and restore the previous
    # streams themselves, so no manual try/finally bookkeeping is needed.
    with open(os.devnull, 'w') as fnull, redirect_stdout(fnull), redirect_stderr(fnull):
        yield

## Helper Functions for Reward Calculation

In [None]:
# ==============================================================================
#  HELPER FUNCTIONS FOR REWARD CALCULATION
# ==============================================================================

def compute_stable_parts_batched(examples):
    """
    Calculates the rule-compliance and grammar scores for a batch of sentences.

    Written as a batched target function for `datasets.map()`: it receives a
    dictionary of column lists and returns the new columns to add.

    The batch is sent through the spaCy pipeline exactly once. The previous
    version created two `nlp.pipe()` generators and therefore parsed every
    sentence twice, although the second pass only read `doc.text`.

    Args:
        examples (dict): A dictionary from `datasets.map` where each key
                         (e.g., 'simplified') maps to a list of values.

    Returns:
        dict: `{"rules_score": [...], "grammar_score": [...]}` with one entry
              per input sentence, in input order.
    """
    # Extract the list of sentences for this batch.
    simplified_batch = examples['simplified']

    rules_scores = []
    grammar_scores = []
    # nlp.pipe() processes the texts as a stream; each parsed document feeds
    # both scorers, so a single pass over the batch is enough.
    for doc in nlp.pipe(simplified_batch):
        rules_scores.append(rule_compliance_score(doc))
        grammar_scores.append(calculate_grammar_score(doc.text))

    # The returned dictionary's keys become the new column names in the dataset.
    return {"rules_score": rules_scores, "grammar_score": grammar_scores}


def calculate_rewards_for_split(dataset_split, weights, num_proc, desc_prefix=""):
    """
    Orchestrates the full reward calculation for a given dataset split.

    - Stage 1: Computes the rule and grammar scores via `dataset_split.map`
      with `num_proc` worker processes.
    - Stage 2: Computes the meaning score sequentially, one example at a time.
    - Stage 3: Combines the three scores using the provided weights.

    Args:
        dataset_split (Dataset): The dataset slice (e.g., train or test) to process.
            Its rows must provide the 'original' and 'simplified' fields.
        weights (dict): Weights for the current grid search configuration, with
            the keys "rules_score", "grammar_score" and "meaning_score".
        num_proc (int): The number of processes to use for Stage 1.
        desc_prefix (str): A string (like "train" or "test") to label the progress bars.

    Returns:
        list: The final, weighted reward score for every example, in dataset order.

    Raises:
        KeyError: If `weights` lacks one of the three required keys. This is
            checked up front so a misconfigured run fails before the expensive
            score computation instead of after it.
    """
    # Fail fast on incomplete weight configurations.
    missing = [
        key for key in ("rules_score", "grammar_score", "meaning_score")
        if key not in weights
    ]
    if missing:
        raise KeyError(f"Missing reward weight(s): {missing}")

    # --- STAGE 1: Parallel calculation of stable scores ---
    # Python-level stdout/stderr is redirected while the map runs, to keep the
    # noisy `german-compound-splitter` library from flooding the output.
    with suppress_stdout_stderr():
        stable_scores_ds = dataset_split.map(
            compute_stable_parts_batched,
            batched=True,
            batch_size=100,
            num_proc=num_proc,
            desc=f"Calculating stable scores ({desc_prefix})"
        )

    # --- STAGE 2: Sequential calculation of the SBERT meaning score ---
    # This part is run on a single core because the SentenceTransformer model
    # was found to be incompatible with multiprocessing, causing crashes.
    meaning_scores = [
        calculate_semantic_similarity(ex['original'], ex['simplified'])
        for ex in tqdm(dataset_split, desc=f"Calculating meaning scores ({desc_prefix})")
    ]

    # --- STAGE 3: Combine all calculated scores ---
    # Read each score column once instead of indexing the dataset row by row
    # (which fetched every row twice just to extract one value each time).
    rules_scores = stable_scores_ds["rules_score"]
    grammar_scores = stable_scores_ds["grammar_score"]

    w_rules = weights["rules_score"]
    w_grammar = weights["grammar_score"]
    w_meaning = weights["meaning_score"]

    # NOTE(review): the three component scores do not depend on `weights`;
    # callers that evaluate several weight sets on the same split recompute
    # them on every call.
    return [
        w_rules * r_s + w_grammar * g_s + w_meaning * m_s
        for r_s, g_s, m_s in zip(rules_scores, grammar_scores, meaning_scores)
    ]

## Helper Functions for Training Pipeline 

In [None]:

# ==============================================================================
#  HELPER FUNCTIONS FOR THE TRAINING PIPELINE
# ==============================================================================

def prepare_datasets_for_run(tokenized_train_base, tokenized_test_base, train_rewards, test_rewards):
    """Attach reward labels to both tokenized splits and switch them to torch format.

    Args:
        tokenized_train_base: Tokenized training split without labels.
        tokenized_test_base: Tokenized test split without labels.
        train_rewards: Reward values for the training split.
        test_rewards: Reward values for the test split.

    Returns:
        tuple: ``(train_dataset, test_dataset)``, each with a "labels" column
        and restricted to the columns "input_ids", "attention_mask" and "labels".
    """
    model_columns = ("input_ids", "attention_mask", "labels")
    prepared = []
    for base_dataset, rewards in (
        (tokenized_train_base, train_rewards),
        (tokenized_test_base, test_rewards),
    ):
        # The rewards become the regression target of this training run.
        labelled = base_dataset.add_column("labels", rewards)
        # Expose only the model inputs and the target, as PyTorch tensors.
        labelled.set_format(type="torch", columns=list(model_columns))
        prepared.append(labelled)

    train_dataset, test_dataset = prepared
    return train_dataset, test_dataset


def run_hyperparameter_search(trainer, config_name):
    """Runs the Optuna hyperparameter search for a trainer and returns the best run.

    Args:
        trainer: Trainer whose `hyperparameter_search` method is called.
        config_name (str): Name of the weight configuration, used in log output.

    Returns:
        The best run as returned by `trainer.hyperparameter_search`.
    """
    print(f"--- Running hyperparameter search for {config_name} ---")

    def _eval_mse_objective(metrics):
        """Return the evaluation MSE as the single value to minimise."""
        # Without an explicit objective the Trainer sums every custom metric
        # (here mse + mae + r2), so minimising it would reward a LOW R².
        # Fall back to the loss if no "eval_mse" metric was reported.
        if "eval_mse" in metrics:
            return metrics["eval_mse"]
        return metrics["eval_loss"]

    best_run = trainer.hyperparameter_search(
        direction="minimize",                   # Lower MSE is better.
        backend="optuna",                       # model_hp_space uses the Optuna trial API.
        hp_space=model_hp_space,                # The function that builds the search space.
        n_trials=N_TRIALS,                      # Number of trials, set in the configuration cell.
        compute_objective=_eval_mse_objective,  # Optimise MSE only (see above).
    )
    print(f"Best run for {config_name}: {best_run}")
    return best_run


def train_and_save_final_model(trainer, best_run, output_dir_base, config_name):
    """Retrain with the best hyperparameters and save the resulting model.

    Args:
        trainer: Trainer to reconfigure, train and save.
        best_run: Search result whose ``hyperparameters`` mapping is applied.
        output_dir_base: Base path; the model is saved to ``<base>_final``.
        config_name: Name of the weight configuration, used in log output.
    """
    print(f"--- Training final model for {config_name} with best hyperparameters ---")

    # Copy every tuned value onto the trainer's arguments before retraining.
    tuned = best_run.hyperparameters
    for param_name in tuned:
        setattr(trainer.args, param_name, tuned[param_name])

    trainer.train()

    # Save under a dedicated "_final" directory next to the run's output dir.
    final_output_dir = "{}_final".format(output_dir_base)
    trainer.save_model(final_output_dir)
    print(f"--- Saved final optimized model to {final_output_dir} ---\n")

"""
Reusable Summary Function:  It takes a single result dictionary and prints it in a clean format.
"""
def print_run_summary(result):
    """Print a formatted summary of one training run.

    Args:
        result (dict): Needs the keys 'config_name', 'best_hyperparameters'
            (a mapping) and 'final_metrics' (a mapping of metric name to a
            numeric value).
    """
    print(f"\n--- Summary for Configuration: {result['config_name']} ---")

    print("  Best Hyperparameters Found:")
    hyperparameters = result['best_hyperparameters']
    for param in hyperparameters:
        print(f"    - {param}: {hyperparameters[param]}")

    print("  Final Evaluation Metrics:")
    # Only metrics whose name contains 'eval_' are shown, with four decimals.
    evaluation_metrics = (
        (metric, value)
        for metric, value in result['final_metrics'].items()
        if 'eval_' in metric
    )
    for metric, value in evaluation_metrics:
        print(f"    - {metric}: {value:.4f}")
    print(50 * "-")


## Training Loop with Grid Search for all 4 CANDIDATE_WEIGHTS

In [None]:
# ==============================================================================
#  MAIN GRID SEARCH AND TRAINING LOOP 
# ==============================================================================

# --- Initialize a list to store results ---
all_results = []

# Number of worker processes for the parallel reward stage. The value does not
# change between configurations, so it is set once, and it is capped at the
# machine's CPU count so smaller machines are not oversubscribed.
num_cores_to_use = min(12, os.cpu_count() or 1)

# NOTE(review): the rule/grammar/meaning component scores do not depend on the
# weights, yet calculate_rewards_for_split() recomputes them for every
# configuration. Caching the components would shorten the grid search.
# NOTE(review): the "test" split is used for the hyperparameter search, for
# best-checkpoint selection and for the final metrics, so the reported numbers
# are optimistic estimates.

# For each set of candidate weights: compute the rewards, tune the
# hyperparameters, train the final reward model and record its metrics.
for config in CANDIDATE_WEIGHTS:
    # --- a. Setup for the current iteration ---
    config_name = config["name"]
    weights = config["weights"]
    output_dir_base = f"rm_out_{config_name}"

    print(f"\n--- Processing configuration: {config_name} ---")
    print(f"Weights: {weights}")

    # --- Reward calculation ---
    # Rewards are computed on the untokenized splits for this weight set.
    print("--- Calculating rewards for the TRAINING set ---")
    train_rewards = calculate_rewards_for_split(
        split_dataset["train"],
        weights,
        num_cores_to_use,
        desc_prefix="train"
    )

    print("--- Calculating rewards for the TEST set ---")
    test_rewards = calculate_rewards_for_split(
        split_dataset["test"],
        weights,
        num_cores_to_use,
        desc_prefix="test"
    )

    print(f"--- Finished calculating rewards for {config_name} ---")

    # --- b. Dataset preparation ---
    # Merge the pre-tokenized text with the rewards just calculated; the
    # rewards become the regression labels of this run.
    train_ds, test_ds = prepare_datasets_for_run(
        tokenized_train_base, tokenized_test_base, train_rewards, test_rewards
    )

    # --- c. Trainer setup ---
    # `TrainingArguments` holds the settings for training and evaluation.
    training_args = TrainingArguments(
        output_dir=output_dir_base,      # Directory to save model checkpoints and outputs.
        # Only request the Apple Silicon GPU when the device check at the top
        # of the notebook selected it; a hard-coded True contradicted the CPU
        # fallback chosen there.
        use_mps_device=(device == "mps"),
        evaluation_strategy="epoch",     # Run an evaluation on the test set at the end of each training epoch.
        save_strategy="epoch",           # Save a model checkpoint at the end of each epoch.
        load_best_model_at_end=True,     # After training, automatically load the checkpoint with the best performance.
        metric_for_best_model="mse",     # Use Mean Squared Error (MSE) as the key metric to determine the "best" model.
        greater_is_better=False,         # A lower value of `metric_for_best_model` (MSE) is better.
        logging_strategy="epoch",        # Log training metrics once per epoch, which is less noisy than per step.
        dataloader_pin_memory=False,     # Not applicable on M-series Macs (unified memory); also silences the warning.
        save_total_limit=1,              # Only keep the single best checkpoint on disk.
    )

    # The `Trainer` brings together the model, the datasets and the training
    # configuration.
    trainer = Trainer(
        # `model_init` returns a fresh model, so every hyperparameter trial
        # starts from the same initial state.
        model_init=model_init,
        args=training_args,                 # The training configuration defined above.
        train_dataset=train_ds,             # The prepared training dataset.
        eval_dataset=test_ds,               # The prepared test dataset.
        compute_metrics=compute_metrics,    # Calculates MSE, MAE and R².
        tokenizer=tokenizer_rm,             # The tokenizer, used for data collation.
    )

    # --- d. Hyperparameter search and final training ---
    # Find the best hyperparameters for this reward configuration, then train
    # the final model with them.
    best_run = run_hyperparameter_search(trainer, config_name)
    train_and_save_final_model(trainer, best_run, output_dir_base, config_name)

    # --- e. Evaluate the final model and store the results ---
    print(f"--- Evaluating final model for {config_name} ---")
    final_metrics = trainer.evaluate()

    # Keep the results so that a summary of all configurations can be printed
    # after the loop has finished.
    current_result = {
        "config_name": config_name,
        "best_hyperparameters": best_run.hyperparameters,
        "final_metrics": final_metrics
    }
    all_results.append(current_result)
    print_run_summary(current_result)

In [None]:
# ==============================================================================
#  FINAL SUMMARY OF ALL RUNS (NOW USING THE HELPER FUNCTION)
# ==============================================================================

# Banner announcing the overall summary.
for banner_line in (
    "\n\n=================================================",
    "           GRID SEARCH FINAL SUMMARY",
    "=================================================\n",
):
    print(banner_line)

# Print every stored run with the same helper that is used inside the loop.
for result in all_results:
    print_run_summary(result)

## Model_rm has been trained with the whole dataset now