# Evaluation

In [1]:
#EVAL_DATA = "eval_df_23Sep.csv"
# CSV file holding the evaluation examples (previous snapshot kept above for reference).
EVAL_DATA = "eval_df_24Sep.csv"
# Fraction of the rows in EVAL_DATA that is randomly sampled for evaluation
# (passed as `frac` to DataFrame.sample); 1 evaluates the whole file.
EVAL_DATA_SUBSET = 0.2

### Step 1: Setup and Loading the Final Model
#### The most important step is loading the model correctly. Because you trained with LoRA, you need to first load the original base model and then apply your saved adapter weights on top of it.

In [2]:
# ==============================================================================
#  1. IMPORTS & SETUP
# ==============================================================================
import torch
import pandas as pd
from datasets import Dataset
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import AutoModelForCausalLMWithValueHead

# --- Configure Hardware Device ---
# Pick the best available accelerator: a CUDA GPU first, then Apple-Silicon MPS,
# and finally the CPU, so the notebook runs unchanged on any machine.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


W0925 09:31:02.230000 13304 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Using device: mps


In [3]:
# ==============================================================================
#  2. DEFINE PATHS & LOAD COMPONENTS
# ==============================================================================

# --- Define Paths ---
# Identifier of the original base model the PPO model was fine-tuned from.
BASE_MODEL_ID = "frhew/sigdial_ft_a2"
# Directory of the saved PPO model; it is used both as the LoRA adapter source
# and as the source of the policy tokenizer.
PPO_MODEL_PATH = "my_ppo_tuned_model_V24.09"
# Directory of the custom-trained reward model (and its tokenizer).
RM_PATH = "rm_out_rules_heavy_final"

# --- Load Base Model ---
# First, load the original, pre-trained base model in bfloat16 for memory efficiency.
# NOTE(review): the base model is loaded through TRL's value-head wrapper rather
# than a plain causal-LM class — confirm that applying and merging the adapter
# on the wrapper behaves as intended.
base_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
)

# --- Load the PEFT Model (Policy Model) ---
# Now, load the LoRA adapters from the saved directory and apply them to the base model.
policy_model = PeftModel.from_pretrained(base_model, PPO_MODEL_PATH)
# Merge the adapter weights into the base model and unload the PEFT wrapper,
# so that inference runs on a single standalone model.
policy_model = policy_model.merge_and_unload()

# Move the final model to the selected device (`device` may be an accelerator
# or the CPU) and switch it to evaluation mode.
policy_model.to(device)
policy_model.eval()
print(f"\nSuccessfully loaded PPO-tuned policy model.")

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]




Successfully loaded PPO-tuned policy model.


In [4]:
# --- Load Tokenizers and Reward Model ---
policy_tokenizer = AutoTokenizer.from_pretrained(PPO_MODEL_PATH)
# Prompts are generated in padded batches with a causal LM. Padding must sit on
# the LEFT so every prompt ends directly before the first generated token;
# right padding would make the model continue after pad tokens.
policy_tokenizer.padding_side = "left"
# `padding=True` requires a pad token. Fall back to EOS when the tokenizer has
# none, which matches the `pad_token_id` handed to `generate`.
if policy_tokenizer.pad_token is None:
    policy_tokenizer.pad_token = policy_tokenizer.eos_token

# The reward model and its own tokenizer are loaded from the same directory.
reward_model = AutoModelForSequenceClassification.from_pretrained(RM_PATH)
reward_tokenizer = AutoTokenizer.from_pretrained(RM_PATH)
reward_model.to(device)
reward_model.eval()
print("Successfully loaded tokenizers and reward model.")

# --- Load Evaluation Data ---
# Read the saved test set, prepend the simplification instruction to every
# query, and keep a random subset of the rows:
#   * `frac=EVAL_DATA_SUBSET` is the fraction of rows that is kept,
#   * `random_state=42` makes the draw return the same rows on every run.
eval_df = (
    pd.read_csv(EVAL_DATA)
    .assign(query=lambda frame: "Vereinfache diesen Satz: " + frame["query"])
    .sample(frac=EVAL_DATA_SUBSET, random_state=42)
)
print(f"\nLoaded {len(eval_df)} examples for evaluation.")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Successfully loaded tokenizers and reward model.

Loaded 333 examples for evaluation.


### Step 2: Generate and Evaluate Responses
#### Now that everything is loaded, you can generate new simplifications for your evaluation prompts and score them with your reward model.

In [5]:
# ==============================================================================
#  3. GENERATE & EVALUATE RESPONSES
# ==============================================================================
from tqdm.auto import tqdm

# --- Define Helper Function to Score Responses ---
def compute_rewards_from_rm(responses: list[str], batch_size: int = 16) -> torch.Tensor:
    """Score texts with the reward model.

    The texts are tokenized with `reward_tokenizer` (padded and truncated) and
    fed to `reward_model` in chunks of `batch_size`, so memory use stays bounded
    however many texts are passed in.

    Args:
        responses: Texts to score.
        batch_size: Maximum number of texts per forward pass.

    Returns:
        The reward model's logits with a trailing singleton dimension squeezed
        away, concatenated over all chunks in input order. The tensor lives on
        `device`; for empty input an empty CPU tensor is returned.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # Nothing to score: return early instead of calling the tokenizer on an empty batch.
    if len(responses) == 0:
        return torch.empty(0)

    chunk_rewards = []
    with torch.no_grad():
        for start in range(0, len(responses), batch_size):
            chunk = responses[start:start + batch_size]
            inputs = reward_tokenizer(chunk, return_tensors="pt", padding=True, truncation=True).to(device)
            chunk_rewards.append(reward_model(**inputs).logits.squeeze(-1))
    return torch.cat(chunk_rewards)

# --- Generation Settings ---
generation_kwargs = {
    # Non-positive value — presumably means "no minimum length" (TODO confirm).
    "min_length": -1,
    # `top_k` is an integer parameter; 0 switches top-k filtering off.
    "top_k": 0,
    # 1.0 keeps the full distribution (no nucleus filtering).
    "top_p": 1.0,
    # Sample from the distribution instead of decoding greedily.
    "do_sample": True,
    # Pad with the EOS token during generation.
    "pad_token_id": policy_tokenizer.eos_token_id,
    # Upper bound on the number of generated tokens per prompt.
    "max_new_tokens": 60,
    # 1.0 applies no repetition penalty.
    "repetition_penalty": 1.0,
}

In [6]:

# --- Generate Responses for the Evaluation Set ---
# We'll process in batches to avoid memory issues with very large eval sets.
batch_size = 16
ppo_responses = []

# `generation_kwargs` enables sampling, so seed the generator first; otherwise
# every run of this cell produces different responses and reward scores.
torch.manual_seed(42)

print("\nGenerating responses with the PPO-tuned model...")
for i in tqdm(range(0, len(eval_df), batch_size)):
    # Explicit positional slice: eval_df keeps its shuffled original index.
    batch = eval_df.iloc[i:i+batch_size]
    prompts = batch["query"].tolist()

    # Tokenize prompts and move to the correct device
    inputs = policy_tokenizer(prompts, return_tensors="pt", padding=True).to(device)

    # Generate responses
    with torch.no_grad():
        response_tensors = policy_model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated part of the response: everything after
    # the (padded) prompt length.
    prompt_length = inputs["input_ids"].shape[1]
    decoded_responses = policy_tokenizer.batch_decode(response_tensors[:, prompt_length:], skip_special_tokens=True)
    ppo_responses.extend(decoded_responses)



Generating responses with the PPO-tuned model...


  0%|          | 0/21 [00:00<?, ?it/s]

In [7]:

# Add the new responses and their reward scores to the DataFrame
eval_df["ppo_response"] = ppo_responses
# Cast to float32 before converting: NumPy cannot represent bfloat16 tensors,
# so this keeps the conversion working if the reward model runs in reduced precision.
eval_df["ppo_reward_score"] = compute_rewards_from_rm(ppo_responses).float().cpu().numpy()

# Save the DataFrame with the new responses and scores to a CSV file
eval_df.to_csv("ppo_evaluation_results.csv", index=False)

# --- Print Final Results ---
print("\n--- Evaluation Summary ---")
# Quantitative metric: the average reward score across all generated responses.
print(f"Average Reward Score for PPO-tuned model: {eval_df['ppo_reward_score'].mean():.4f}")

# Qualitative evaluation: print a random sample of side-by-side comparisons of
# the original query, the baseline simplification and the new PPO response, so
# the quality and style of the generated text can be judged manually.
# Never ask for more rows than exist; `sample` raises if n exceeds the frame length.
n_preview = min(10, len(eval_df))
print("\n--- Side-by-Side Comparison (Sample) ---")
for index, row in eval_df.sample(n=n_preview, random_state=42).iterrows():
    print(f"Query (Original): {row['query']}")
    print(f"Baseline (V1):    {row['final_simplification']}")
    print(f"PPO Response:     {row['ppo_response']}")
    print(f"PPO Reward Score: {row['ppo_reward_score']:.4f}")
    print("-" * 50)


--- Evaluation Summary ---
Average Reward Score for PPO-tuned model: 0.8676

--- Side-by-Side Comparison (Sample) ---
Query (Original): Vereinfache diesen Satz: So nennt man den Trainer von der deutschen Fußball·National·Mannschaft.
Baseline (V1):    So nennt man den Trainer von der deutschen Fußball·National·Mannschaft.
PPO Response:      So nennt man den Trainer von der deutschen Fußball-National-Mannschaft. So nennt man den Trainer von der deutschen Fußball-Nationalmannschaft. Der Trainer von der deutschen Fußball-Nationalmannschaft. So nennt man den Trainer der deutschen Fußball-National
PPO Reward Score: 0.8791
--------------------------------------------------
Query (Original): Vereinfache diesen Satz: Dabei erinnerte man an den so genannten D-Day vor 75 Jahren.
Baseline (V1):    man hat Dabei an den so genannten D-Day vor 75 Jahren erinnert.
PPO Response:      Der 6. Juni 1944 war der Tag, an dem die Alliierten in der Normandie als Alliierte in der Normandie. Der 6. Juni 1944 w