# Evaluation

Start by setting the variables for the evaluation.

In [None]:
# Path to the evaluation split (CSV) and the fraction of it that is sampled
# for this evaluation run.
EVAL_DATA = "sft_split_dataset/eval.csv"
EVAL_DATA_SUBSET = 1 # 1 for the whole DataFrame, 0.1 for only 10%

# Check these settings before running:
# 1) Check the query prompt as well (SIMPLE vs. DETAILED query);
#    cmd+f: eval_df['query']
# 2) Adjust the run label for the evaluation output; it becomes part of the
#    file name of the result CSV.
SAVE_EVAL_SPECS = "PPO_model_2K_4E_simple_q_28"
# 3) Check which model is to be evaluated; this value is used as the path of
#    the saved PPO model.
PPO_TO_EVALUATE = "PPO_model_2K_4E_simple_q_28"


# Available PPO runs:
# 1A "PPO_model_2K_4E_simple_q_28" - Simple Query
# 1B "PPO_model_2K_4E_28" - Detailed Query
# 2A "PPO_model_2K_4E_SFT" -- KL=0.05
# 2B "PPO_model_2K_4E_SFT_KL_variation" -- KL=0.2

### Step 1: Setup and Loading the Final Model
The most important step is loading the model correctly: because training was done with LoRA, the original base model must be loaded first, and the saved adapter weights are then applied on top of it.

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from trl import AutoModelForCausalLMWithValueHead
from tqdm.auto import tqdm

# --- Configure Hardware Device
# Prefer a CUDA GPU, then Apple-Silicon MPS, and fall back to the CPU, so the
# notebook uses an accelerator on non-Mac machines as well.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

In [None]:
### DEFINE PATH AND LOAD REQUIRED COMPONENTS
# Local import: only this cell needs the plain causal-LM class.
from transformers import AutoModelForCausalLM

# The original base model the PPO model was fine-tuned from.
BASE_MODEL_ID = "frhew/sigdial_ft_a2"
# The path to the saved PPO model, which contains the LoRA adapter weights.
PPO_MODEL_PATH = PPO_TO_EVALUATE
# The path to the custom-trained reward model.
RM_PATH = "rm_out_rules_heavy_final"

# --- Load Base Model
# Load the original base model in bfloat16 for memory efficiency. A plain
# causal LM is used instead of the TRL value-head wrapper: evaluation only
# generates text, and PEFT resolves the saved adapter keys against the model it
# wraps, so an extra wrapper layer can prevent the LoRA weights from matching
# (which would silently evaluate the un-tuned base model).
# NOTE(review): assumes PPO_MODEL_PATH holds a standard PEFT adapter
# (adapter_config.json + adapter weights) — confirm against the training run.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
)

# --- Load the PEFT Model (Policy Model)
# Apply the LoRA adapters from the saved directory to the base model.
policy_model = PeftModel.from_pretrained(base_model, PPO_MODEL_PATH)
# Merge the adapter weights into the base model and unload the PEFT wrapper.
# This yields a standard, standalone model that is simpler to use for inference.
policy_model = policy_model.merge_and_unload()

# Move the final model to the selected device and switch to evaluation mode
# (disables dropout and other training-only behaviour).
policy_model.to(device)
policy_model.eval()
print("\nSuccessfully loaded PPO-tuned policy model.")

In [None]:
# --- Load Tokenizers and Reward Model ---
# Both tokenizers come from the directories of their respective models.
policy_tokenizer = AutoTokenizer.from_pretrained(PPO_MODEL_PATH)
reward_tokenizer = AutoTokenizer.from_pretrained(RM_PATH)
# The reward model is placed on the target device and put into eval mode
# right after loading.
reward_model = AutoModelForSequenceClassification.from_pretrained(RM_PATH).to(device).eval()
print("Successfully loaded tokenizers and reward model.")

# --- Load Evaluation Data
# Read the held-out split that was written during training.
eval_df = pd.read_csv(EVAL_DATA)

# Prefix every source sentence with the instruction.
# Keep exactly ONE of the two assignments below active.
#### Simple Query
eval_df['query'] = "Vereinfache diesen Satz: " + eval_df['original']

#### Instruction specific query
#eval_df['query'] = "Vereinfache den folgenden Satz nach Leichter-Sprache-Regeln: " + eval_df['original']

# --- Randomly sample X% of the data (fixed seed keeps the subset reproducible)
eval_df = eval_df.sample(frac=EVAL_DATA_SUBSET, random_state=42)
num_examples = len(eval_df)
print(f"\nLoaded {num_examples} examples for evaluation.")

### Step 2: Generate and Evaluate Responses

- Generate new simplifications for the evaluation prompts and score them with the trained reward model (RM).

In [None]:
# --- Define Helper Function to Score Responses
def compute_rewards_from_rm(responses: list[str], batch_size: int = 16) -> torch.Tensor:
    """Score responses with the reward model, processing them in mini-batches.

    Scoring everything in a single forward pass makes memory usage grow with
    the size of the evaluation set; chunking keeps it bounded by ``batch_size``.

    Args:
        responses: Texts to score.
        batch_size: Maximum number of texts per reward-model forward pass.

    Returns:
        The reward-model logits with the trailing dimension squeezed, one entry
        per response and in input order. An empty input yields an empty tensor.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if not responses:
        # Nothing to score: skip the tokenizer and model entirely.
        return torch.empty(0)

    chunk_rewards = []
    with torch.no_grad():
        for start in range(0, len(responses), batch_size):
            chunk = responses[start:start + batch_size]
            inputs = reward_tokenizer(
                chunk, return_tensors="pt", padding=True, truncation=True
            ).to(device)
            chunk_rewards.append(reward_model(**inputs).logits.squeeze(-1))
    # Concatenate along the batch dimension; results stay on the model's device.
    return torch.cat(chunk_rewards)

# --- Generation Settings
# Responses are sampled (do_sample=True) and limited to 40 new tokens; the
# tokenizer's EOS token id is reused as the padding id during generation.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=policy_tokenizer.eos_token_id,
    max_new_tokens=40,
    repetition_penalty=1.0,
)

In [None]:
# --- Generate Responses for the Evaluation Set
# Proceeding in batches to avoid potential memory issues.
batch_size = 16
ppo_responses = []

# Batched generation with a causal LM needs LEFT padding: with right padding,
# shorter prompts would be continued after their pad tokens, and slicing off
# the first `input_ids.shape[1]` positions would no longer isolate the newly
# generated text.
policy_tokenizer.padding_side = "left"
# `padding=True` requires a pad token; fall back to EOS, which is also the
# pad_token_id handed to `generate`.
if policy_tokenizer.pad_token is None:
    policy_tokenizer.pad_token = policy_tokenizer.eos_token

print("\nGenerating responses with the PPO-tuned model...")
for start in tqdm(range(0, len(eval_df), batch_size)):
    # Positional slicing: eval_df was shuffled by `sample`, so its index labels
    # are not consecutive.
    prompts = eval_df["query"].iloc[start:start + batch_size].tolist()

    # Tokenize prompts and move them to the correct device
    inputs = policy_tokenizer(prompts, return_tensors="pt", padding=True).to(device)

    # Generate responses without tracking gradients
    with torch.no_grad():
        response_tensors = policy_model.generate(**inputs, **generation_kwargs)

    # Decode only the newly generated part: everything after the (padded) prompt
    prompt_len = inputs["input_ids"].shape[1]
    decoded_responses = policy_tokenizer.batch_decode(
        response_tensors[:, prompt_len:], skip_special_tokens=True
    )
    ppo_responses.extend(decoded_responses)


In [None]:
# Add the new responses and their reward scores to the DataFrame.
# `.float()` guards the NumPy conversion, which does not support bfloat16.
eval_df["ppo_response"] = ppo_responses
eval_df["ppo_reward_score"] = compute_rewards_from_rm(ppo_responses).float().cpu().numpy()

# Save the DataFrame with the new responses and scores to a CSV file
eval_df.to_csv(f"ppo_evaluation_result_{SAVE_EVAL_SPECS}.csv", index=False)

# --- Print Final Results
print("\n--- Evaluation Summary ---")
# Quantitative metric: the average reward score across all generated responses
# in the evaluation set.
print(f"Average Reward Score for PPO-tuned model: {eval_df['ppo_reward_score'].mean():.4f}")

# Qualitative evaluation: print a random sample of side-by-side comparisons of
# the query, the baseline simplification, and the new PPO response.
# The sample size is capped at the number of available rows, because
# `DataFrame.sample` raises a ValueError when n exceeds the population.
print("\n--- Side-by-Side Comparison (Sample) ---")
sample_size = min(50, len(eval_df))
for _, row in eval_df.sample(n=sample_size, random_state=42).iterrows():
    print(f"Query (Original): {row['query']}")
    # The baseline lives in the 'simplified' column (it was not renamed to
    # 'final_simplification').
    print(f"Baseline (V1):    {row['simplified']}")
    print(f"PPO Response:     {row['ppo_response']}")
    print(f"PPO Reward Score: {row['ppo_reward_score']:.4f}")
    print("-" * 50)