In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json, ast, re
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from final_eval_calc_helper import *

In [None]:
from ipynb.fs.full.reward_computation import rule_compliance_score, calculate_semantic_similarity, calculate_grammar_score, compute_reward

In [None]:
# Directory that holds the evaluation input logs and the result files written by this notebook.
# NOTE(review): the cells below hard-code the same "eval_final_results/" prefix instead of
# using this constant — consider building paths from it so the location is defined once.
EVAL_OUTPUT_DIR = "eval_final_results/"

## Load files and prepare dfs before applying calculation

In [None]:
# Load the raw PPO evaluation logs, one CSV per experiment variant.
# NOTE(review): the variable suffixes and the file prefixes are crossed for experiment 1 —
# df_ex1a reads the "1b_..." file and df_ex1b reads the "1a_..." file. Confirm this is
# intentional; if it is not, every downstream "exp1a"/"exp1b" (and "1A"/"1B") label is swapped.
df_ex1a = pd.read_csv("eval_final_results/1b_ppo_evaluation_result_PPO_model_2K_4E_simple_q_28.csv")
df_ex1b = pd.read_csv("eval_final_results/1a_ppo_evaluation_result_PPO_model_2K_4E_28.csv")

df_ex2a = pd.read_csv("eval_final_results/2a_ppo_evaluation_result_PPO_model_2K_4E_SFT_KL_0.05.csv")
df_ex2b = pd.read_csv("eval_final_results/2b_ppo_evaluation_result_2K_4E_SFT_KL_high.csv")

# Overall Analysis - This step calculates the defined metrics based on the log

In [None]:
# 1) Define reward weights. Semantic similarity carries the largest weight (0.5),
#    followed by rule compliance (0.3) and grammar (0.2); the three weights sum to 1.0.
REWARD_WEIGHTS = {
    "rule_score": 0.3,
    "semantic_score": 0.5,
    "grammar_score": 0.2,
}


# 2) Define evaluation function
def eval_output(df, text_col="ppo_response", weights=REWARD_WEIGHTS):
    """
    Score every row of ``df`` and return an enriched copy (the input is not modified).

    Columns added to the copy:
      * ``semantic_score``  - ``calculate_semantic_similarity(original, text_col)`` per row
      * ``grammar_score``   - ``calculate_grammar_score`` of ``text_col``
      * ``rule_score``      - ``rule_compliance_score`` of ``text_col``
      * ``total_reward``    - weighted sum of the three scores using ``weights``
      * ``input_len_tokens`` / ``output_len_tokens`` - whitespace-split token counts
      * ``len_ratio_tokens`` - output token count divided by input token count

    Args:
        df: frame with an ``original`` column and the column named by ``text_col``.
        text_col: name of the column holding the text to score.
        weights: mapping with the keys ``rule_score``, ``semantic_score``, ``grammar_score``.

    Returns:
        The scored copy of ``df``.
    """
    scored = df.copy()

    def _semantic(row):
        return calculate_semantic_similarity(row["original"], row[text_col])

    def _rule(row):
        return rule_compliance_score(row[text_col])

    def _token_count(column):
        # Values are cast to str first, then split on whitespace and counted.
        return scored[column].astype(str).str.split().map(len)

    # --- Component scores (same order as the columns appear in the output) ---
    scored["semantic_score"] = scored.apply(_semantic, axis=1)
    scored["grammar_score"] = scored[text_col].apply(calculate_grammar_score)
    scored["rule_score"] = scored.apply(_rule, axis=1)

    # --- Weighted total reward (reward function) ---
    rule_part = scored["rule_score"] * weights["rule_score"]
    semantic_part = scored["semantic_score"] * weights["semantic_score"]
    grammar_part = scored["grammar_score"] * weights["grammar_score"]
    scored["total_reward"] = rule_part + semantic_part + grammar_part

    # --- Length features: token counts and output/input ratio ---
    scored["input_len_tokens"] = _token_count("original")
    scored["output_len_tokens"] = _token_count(text_col)
    scored["len_ratio_tokens"] = scored["output_len_tokens"] / scored["input_len_tokens"]

    return scored

# 3) Map each experiment label to its raw PPO evaluation frame
dfs = dict(exp1a=df_ex1a, exp1b=df_ex1b, exp2a=df_ex2a, exp2b=df_ex2b)

# 4) Score every experiment on the PPO responses only (rules are not exploded here)
#    and write one per-sentence CSV per experiment
results = dict()
for name, df in dfs.items():
    df_eval = eval_output(df, text_col="ppo_response")
    results[name] = df_eval
    out_path = f"eval_final_results/{name}_per_sentence_scored.csv"
    df_eval.to_csv(out_path, index=False)
    print(f"Finished {name}")

# 5) Build summary: mean and median of each score per experiment,
#    plus the median output/input token ratio
records = []
for name, df_eval in results.items():
    row = {"experiment": name}
    for label, column in (
        ("semantic", "semantic_score"),
        ("rule", "rule_score"),
        ("grammar", "grammar_score"),
        ("total_reward", "total_reward"),
    ):
        row[f"mean_{label}"] = df_eval[column].mean()
        row[f"median_{label}"] = df_eval[column].median()
    # Note: despite the unprefixed key, this value is a median.
    row["len_ratio_tokens"] = df_eval["len_ratio_tokens"].median()
    records.append(row)

summary_df = pd.DataFrame(records)
summary_df.to_csv("eval_final_results/ppo_combined_summary.csv", index=False)

print(summary_df)




## Load the generated log to continue analysis

In [None]:
# Reload the per-sentence scored CSVs written by the scoring cell above
# ("eval_final_results/{name}_per_sentence_scored.csv"), so the analysis below can
# start from the files on disk without re-running the scoring.
df_ex1a_prep = pd.read_csv("eval_final_results/exp1a_per_sentence_scored.csv")
df_ex1b_prep = pd.read_csv("eval_final_results/exp1b_per_sentence_scored.csv")

df_ex2a_prep = pd.read_csv("eval_final_results/exp2a_per_sentence_scored.csv")
df_ex2b_prep = pd.read_csv("eval_final_results/exp2b_per_sentence_scored.csv")

In [None]:
# Map display labels ("1A".."2B") to the reloaded, already-scored frames.
# NOTE: this rebinds `dfs`, which the scoring cell above used for the raw frames
# keyed "exp1a".."exp2b"; the cells that follow iterate over this labelled version.
dfs = {
    "1A": df_ex1a_prep,
    "1B": df_ex1b_prep,
    "2A": df_ex2a_prep,
    "2B": df_ex2b_prep,
}

In [None]:
# Summarise the logged 'ppo_reward_score' column (mean / median / min / max) per experiment.
ppo_scores_summary = []

for name, df in dfs.items():
    # Guard: frames without the column are reported and skipped.
    if "ppo_reward_score" not in df.columns:
        print(f"⚠️ {name} has no 'ppo_reward_score' column")
        continue

    scores = df["ppo_reward_score"]
    ppo_scores_summary.append(
        {
            "experiment": name,
            "mean_ppo_score": scores.mean(),
            "median_ppo_score": scores.median(),
            "min_ppo_score": scores.min(),
            "max_ppo_score": scores.max(),
        }
    )

ppo_summary_df = pd.DataFrame(ppo_scores_summary)
print(ppo_summary_df)

# Persist the summary table
ppo_summary_df.to_csv("eval_final_results/ppo_reward_summary.csv", index=False)


In [None]:
def check_outputs(df, text_col="ppo_response"):
    """
    Run structural sanity checks on PPO outputs.

    1) Check if output ends with proper punctuation ('.', '!' or '?') after
       stripping surrounding whitespace.
    2) Check if output contains unwanted metadata (the substrings "quelle" or
       "source", or an http:// / https:// prefix), matched case-insensitively.

    Values in ``text_col`` are cast to ``str`` before checking.

    Side effect: the boolean columns ``ends_with_punct`` and
    ``contains_metadata`` are added to ``df`` in place; the same frame object
    is returned alongside the statistics.

    Args:
        df: frame containing the column named by ``text_col``.
        text_col: name of the column holding the generated text.

    Returns:
        (stats, df) where ``stats`` is a dict with the keys ``total_outputs``,
        ``valid_endings``, ``invalid_endings``, ``percent_valid_endings``,
        ``contains_metadata`` and ``percent_metadata``.
    """
    valid_endings = (".", "!", "?")
    # Non-capturing group on purpose: a capturing group makes pandas'
    # str.contains emit a "has match groups" UserWarning on every call.
    # Case-insensitivity is requested through case=False instead of a
    # pre-compiled pattern.
    unwanted_pattern = r"(?:quelle|source|https?://)"

    text = df[text_col].astype(str)

    # Check sentence endings
    df["ends_with_punct"] = text.str.strip().str.endswith(valid_endings)

    # Check unwanted patterns
    df["contains_metadata"] = text.str.contains(unwanted_pattern, case=False, regex=True)

    # Summary stats
    stats = {
        "total_outputs": len(df),
        "valid_endings": df["ends_with_punct"].sum(),
        "invalid_endings": (~df["ends_with_punct"]).sum(),
        "percent_valid_endings": df["ends_with_punct"].mean() * 100,
        "contains_metadata": df["contains_metadata"].sum(),
        "percent_metadata": df["contains_metadata"].mean() * 100,
    }

    return stats, df

In [None]:
# Run the structural sanity checks on every labelled frame and print the statistics.
# NOTE: `results` and `summary_df` are rebound here; earlier cells used the same names
# for the scored frames and the combined score summary.
results = {}
for name, df in dfs.items():
    # check_outputs adds its flag columns to `df` in place and returns that same frame,
    # so the frames held in `dfs` carry the new columns afterwards; df_checked is not
    # used further in this cell.
    stats, df_checked = check_outputs(df, text_col="ppo_response")
    results[name] = stats
    print(f"\n{name} results:")
    for k, v in stats.items():
        print(f"  {k}: {v}")

# Build a summary table (one row per experiment after the transpose) and save it
summary_df = pd.DataFrame(results).T
summary_df.to_csv("eval_final_results/ppo_output_sanity_checks.csv")


In [None]:
# Plot: one scatter panel per experiment, semantic score (x) against rule score (y)
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10), sharex=True, sharey=True)
axes = axes.flatten()

for ax, (name, df) in zip(axes, dfs.items()):
    ax.scatter(df["semantic_score"], df["rule_score"], alpha=0.5, s=15)
    # Titles, labels and the fixed [0, 1] ranges are set in a single call.
    ax.set(
        title=f"{name}: Semantic Score vs Rule Adherence",
        xlabel="Semantic Score",
        ylabel="Rule Score",
        xlim=(0, 1),
        ylim=(0, 1),
    )

fig.suptitle("Trade-off Between Semantic Preservation and Rule Adherence Across PPO Variants", fontsize=14)
# Leave the top 3% of the figure free for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.97])
plt.savefig('semanticvsrule.pdf')
plt.savefig('semanticvsrule.png')
plt.show()

# This section performs a rule-specific deep dive into the metrics' performance

## Aggregated Rule Analysis

In [None]:
def _parse_any(obj):
    """
    Recursively parse/flatten lists and stringified lists into a Python list of strings.
    """
    # None / NaN
    if obj is None or (isinstance(obj, float) and pd.isna(obj)):
        return []

    # Already a list -> flatten each element
    if isinstance(obj, list):
        out = []
        for el in obj:
            out.extend(_parse_any(el))
        return out

    # Strings: may be raw rule, JSON, python literal, or a quoted string containing a list
    if isinstance(obj, str):
        s = obj.strip()
        if s in ("", "[]"):
            return []

        # If it looks like a quoted list string:  '"[...]"' or '\'[...]\''
        if len(s) >= 4 and s[0] in ("'", '"') and s[-1] == s[0] and s[1] == '[' and s[-2] == ']':
            s = s[1:-1].strip()  # unwrap quotes around the whole [ ... ]

        # Try JSON
        try:
            val = json.loads(s)
            # If val is list or nested, recurse
            if isinstance(val, (list, str)):
                return _parse_any(val)
        except Exception:
            pass

        # Try python literal (handles single quotes)
        try:
            val = ast.literal_eval(s)
            if isinstance(val, (list, str)):
                return _parse_any(val)
        except Exception:
            pass

        # As a last parsing attempt, handle bracketed single item: ["rule"]
        m = re.match(r'^\[\s*[\'"]?([A-Za-z0-9_]+)[\'"]?\s*\]$', s)
        if m:
            return [m.group(1)]

        # Comma-separated fallback (no brackets)
        if "," in s and "[" not in s and "]" not in s:
            parts = [p.strip().strip("'\"") for p in s.split(",") if p.strip()]
            return parts

        # Otherwise treat as atomic rule token
        return [s.strip("'\"")]

    # Anything else -> string
    return [str(obj).strip()]

In [None]:
def preprocess_df(df_in, verbose=False):
    """
    Expand a frame to one row per applied rule.

    Works on a copy of ``df_in``. The ``applied_rules`` column is parsed with
    ``_parse_any``, restricted to ``ALLOWED_RULES`` and de-duplicated via
    ``dedup_preserve_order``; the resulting list column is exploded into a
    ``rule`` column, passed through ``normalize_rule_string``, and rows with a
    missing/blank rule or the rule ``clean_punctuation`` are removed.

    Args:
        df_in: frame containing an ``applied_rules`` column.
        verbose: when True, print a parsed sample, row counts and rule counts.

    Returns:
        The exploded, filtered frame (its index is not reset after filtering).
    """
    frame = df_in.copy()

    def _allowed_unique(rules):
        # Keep only whitelisted rules, then drop duplicates
        return dedup_preserve_order([rule for rule in rules if rule in ALLOWED_RULES])

    # Parse the raw field, then filter/de-duplicate the parsed lists
    frame["applied_rules_list"] = frame["applied_rules"].apply(_parse_any)
    frame["applied_rules_list"] = frame["applied_rules_list"].apply(_allowed_unique)

    # One row per rule
    exploded = frame.explode("applied_rules_list")
    exploded = exploded.rename(columns={"applied_rules_list": "rule"})
    exploded = exploded.reset_index(drop=True)

    # Normalize unwanted rule strings
    exploded["rule"] = exploded["rule"].apply(normalize_rule_string)

    # Drop rows without a usable rule
    has_rule = exploded["rule"].notna() & (exploded["rule"].astype(str).str.strip() != "")
    exploded = exploded[has_rule]

    # Exclude the punctuation category
    exploded = exploded[exploded["rule"] != "clean_punctuation"]

    if verbose:
        print("Parsed sample:", frame["applied_rules_list"].head(10).tolist())
        print("Original rows:", len(frame))
        print("Exploded rows:", len(exploded))
        print("\nCounts per rule:")
        print(exploded["rule"].value_counts())

    return exploded

In [None]:
# Explode every reloaded frame to one row per applied rule.
# NOTE: `dfs` is rebound to a plain list here (it was a label -> frame dict above).
# If the sanity-check cell has run, these frames also carry the flag columns that
# check_outputs added in place.
dfs = [df_ex1a_prep, df_ex1b_prep, df_ex2a_prep, df_ex2b_prep]

processed = [preprocess_df(df, verbose=False) for df in dfs]
# Unpack in the same order as the list above: 1a, 1b, 2a, 2b
df1a_prepared, df1b_prepared, df2a_prepared, df2b_prepared = processed

## Apply Calculation

In [None]:
# 1) Define reward weights, with the largest weight on semantic similarity.
# NOTE(review): this repeats the REWARD_WEIGHTS definition from the overall-analysis
# cell with identical values; keeping a single definition would avoid the two drifting apart.
REWARD_WEIGHTS = {
    "rule_score": 0.3,
    "semantic_score": 0.5,
    "grammar_score": 0.2,
}
def eval_output_aggr(df, text_col, weights=REWARD_WEIGHTS):
    """
    Score every row of ``df`` for ``text_col`` and return the scored copy plus a summary.

    The input frame is not modified. On the copy, ``semantic_score``
    (``calculate_semantic_similarity`` of ``original`` vs ``text_col``),
    ``grammar_score`` (``calculate_grammar_score``) and ``rule_score``
    (``rule_compliance_score``) are (re)computed, combined into
    ``total_reward`` with ``weights``, and character/token length columns and
    their output/input ratios are added.

    Args:
        df: frame with an ``original`` column and the column named by ``text_col``.
        text_col: name of the column holding the text to score.
        weights: mapping with the keys ``rule_score``, ``semantic_score`` and
            ``grammar_score``; defaults to ``REWARD_WEIGHTS``.

    Returns:
        (df_eval, summary): the scored frame and a dict of column means
        (``mean_total_reward``, ``mean_semantic``, ``mean_grammar``,
        ``mean_rule``, ``mean_len_chars``, ``mean_len_tokens``).
    """
    df_eval = df.copy()

    # Compute the component scores per row
    df_eval["semantic_score"] = df_eval.apply(
        lambda r: calculate_semantic_similarity(r["original"], r[text_col]), axis=1
    )
    df_eval["grammar_score"] = df_eval[text_col].apply(calculate_grammar_score)

    df_eval["rule_score"] = df_eval.apply(
        lambda r: rule_compliance_score(r[text_col]), axis=1
    )

    # Weighted total reward. Uses the `weights` argument so a caller-supplied
    # weighting is honoured (previously the global REWARD_WEIGHTS was always used).
    df_eval["total_reward"] = (
        df_eval["rule_score"] * weights["rule_score"]
        + df_eval["semantic_score"] * weights["semantic_score"]
        + df_eval["grammar_score"] * weights["grammar_score"]
    )

    # Input length
    df_eval["input_len_chars"] = df_eval["original"].astype(str).str.len()
    df_eval["input_len_tokens"] = df_eval["original"].astype(str).str.split().map(len)
    # Output length
    df_eval["output_len_chars"] = df_eval[text_col].astype(str).str.len()  # number of characters
    df_eval["output_len_tokens"] = df_eval[text_col].astype(str).str.split().map(len)  # whitespace-split tokens
    # Ratios (output vs input)
    df_eval["len_ratio_chars"] = df_eval["output_len_chars"] / df_eval["input_len_chars"]
    df_eval["len_ratio_tokens"] = df_eval["output_len_tokens"] / df_eval["input_len_tokens"]

    # Overall summary (column means)
    summary = {
        "mean_total_reward": df_eval["total_reward"].mean(),
        "mean_semantic": df_eval["semantic_score"].mean(),
        "mean_grammar": df_eval["grammar_score"].mean(),
        "mean_rule": df_eval["rule_score"].mean(),
        "mean_len_chars": df_eval["output_len_chars"].mean(),
        "mean_len_tokens": df_eval["output_len_tokens"].mean(),
    }

    return df_eval, summary

def rule_aggregated_output(df, text_col):
    """
    Average the score columns per rule.

    Groups ``df`` by its ``rule`` column, takes the mean of ``semantic_score``,
    ``grammar_score``, ``rule_score`` and ``total_reward``, and orders the rules
    by mean ``total_reward`` from highest to lowest.

    Args:
        df: frame with a ``rule`` column and the four score columns.
        text_col: accepted for call-site symmetry; not used by the computation.

    Returns:
        A frame with ``rule`` as a regular column followed by the four means.
    """
    metric_cols = ["semantic_score","grammar_score","rule_score","total_reward"]
    per_rule_means = df.groupby("rule")[metric_cols].mean()
    ranked = per_rule_means.sort_values("total_reward", ascending=False)
    return ranked.reset_index()


In [None]:
# Bundle variables
dfs_bundle = {
    "df1a": df1a_prepared,
    "df1b": df1b_prepared,
    "df2a": df2a_prepared,
    "df2b": df2b_prepared,
}

text_cols = ["ppo_response", "simplified"]

In [None]:
# Apply calculation on all files & save output
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists before the (expensive) scoring loop starts.
AGGREGATED_DIR = "eval_final_results/aggregated"
os.makedirs(AGGREGATED_DIR, exist_ok=True)

all_results = {}

for name, df in dfs_bundle.items():
    for col in text_cols:
        tag = f"{name}_{col}"   # e.g., df1a_ppo_response, df1a_simplified

        # --- Sentence-level eval (scores are recomputed for `col` on the rule-exploded rows) ---
        df_eval, summary = eval_output_aggr(df.copy(), text_col=col)

        # --- Rule-level aggregates ---
        agg = rule_aggregated_output(df_eval, text_col=col)

        # --- Save to dict ---
        all_results[tag] = {
            "per_sentence": df_eval,
            "summary": summary,
            "per_rule": agg
        }

        # --- Save to disk ---
        df_eval.to_csv(f"{AGGREGATED_DIR}/{tag}_per_sentence.csv", index=False)
        agg.to_csv(f"{AGGREGATED_DIR}/{tag}_per_rule.csv", index=False)

        print(f"Finished {tag}")
