### Assess performance and decide which hand-crafted rules are meaning-preserving and thus safe to include in the reward model (RM) or PPO training.

In [None]:
import pandas as pd
import json
import re
import ast
import numpy as np
from collections import defaultdict
from bert_score import score
from collections import Counter

In [None]:
# Configuration: path of the parsed simplification log that this notebook assesses
LOG_PATH = "simplification_logs/all_parsed_log_2025-09-14_12-38-08.csv"
# Destination of the final cleaned-up CSV (one row per uid, including the applied rules)
OUTPUT_FILENAME = 'master_data/output_assessment/ordered_simplifications_with_rules_clean.csv'

In [None]:
# Load the parsed simplification log into a DataFrame
df = pd.read_csv(LOG_PATH)

In [None]:
# Inspect columns, dtypes and non-null counts of the raw log
df.info()

#### There are null rows in `simplified`, traced to the word_to_number() conversion. They need to be filtered out.

In [None]:
# Drop every row that has a missing value in ANY column (how='any'), i.e. not
# only rows where `simplified` is missing, then re-check the non-null counts.
df = df.dropna(how='any', axis=0)
df.info()

In [None]:
# Preview the first 10 rows of the filtered log
df.head(10)

In [None]:
#df.to_csv("master_data/output_assessment/all_simplifications.csv", index=False)

# Filter out and aggregate from simplification log

In [None]:
# Re-inspect the filtered log right before it is aggregated per uid
df.info()

In [None]:
# Rule categories that decide how a logged rule application is replayed per uid.
# PARTIAL_RULES: the logged `original` fragment is replaced by `simplified`
# inside the most recent sentence.
PARTIAL_RULES = {"split_compound", "convert_word_to_number"}
# SPLIT_RULES: the logged `simplified` text is appended as additional sentence(s).
# Any rule in neither set is replayed as a rewrite of the whole most recent sentence.
SPLIT_RULES = {"rewrite_apposition", "simplify_subordinate"}

In [None]:
def normalize(s: str) -> str:
    """Return ``s`` as text with every whitespace run collapsed to one space.

    The value is coerced with ``str()`` first and leading/trailing whitespace
    is removed, so the result can be used as a key for duplicate checks.
    """
    collapsed = re.sub(r"\s+", " ", str(s))
    return collapsed.strip()

def split_into_sentences(text: str):
    """
    Break ``text`` into sentences with a simple punctuation-based rule.

    A boundary is any whitespace run that directly follows ``.``, ``!`` or
    ``?``; the punctuation stays attached to the sentence it ends. Every piece
    is passed through ``normalize`` and pieces that come back empty are
    dropped. Blank input yields an empty list.
    """
    stripped = text.strip()
    if stripped == "":
        return []

    pieces = []
    for chunk in re.split(r"(?<=[.!?])\s+", stripped):
        cleaned = normalize(chunk)
        if cleaned:
            pieces.append(cleaned)
    return pieces

def dedup_preserve_order(items):
    """Return the entries of ``items`` in order, keeping only first occurrences.

    Two entries count as duplicates when their ``normalize``-d forms are equal;
    the surviving entries are returned unmodified.
    """
    unique = []
    known_keys = set()
    for item in items:
        key = normalize(item)
        if key in known_keys:
            continue
        known_keys.add(key)
        unique.append(item)
    return unique

records = []

# Rebuild one final simplification per uid by replaying its logged rule
# applications in log order. Groups keep their first-seen order (sort=False).
for uid, group in df.groupby("uid", sort=False):
    group = group.reset_index(drop=True)

    # Rules that fired for this uid; uids where nothing fired are skipped.
    applied_rules_list = group.loc[group["applied"] == True, "rule"].tolist()
    if len(applied_rules_list) == 0:
        continue

    # First occurrence of each rule, original order kept.
    unique_applied_rules = list(dict.fromkeys(applied_rules_list))

    # True when at least one sentence-splitting rule fired.
    split_applied = not SPLIT_RULES.isdisjoint(unique_applied_rules)

    # The replay starts from the untouched source sentence.
    original_raw = group["initial_original_sentence"].iloc[0]
    original_norm = normalize(original_raw)
    main_sentence = original_norm
    sentences = [main_sentence]
    seen_sentences = {main_sentence}

    for _, row in group.iterrows():
        if not row["applied"]:
            continue

        rule = row["rule"]
        simplified_piece = "" if pd.isna(row["simplified"]) else normalize(row["simplified"])
        original_piece = "" if pd.isna(row["original"]) else normalize(row["original"])

        if rule in PARTIAL_RULES:
            # Fragment-level rule: swap the first occurrence of the logged
            # fragment inside the most recent sentence, if it is present there.
            last_sentence = sentences[-1]
            if original_piece and original_piece in last_sentence:
                sentences[-1] = last_sentence.replace(original_piece, simplified_piece, 1)
            continue

        if rule in SPLIT_RULES:
            # Splitting rule: append each produced sentence not seen before.
            if not simplified_piece:
                continue
            for ns in split_into_sentences(simplified_piece) or [simplified_piece]:
                ns_norm = normalize(ns)
                if ns_norm in seen_sentences:
                    continue
                sentences.append(ns)
                seen_sentences.add(ns_norm)
            continue

        # Any other rule replaces the most recent sentence wholesale.
        if simplified_piece:
            sentences[-1] = simplified_piece

    # --- Post-processing
    sentences = dedup_preserve_order(sentences)

    if split_applied and len(sentences) > 1:
        normalized_sentences = [normalize(s) for s in sentences]
        has_exact_original = original_norm in normalized_sentences
        has_transformed = any(
            original_norm in candidate and candidate != original_norm
            for candidate in normalized_sentences
        )
        # Drop the source sentence only when it survived unchanged and no
        # other sentence embeds it as a substring.
        if has_exact_original and not has_transformed:
            sentences = [
                s
                for s, candidate in zip(sentences, normalized_sentences)
                if candidate != original_norm
            ]

    final_text = " ".join(sentences).strip()

    records.append({
        "uid": uid,
        "original_sentence": original_raw,
        "final_simplification": final_text,
        "applied_rules": unique_applied_rules
    })

# One row per uid that had at least one applied rule.
result_df = pd.DataFrame(records)

In [None]:
# Spot-check the last 15 aggregated rows
result_df.tail(15)

In [None]:
# Sort the final result by UID to approximate the original file order
result_df = result_df.sort_values(by='uid').reset_index(drop=True)
result_df

In [None]:
# Preview the first rows after sorting
result_df.head()

In [None]:
# Check row count, dtypes and non-null counts of the aggregated result
result_df.info()

In [None]:
# Work on a copy so the whitespace cleanup below leaves result_df untouched
df_cleanup = result_df.copy()

In [None]:
# Confirm the copy has the same shape and dtypes before cleaning
df_cleanup.info()

### Apply last step cleanup

In [None]:

# Remove leading/trailing spaces from every column name
df_cleanup.columns = df_cleanup.columns.str.strip()

def clean_all_whitespace(sentence):
    """
    Tidy the whitespace of a single text value.

    Runs of whitespace are collapsed to one space, both ends are trimmed, and
    any whitespace sitting directly before ``. , : ; ? !`` is removed.
    Values that are not ``str`` are handed back untouched.
    """
    if isinstance(sentence, str):
        collapsed = re.sub(r'\s+', ' ', sentence).strip()
        # Pull punctuation back onto the preceding token ("word ." -> "word.").
        return re.sub(r'\s+([.,:;?!])', r'\1', collapsed)
    return sentence

columns_to_clean = ['original_sentence', 'final_simplification']

print(f"Attempting to strip whitespace from columns: {', '.join(columns_to_clean)}")

# Run the whitespace/punctuation cleanup on each listed column that exists and
# holds text. Text columns are `object` dtype on older pandas but can use a
# dedicated string dtype on newer versions; comparing against 'object' alone
# would silently skip the cleanup there, so both dtypes are accepted.
for col in columns_to_clean:
    is_text_column = col in df_cleanup.columns and (
        pd.api.types.is_object_dtype(df_cleanup[col].dtype)
        or pd.api.types.is_string_dtype(df_cleanup[col].dtype)
    )
    if is_text_column:
        print(f"Cleaning column: '{col}'...")
        # clean_all_whitespace returns non-string cells unchanged
        df_cleanup[col] = df_cleanup[col].apply(clean_all_whitespace)
    else:
        print(f"Column '{col}' not found or is not a text column.")

In [None]:
# Print the first rows as a Markdown table.
# NOTE: DataFrame.to_markdown relies on the optional `tabulate` package.
print(df_cleanup.head().to_markdown(index=False))

In [None]:
# Eyeball the first 20 cleaned rows
df_cleanup.head(20)

In [None]:
# Write the cleaned table to OUTPUT_FILENAME without the index column
df_cleanup.to_csv(OUTPUT_FILENAME, index=False)

In [None]:
# Report where the file was written and show a Markdown preview of its content
# (to_markdown relies on the optional `tabulate` package).
print(f"\nSaved the final, ordered file: '{OUTPUT_FILENAME}'")
print("\nHere is a preview of the new format:")
print(df_cleanup.head().to_markdown(index=False))

In [None]:
# Final check of the cleaned table's columns and non-null counts
df_cleanup.info()

In [None]:
# Keep only uid, original_sentence and final_simplification (applied_rules is
# left out) so the sentence pairs can be saved separately for assessment
final_pairs = df_cleanup[['uid', 'original_sentence', 'final_simplification']]
final_pairs.info()

In [None]:
# Save the uid / original / simplification pairs without the index column
final_pairs.to_csv("master_data/output_assessment/final_simplified_pairs_cleaned_FINAL.csv", index=False)

# Calculate BERT Score


In [None]:
# Utilize the final variable or import the saved file
df_rules = df_cleanup.copy()

#df_rules = pd.read_csv(OUTPUT_FILENAME)

In [None]:
# Calculation: original -> simplified

originals = df_rules["original_sentence"].tolist()
simplifieds = df_rules["final_simplification"].tolist()

# Compute BERTScore with the multilingual xlm-roberta-large model; lang="de"
# declares the text as German. The simplifications are passed first and the
# originals second (bert_score's `score` takes candidates, then references).
P, R, F1 = score(simplifieds, originals, model_type="xlm-roberta-large", lang="de")

# Add scores back to dataframe (only F1 is stored; P and R are not saved)
df_rules["bertscore_f1"] = F1.tolist()

# Save the results
df_rules.to_csv("master_data/output_assessment/bert_score_results.csv", index=False)