### Decide which hand-crafted rules are meaning-preserving and thus safe to include in the reward model (RM) or PPO training.

- look into an LLM that provides a targeted German support:
  - "xlm-roberta-base"
  - "dbmdz/bert-base-german-uncased"
  - deepset/gbert-base
  - bert-base-german-dbmdz-uncased

- simplification score to be:
  - the rule-compliance tracker
  - inserting SARI as well would be too 'simple minded'

possible simplification score combination
- combine 
- reward = alpha * simplification_score + beta * bert_score
- alpha, beta can be tuned depending on priorities (which score is more critical?)

In [1]:
# pip install torch torchvision transformers
# pip install bert-score

In [2]:
from bert_score import score
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
import pandas as pd
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
input_path = "master_data/0_original/all.txt"
output_path = "master_data/3_simplified/all_simplified_plain.txt"
log_path = "simplification_logs/all_parsed_log_2025-08-31_11-54-52.csv"

In [4]:
df = pd.read_csv(log_path)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108449 entries, 0 to 108448
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   uid         108449 non-null  int64 
 1   original    108449 non-null  object
 2   rule        108449 non-null  object
 3   applied     108449 non-null  bool  
 4   simplified  108449 non-null  object
 5   doc_name    108449 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 4.2+ MB


In [6]:
df.head(15)

Unnamed: 0,uid,original,rule,applied,simplified,doc_name
0,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,clean_punctuation,False,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
1,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,rewrite_apposition,False,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
2,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,simplify_subordinate,False,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
3,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,convert_passive_to_active,False,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
4,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,normalize_verb_tense,True,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
5,2,Präsidentin,split_compound,True,Präsi·Dentin,all_parsed.txt
6,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,clean_punctuation,False,Brüssel Ursula von der Leyen ist die Präsi·Den...,all_parsed.txt
7,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,rewrite_apposition,False,Brüssel Ursula von der Leyen ist die Präsi·Den...,all_parsed.txt
8,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,simplify_subordinate,False,Brüssel Ursula von der Leyen ist die Präsi·Den...,all_parsed.txt
9,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,convert_passive_to_active,False,Brüssel Ursula von der Leyen ist die Präsi·Den...,all_parsed.txt


In [7]:
df_compound = df[df["rule"] == "split_compound"]

In [8]:
df_compound.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24784 entries, 5 to 108443
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         24784 non-null  int64 
 1   original    24784 non-null  object
 2   rule        24784 non-null  object
 3   applied     24784 non-null  bool  
 4   simplified  24784 non-null  object
 5   doc_name    24784 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 1.2+ MB


In [9]:
filtered_comp = df_compound[df_compound["applied"] == True]
filtered_comp

Unnamed: 0,uid,original,rule,applied,simplified,doc_name
5,2,Präsidentin,split_compound,True,Präsi·Dentin,all_parsed.txt
46,8,Lade-Stationen,split_compound,True,Lade·Stationen,all_parsed.txt
47,8,Elektro-Fahrzeuge,split_compound,True,Elektro·Fahrzeuge,all_parsed.txt
59,10,Union-Gesetze,split_compound,True,Union·Gesetze,all_parsed.txt
87,15,Corona-Virus,split_compound,True,Corona·Virus,all_parsed.txt
...,...,...,...,...,...,...
108290,16520,Gesundheits-Krise,split_compound,True,Gesundheits·Krise,all_parsed.txt
108296,16521,Gesundheits-Minister,split_compound,True,Gesundheits·Minister,all_parsed.txt
108313,16524,Gesundheits-Minister,split_compound,True,Gesundheits·Minister,all_parsed.txt
108390,16537,Fußball-Trainer,split_compound,True,Fußball·Trainer,all_parsed.txt


In [10]:
# def assess_rule_output(df):
#     results = []

#     for uid, group in df.groupby("uid"):
#         original = group["original"].iloc[0]              # the very first "original" sentence
#         simplified = group["simplified"].iloc[-1]         # the last simplification
#         applied_rules = group.loc[group["applied"] == True, "rule"].tolist()

#         results.append({
#             "uid": uid,
#             "original": original,
#             "simplified": simplified,
#             "applied_rules": applied_rules
#         })

#     return pd.DataFrame(results)

## Trying again

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108449 entries, 0 to 108448
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   uid         108449 non-null  int64 
 1   original    108449 non-null  object
 2   rule        108449 non-null  object
 3   applied     108449 non-null  bool  
 4   simplified  108449 non-null  object
 5   doc_name    108449 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 4.2+ MB


In [12]:
#Filter out only applied rules

df_applied = df[df["applied"] == True]
df_applied.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14001 entries, 4 to 108440
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         14001 non-null  int64 
 1   original    14001 non-null  object
 2   rule        14001 non-null  object
 3   applied     14001 non-null  bool  
 4   simplified  14001 non-null  object
 5   doc_name    14001 non-null  object
dtypes: bool(1), int64(1), object(4)
memory usage: 670.0+ KB


In [13]:
df_applied.head(15)

Unnamed: 0,uid,original,rule,applied,simplified,doc_name
4,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,normalize_verb_tense,True,Der Iran wird teilweise aus dem Atom-Abkommen ...,all_parsed.txt
5,2,Präsidentin,split_compound,True,Präsi·Dentin,all_parsed.txt
22,4,Bis zum Jahr 2030 soll es in der Europäische U...,normalize_verb_tense,True,Bis zum Jahr 2030 soll es in der Europäische U...,all_parsed.txt
29,5,"Das ist sehr viel , denn in den letzten 29 Jah...",normalize_verb_tense,True,"Das hat ist sehr viel , denn in den letzten 29...",all_parsed.txt
42,7,Die Europäische Union-Kommission will vor alle...,normalize_verb_tense,True,Die Europäische Union-Kommission will vor alle...,all_parsed.txt
46,8,Lade-Stationen,split_compound,True,Lade·Stationen,all_parsed.txt
47,8,Elektro-Fahrzeuge,split_compound,True,Elektro·Fahrzeuge,all_parsed.txt
59,10,Union-Gesetze,split_compound,True,Union·Gesetze,all_parsed.txt
87,15,Corona-Virus,split_compound,True,Corona·Virus,all_parsed.txt
103,17,Vor allem in Europa und in den USA schrumpft d...,simplify_subordinate,True,man angenommen hat.,all_parsed.txt


In [14]:
# df_applied.to_csv("master_data/output_assessment/all_applied_rules.csv", index=False)

In [15]:
#OUTDATED code for the original simplification approach on original/complex sentences

# # Get the last applied simplification per sentence UID
# # (Assume rules are applied in order of appearance)
# last_applied_per_uid = df_applied.groupby("uid").tail(1)

# # Also get original sentences from any row (all identical for a UID)
# originals_per_uid = df.groupby("uid").first().reset_index()[["uid", "original"]]

# # Merge to get (original, final simplified) pairs
# final_pairs = pd.merge(originals_per_uid, last_applied_per_uid[["uid", "simplified"]], on="uid")

# # Extract all applied rules per UID (True only)
# # gives out UID and a second column of all applied rules according to uid

# applied_rules_per_uid = (
#     df[df["applied"] == True]
#     .groupby("uid")["rule"]
#     .apply(list)
#     .reset_index()
#     .rename(columns={"rule": "applied_rules"})
# )

# # Merge with the final_pairs (which already has original + final simplified)
# final_pairs_with_rules = pd.merge(final_pairs, applied_rules_per_uid, on="uid", how="left")

In [16]:
# Get the unique original sentences in the order of their first appearance
#unique_originals = df_applied['original'].unique()
grouped = df.groupby('uid')

processed_data = []

#for sentence in unique_originals:
for uid, group in grouped:
        # Get all rows for the current original sentence
        #group = df_applied[df_applied['original'] == sentence]
        
        # Find all rules that were successfully applied for this group
        applied_rules_list = group[group['applied'] == True]['rule'].tolist()
        
        # We only want to include sentences where at least one rule was applied
        if not applied_rules_list:
            continue

        # De-duplicate the list of rules while preserving order
        unique_applied_rules = list(dict.fromkeys(applied_rules_list))

        # Heuristic: The "main" original sentence is the longest one in the group
        main_original_sentence = group.loc[group['original'].str.len().idxmax(), 'original']

        # The final simplification is the 'simplified' text from the very last logged step
        final_simplification_text = group.loc[group.index.max(), 'simplified']

        # Alternative (if you want the very last simplification regardless of application):    
        # The final simplification is the 'simplified' text from the very last entry in the group
        #final_simplification_text = group['simplified'].iloc[-1]
        
        # Append the structured data
        processed_data.append({
            'uid': uid,
            'original_sentence': main_original_sentence,
            'final_simplification': final_simplification_text,
            'applied_rules': unique_applied_rules
        })

# Create the final DataFrame from our processed list
result_df = pd.DataFrame(processed_data)

In [17]:
# Sort the final result by UID to approximate the original file order
result_df = result_df.sort_values(by='uid').reset_index(drop=True)
result_df

Unnamed: 0,uid,original_sentence,final_simplification,applied_rules
0,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,Der Iran wird teilweise aus dem Atom-Abkommen ...,[normalize_verb_tense]
1,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,Brüssel Ursula von der Leyen ist die Präsi·Den...,[split_compound]
2,4,Bis zum Jahr 2030 soll es in der Europäische U...,Bis zum Jahr 2030 soll es in der Europäische U...,[normalize_verb_tense]
3,5,"Das ist sehr viel , denn in den letzten 29 Jah...","Das hat ist sehr viel , denn in den letzten 29...",[normalize_verb_tense]
4,7,Die Europäische Union-Kommission will vor alle...,Die Europäische Union-Kommission will vor alle...,[normalize_verb_tense]
...,...,...,...,...
10708,16535,Im Ramadan essen und trinken die Muslime tagsü...,Im Ramadan essen und trinken die Muslime tagsü...,[normalize_verb_tense]
10709,16537,Der Fußball·Trainer Adi Hütter wechselt zu Mön...,Der Fußball·Trainer Adi Hütter wechselt zu Mön...,[split_compound]
10710,16539,Mönchengladbach Adi Hütter ist ein Fußball·Tra...,Mönchengladbach Adi Hütter ist ein Fußball·Tra...,[split_compound]
10711,16541,Hütter wird aber mit Saison-Ende in der deutsc...,Hütter wird aber mit Saison-Ende in der deutsc...,[normalize_verb_tense]


In [19]:
result_df.head()

Unnamed: 0,uid,original_sentence,final_simplification,applied_rules
0,1,Der Iran wird teilweise aus dem Atom-Abkommen ...,Der Iran wird teilweise aus dem Atom-Abkommen ...,[normalize_verb_tense]
1,2,Brüssel Ursula von der Leyen ist die Präsi·Den...,Brüssel Ursula von der Leyen ist die Präsi·Den...,[split_compound]
2,4,Bis zum Jahr 2030 soll es in der Europäische U...,Bis zum Jahr 2030 soll es in der Europäische U...,[normalize_verb_tense]
3,5,"Das ist sehr viel , denn in den letzten 29 Jah...","Das hat ist sehr viel , denn in den letzten 29...",[normalize_verb_tense]
4,7,Die Europäische Union-Kommission will vor alle...,Die Europäische Union-Kommission will vor alle...,[normalize_verb_tense]


In [None]:
df_cleanup = result_df.copy()

In [31]:
df_cleanup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10713 entries, 0 to 10712
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   uid                   10713 non-null  int64 
 1   original_sentence     10713 non-null  object
 2   final_simplification  10713 non-null  object
 3   applied_rules         10713 non-null  object
dtypes: int64(1), object(3)
memory usage: 334.9+ KB


In [34]:

df_cleanup.columns = df_cleanup.columns.str.strip() # This removes leading/trailing spaces from each column name

def clean_all_whitespace(sentence):
  """
  Replaces multiple spaces inside a string with a single space,
  and then strips leading/trailing whitespace.
  """
  # Step 1: Clean up all internal whitespace first.
  sentence = re.sub(r'\s+', ' ', sentence).strip()
  # 2. Strip whitespace from the beginning and end
  sentence = re.sub(r'\s+([.,:;?!])', r'\1', sentence)
  return sentence

columns_to_clean = ['original_sentence', 'final_simplification']

print(f"Attempting to strip whitespace from columns: {', '.join(columns_to_clean)}")

# Loop through the identified columns and apply the strip() method
for col in columns_to_clean:
  if col in df_cleanup.columns and df_cleanup[col].dtype == 'object':
    print(f"Cleaning column: '{col}'...")
    # Apply our new, more powerful cleaning function to each sentence in the column
    df_cleanup[col] = df_cleanup[col].apply(clean_all_whitespace)
  else:
    print(f"Column '{col}' not found or is not a text column.")

Attempting to strip whitespace from columns: original_sentence, final_simplification
Cleaning column: 'original_sentence'...
Cleaning column: 'final_simplification'...


In [35]:
print(df_cleanup.head().to_markdown(index=False))

|   uid | original_sentence                                                                              | final_simplification                                                                           | applied_rules            |
|------:|:-----------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------|:-------------------------|
|     1 | Der Iran wird teilweise aus dem Atom-Abkommen aussteigen.                                      | Der Iran wird teilweise aus dem Atom-Abkommen aussteigen.                                      | ['normalize_verb_tense'] |
|     2 | Brüssel Ursula von der Leyen ist die Präsi·Dentin von der Europäische Union-Kommission.        | Brüssel Ursula von der Leyen ist die Präsi·Dentin von der Europäische Union-Kommission.        | ['split_compound']       |
|     4 | Bis zum Jahr 2030 soll es in der Europäische Union um 55 Prozent w

In [36]:
output_filename = 'master_data/output_assessment/ordered_simplifications_with_rules_clean.csv'
df_cleanup.to_csv(output_filename, index=False)

In [33]:
# --- Save and Display Results ---
output_filename = 'master_data/output_assessment/ordered_simplifications_with_rules.csv'
result_df.to_csv(output_filename, index=False)

print(f"\nSuccessfully created a new, ordered file: '{output_filename}'")
print("\nHere is a preview of the new format:")
print(result_df.head().to_markdown(index=False))


Successfully created a new, ordered file: 'master_data/output_assessment/ordered_simplifications_with_rules.csv'

Here is a preview of the new format:
|   uid | original_sentence                                                                               | final_simplification                                                                            | applied_rules            |
|------:|:------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------------|:-------------------------|
|     1 | Der Iran wird teilweise aus dem Atom-Abkommen aussteigen .                                      | Der Iran wird teilweise aus dem Atom-Abkommen aussteigen .                                      | ['normalize_verb_tense'] |
|     2 | Brüssel Ursula von der Leyen ist die Präsi·Dentin von der Europäische Union-Kommission .        | Brüssel Ursula von der Leyen ist die Präs

In [None]:
final_pairs.to_csv("final_simplified_pairs.csv", index=False)
final_pairs_with_rules.to_csv("final_simplified_pairs_with_rules.csv", index=False)
filtered_comp.to_csv("filtered_compound_applied.csv", index=False)

In [19]:
filtered_comp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5598 entries, 1 to 98220
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         5598 non-null   int64 
 1   original    5598 non-null   object
 2   rule        5598 non-null   object
 3   applied     5598 non-null   bool  
 4   simplified  5598 non-null   object
 5   doc_name    5598 non-null   object
dtypes: bool(1), int64(1), object(4)
memory usage: 267.9+ KB


In [20]:
comp_formatted = filtered_comp.drop(
    columns=['applied', 'doc_name']
).rename(
    columns={'rule': 'applied_rules'}
)

# 2. IMPORTANT: The 'applied_rules' in your final DataFrame is a list.
#    We need to wrap the rule name (e.g., 'split_compound') in a list
#    to match the format of the other DataFrame.
comp_formatted['applied_rules'] = comp_formatted['applied_rules'].apply(lambda x: [x])


In [21]:
comp_formatted.head()

Unnamed: 0,uid,original,applied_rules,simplified
1,1,Atom-Abkommen,[split_compound],Atom·Abkommen
7,2,Präsidentin,[split_compound],Präsi·Dentin
18,3,Treibhaus-Gase,[split_compound],Treibhaus·Gase
46,7,Wasserstoff,[split_compound],Wasser·Stoff
50,7,Elektro-Fahrzeuge,[split_compound],Elektro·Fahrzeuge


In [22]:
df_combined = pd.concat([final_pairs_with_rules, comp_formatted], ignore_index=True)

In [23]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13178 entries, 0 to 13177
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uid            13178 non-null  int64 
 1   original       13178 non-null  object
 2   simplified     13178 non-null  object
 3   applied_rules  13178 non-null  object
dtypes: int64(1), object(3)
memory usage: 411.9+ KB


In [24]:
filtered_comp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5598 entries, 1 to 98220
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   uid         5598 non-null   int64 
 1   original    5598 non-null   object
 2   rule        5598 non-null   object
 3   applied     5598 non-null   bool  
 4   simplified  5598 non-null   object
 5   doc_name    5598 non-null   object
dtypes: bool(1), int64(1), object(4)
memory usage: 267.9+ KB


In [25]:
final_pairs_with_rules.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7580 entries, 0 to 7579
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   uid            7580 non-null   int64 
 1   original       7580 non-null   object
 2   simplified     7580 non-null   object
 3   applied_rules  7580 non-null   object
dtypes: int64(1), object(3)
memory usage: 237.0+ KB


In [26]:
df_combined.to_csv("combined_pairs_with_rules_applied.csv", index=False)

# Assess the Performance using roberta

In [None]:
# Load your exported sentence pairs
df = pd.read_csv("final_simplified_pairs.csv")

originals = df["original"].tolist()
simplifieds = df["simplified"].tolist()

# Compute BERTScore using German-specific model
P, R, F1 = score(simplifieds, originals, model_type="xlm-roberta-base", lang="de")

# Add scores back to dataframe
df["bertscore_f1"] = F1.tolist()

# Save the results
df.to_csv("bert_score_results.csv", index=False)
print("Done! Results saved to 'bert_score_results.csv'")

In [None]:
# calculate with the csv files that includes applied rules
from collections import defaultdict

# Load final output with applied rules
df_rules = pd.read_csv("final_simplified_pairs_with_rules.csv")  # <- export this first from our current state

# Group sentence pairs by rule
rule_to_pairs = defaultdict(list)

for _, row in df_rules.iterrows():
    if pd.isna(row["applied_rules"]):
        continue
    rules = eval(row["applied_rules"]) if isinstance(row["applied_rules"], str) else row["applied_rules"]
    for rule in rules:
        rule_to_pairs[rule].append((row["original"], row["simplified"]))

# Compute average BERTScore-F1 per rule
rule_to_f1 = {}

for rule, pairs in rule_to_pairs.items():
    o, s = zip(*pairs)
    _, _, F1 = score(s, o, model_type="xlm-roberta-base", lang="de") #bert-base-german-dbmdz-uncased #
    rule_to_f1[rule] = sum(F1.tolist()) / len(F1)

# Print results
print("Average BERTScore-F1 per rule:")
for rule, f1 in sorted(rule_to_f1.items(), key=lambda x: x[1]):
    print(f"{rule}: {f1:.4f}")

In [None]:
# #Example calc

# # Original vs. Simplified sentences
# originals = ["Der Hund läuft schnell zur Tür."]
# simplifieds = ["Der Hund rennt zur Tür."]

# # Compute BERTScore using a German or multilingual model
# P, R, F1 = score(simplifieds, originals, lang="de", model_type="bert-base-multilingual-cased")

# print(f"Precision: {P.mean().item():.4f}")
# print(f"Recall: {R.mean().item():.4f}")
# print(f"F1: {F1.mean().item():.4f}")
