# This notebook focuses on calculating quantifying metrics to assess the simplification quality of the pipeline

### Primary Metric
- SARI

- Flesch Reading Ease formula (de)
- Wiener Sachtextformel (the lower the score, the easier to read)
  - 1 (WSTF-1) – For newspaper articles
  - 2 (WSTF-2) – For brochures or technical texts
  - 3 (WSTF-3) – For official documents
  - 4 (WSTF-4) – For general texts (the most general/modern one, usually recommended unless you have a reason to choose another)
  - usually 1 for newspapers, 2 for technical/brochure, 3 for official gov/admin texts
- LIX (Lesbarkeitsindex), RIX (slightly adjusted)
  - either from scratch or implemented from github source

In [61]:
import os
from easse.sari import corpus_sari
import os
import textstat
import csv
import re

In [138]:
# current working dir
BASE_DIR = os.getcwd()

ORIGINAL = os.path.join(BASE_DIR, "preprocessed_texts", "apa-rst", "0_original")
SYSTEM = os.path.join(BASE_DIR, "preprocessed_texts", "apa-rst", "3_simplified")
REFERENCE = os.path.join(BASE_DIR, "preprocessed_texts", "apa-rst", "0_1_referenceB1")

In [117]:
def get_plain_files(dir_path):
    """
    Returns list of filenames that end with _plain"""
    return [f for f in os.listdir(dir_path) if f.endswith("_plain.txt")]
def get_files(dir_path):
    """
    Returns list of filenames that end with -a2.txt"""
    return [f for f in os.listdir(dir_path) if not f.startswith('.')]

def read_aligned_triplets(original_path, system_path, reference_path):
    # Each file: one line per input sentence (system/reference may be multi-sentence per line)
    with open(original_path, encoding="utf-8") as f:
        originals = [line.strip() for line in f if line.strip()]
    with open(system_path, encoding="utf-8") as f:
        systems = [line.strip() for line in f if line.strip()]
    with open(reference_path, encoding="utf-8") as f:
        references = [line.strip() for line in f if line.strip()]

    # Sanity check
    assert len(originals) == len(systems) == len(references), \
        f"Lengths do not match: {len(originals)}, {len(systems)}, {len(references)}"
    return originals, systems, references

In [118]:
plain_files = get_plain_files(SYSTEM) # system output files
reference_files = get_files(REFERENCE)
original_files = get_files(ORIGINAL)
print(original_files)

['sent_4-freitag-28-1-22-or.txt', 'sent_3-freitag-28-1-22-or.txt', 'sent_1-29-11-21-or.txt', 'sent_1-freitag-28-1-22-or.txt', 'sent_2-freitag-28-1-22-or.txt', 'sent_5-freitag-28-1-22-or.txt', 'sent_3-18-1-22-or.txt', 'sent_4-21-2-18-or.txt', 'sent_3-dienstag-8-2-22-or.txt', 'sent_4-18-1-22-or.txt', 'sent_2-29-11-21-or.txt', 'sent_4-dienstag-8-2-22-or.txt', 'sent_3-21-2-18-or.txt', 'sent_2-21-2-18-or.txt', 'sent_5-29-11-21-or.txt', 'sent_5-18-1-22-or.txt', 'sent_5-21-2-18-or.txt', 'sent_2-18-1-22-or.txt', 'sent_4-29-11-21-or.txt', 'sent_1-dienstag-8-2-22-or.txt', 'sent_5-dienstag-8-2-22-or.txt', 'sent_1-21-2-18-or.txt', 'sent_1-18-1-22-or.txt', 'sent_2-dienstag-8-2-22-or.txt', 'sent_3-29-11-21-or.txt']


In [119]:
all_originals = []
all_systems = []
all_references = []

for plain_file in plain_files:
    # Get the base, e.g. sent_3-29-11-21-or
    base = plain_file.replace('_simplified_plain.txt', '')
    
    # Original: exact match
    original_file = os.path.join(ORIGINAL, base + ".txt")

    # Reference: remove -or or -trial if present, add -a2.txt
    if base.endswith('-or'):
        ref_base = base[:-3]  # remove '-or'
    elif base.endswith('-trial'):
        ref_base = base[:-6]  # remove '-trial'
    else:
        ref_base = base

    reference_file = os.path.join(REFERENCE, ref_base + "-b1.txt") #check for a2 or b1
    system_file = os.path.join(SYSTEM, plain_file)

    # Print for debugging
    # print(f"\nSystem: {system_file}\nOriginal: {original_file}\nReference: {reference_file}")

    # Check files exist
    if not (os.path.exists(original_file) and os.path.exists(reference_file)):
        print(f"Skipping {plain_file}: missing original or reference file")
        continue

    try:
        original, system, reference = read_aligned_triplets(
            original_file, system_file, reference_file
        )
        for o, s, r in zip(original, system, reference):
            all_originals.append(o)
            all_systems.append(s)
            all_references.append([r])  # Single reference per input
    except AssertionError as e:
        print(f"Alignment error in {plain_file}: {e} -- flattening to whole document.")
        with open(original_file, encoding="utf-8") as f:
            o = " ".join([line.strip() for line in f if line.strip()])
        with open(system_file, encoding="utf-8") as f:
            s = " ".join([line.strip() for line in f if line.strip()])
        with open(reference_file, encoding="utf-8") as f:
            r = " ".join([line.strip() for line in f if line.strip()])

        all_originals.append(o)
        all_systems.append(s)
        all_references.append([r]) 

Alignment error in sent_1-18-1-22-or_simplified_plain.txt: Lengths do not match: 20, 20, 6 -- flattening to whole document.
Alignment error in sent_4-18-1-22-or_simplified_plain.txt: Lengths do not match: 10, 10, 8 -- flattening to whole document.
Alignment error in sent_5-21-2-18-or_simplified_plain.txt: Lengths do not match: 24, 27, 8 -- flattening to whole document.
Alignment error in sent_3-29-11-21-or_simplified_plain.txt: Lengths do not match: 22, 22, 10 -- flattening to whole document.
Alignment error in sent_4-29-11-21-or_simplified_plain.txt: Lengths do not match: 32, 33, 11 -- flattening to whole document.
Alignment error in sent_3-21-2-18-or_simplified_plain.txt: Lengths do not match: 9, 9, 7 -- flattening to whole document.
Alignment error in sent_2-18-1-22-or_simplified_plain.txt: Lengths do not match: 64, 70, 9 -- flattening to whole document.
Alignment error in sent_3-freitag-28-1-22-or_simplified_plain.txt: Lengths do not match: 9, 10, 8 -- flattening to whole document.

In [120]:
print(len(all_originals), len(all_systems), len(all_references))
print([type(ref) for ref in all_references])
print([len(ref) for ref in all_references])


25 25 25
[<class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>, <class 'list'>]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [115]:
def print_sari_per_sample(all_originals, all_systems, all_references, n=24):
    for i in range(min(n, len(all_originals))):
        sari = corpus_sari([all_originals[i]], [all_systems[i]], [all_references[i]])
        print(f"SARI for sample {i}: {sari:.2f}")

# Usage:
print_sari_per_sample(all_originals, all_systems, all_references, 24)

SARI for sample 0: 28.08
SARI for sample 1: 28.95
SARI for sample 2: 27.71
SARI for sample 3: 30.35
SARI for sample 4: 24.63
SARI for sample 5: 27.94
SARI for sample 6: 26.41
SARI for sample 7: 25.35
SARI for sample 8: 30.74
SARI for sample 9: 27.87
SARI for sample 10: 28.40
SARI for sample 11: 24.83
SARI for sample 12: 29.18
SARI for sample 13: 30.73
SARI for sample 14: 33.23
SARI for sample 15: 37.72
SARI for sample 16: 27.83
SARI for sample 17: 27.75
SARI for sample 18: 23.91
SARI for sample 19: 31.85
SARI for sample 20: 28.99
SARI for sample 21: 24.67
SARI for sample 22: 28.16
SARI for sample 23: 27.98


Above is the A2 calculation. Below is the B1 calculation

In [None]:
def print_sari_per_sample(all_originals, all_systems, all_references, n):
    for i in range(min(n, len(all_originals))):
        sari = corpus_sari([all_originals[i]], [all_systems[i]], [all_references[i]])
        print(f"SARI for sample {i}: {sari:.2f} ---")

# Usage:
print_sari_per_sample(all_originals, all_systems, all_references, 25)

SARI for sample 0: 25.44 ---
SARI for sample 1: 33.11 ---
SARI for sample 2: 27.98 ---
SARI for sample 3: 30.85 ---
SARI for sample 4: 23.85 ---
SARI for sample 5: 29.60 ---
SARI for sample 6: 25.93 ---
SARI for sample 7: 26.44 ---
SARI for sample 8: 34.15 ---
SARI for sample 9: 27.84 ---
SARI for sample 10: 29.23 ---
SARI for sample 11: 25.76 ---
SARI for sample 12: 34.01 ---
SARI for sample 13: 31.93 ---
SARI for sample 14: 38.62 ---
SARI for sample 15: 44.58 ---
SARI for sample 16: 28.13 ---
SARI for sample 17: 29.05 ---
SARI for sample 18: 24.13 ---
SARI for sample 19: 33.39 ---
SARI for sample 20: 28.49 ---
SARI for sample 21: 25.06 ---
SARI for sample 22: 28.93 ---
SARI for sample 23: 27.49 ---
SARI for sample 24: 22.45 ---


# LIX + FLESCH + WIENER

In [6]:
def get_plain_files(dir_path):
    """Returns list of filenames that end with _plain.txt"""
    return [f for f in os.listdir(dir_path) if f.endswith("_plain.txt")]

def get_files(dir_path):
    """
    Returns list of filenames that end with -a2.txt"""
    return [f for f in os.listdir(dir_path) if not f.startswith('.')]

def calc_lix(text):
    """
    Custom LIX (Lesbarkeitsindex) calculation for German.
    Formula: LIX = (words/sentences) + (100 * long_words/words)
    where long_words = words with >= 7 letters.
    """
    # Split sentences (naive, works for most German texts)
    sentences = re.split(r'[.!?]+', text)
    sentences = [s for s in sentences if s.strip()]
    words = re.findall(r'\w+', text)
    long_words = [w for w in words if len(w) >= 7]
    num_sentences = len(sentences)
    num_words = len(words)
    num_long_words = len(long_words)
    if num_sentences == 0 or num_words == 0:
        return 0.0
    lix = (num_words / num_sentences) + (num_long_words * 100 / num_words)
    return lix

def process_metrics_for_files(system_dir, PLAIN):
    if PLAIN == True:
        files = get_plain_files(system_dir)
    else:
        files = get_files(system_dir)
    results = []
    for fname in files:
        fpath = os.path.join(system_dir, fname)
        with open(fpath, encoding="utf-8") as f:
            # Join all lines into one text
            text = " ".join([line.strip() for line in f if line.strip()])
        # Calculate metrics
        lix = calc_lix(text)
        flesch = textstat.flesch_reading_ease(text)
        wiener = textstat.wiener_sachtextformel(text, 1)
        results.append({
            "filename": fname,
            "LIX": lix,
            "Flesch": flesch,
            "Wiener_Sachtextformel": wiener
        })
        print(f"{fname}: LIX={lix:.2f} | Flesch={flesch:.2f} | Wiener={wiener:.2f}")
    # Optionally, save to CSV
    # with open("readability_metrics_summary.csv", "w", encoding="utf-8", newline='') as out_csv:
    #     writer = csv.DictWriter(out_csv, fieldnames=["filename", "LIX", "Flesch", "Wiener_Sachtextformel"])
    #     writer.writeheader()
    #     writer.writerows(results)
    # print(f"Saved summary to readability_metrics_summary.csv")
    return results


In [149]:


PLAIN =False
og_scores = process_metrics_for_files(ORIGINAL,PLAIN)
og_scores

PLAIN = True
sys_scores = process_metrics_for_files(SYSTEM,PLAIN)
sys_scores

sent_4-freitag-28-1-22-or.txt: LIX=44.74 | Flesch=59.78 | Wiener=5.88
sent_3-freitag-28-1-22-or.txt: LIX=60.98 | Flesch=40.13 | Wiener=9.39
sent_1-29-11-21-or.txt: LIX=51.89 | Flesch=45.64 | Wiener=9.37
sent_1-freitag-28-1-22-or.txt: LIX=48.79 | Flesch=63.09 | Wiener=6.23
sent_2-freitag-28-1-22-or.txt: LIX=51.76 | Flesch=64.52 | Wiener=7.05
sent_5-freitag-28-1-22-or.txt: LIX=48.17 | Flesch=56.72 | Wiener=6.41
sent_3-18-1-22-or.txt: LIX=52.66 | Flesch=54.55 | Wiener=7.01
sent_4-21-2-18-or.txt: LIX=37.06 | Flesch=69.30 | Wiener=4.32
sent_3-dienstag-8-2-22-or.txt: LIX=57.38 | Flesch=54.20 | Wiener=8.27
sent_4-18-1-22-or.txt: LIX=44.57 | Flesch=59.34 | Wiener=6.94
sent_2-29-11-21-or.txt: LIX=47.90 | Flesch=53.34 | Wiener=7.12
sent_4-dienstag-8-2-22-or.txt: LIX=40.11 | Flesch=72.59 | Wiener=4.40
sent_3-21-2-18-or.txt: LIX=56.10 | Flesch=41.58 | Wiener=9.74
sent_2-21-2-18-or.txt: LIX=57.43 | Flesch=38.68 | Wiener=9.46
sent_5-29-11-21-or.txt: LIX=41.89 | Flesch=69.40 | Wiener=4.79
sent_5-18-1

[{'filename': 'sent_1-18-1-22-or_simplified_plain.txt',
  'LIX': 46.88967611336032,
  'Flesch': 57.055150912838656,
  'Wiener_Sachtextformel': 7.637591401648999},
 {'filename': 'sent_4-18-1-22-or_simplified_plain.txt',
  'LIX': 42.301587301587304,
  'Flesch': 58.697828674948255,
  'Wiener_Sachtextformel': 6.588515113871635},
 {'filename': 'sent_5-21-2-18-or_simplified_plain.txt',
  'LIX': 38.293298071341034,
  'Flesch': 61.91909883720933,
  'Wiener_Sachtextformel': 5.7495834625323},
 {'filename': 'sent_3-29-11-21-or_simplified_plain.txt',
  'LIX': 45.93333333333334,
  'Flesch': 65.08284495548963,
  'Wiener_Sachtextformel': 6.818469351420092},
 {'filename': 'sent_4-29-11-21-or_simplified_plain.txt',
  'LIX': 42.07179487179487,
  'Flesch': 60.35570692992647,
  'Wiener_Sachtextformel': 6.021579063104918},
 {'filename': 'sent_3-21-2-18-or_simplified_plain.txt',
  'LIX': 45.035353535353536,
  'Flesch': 52.26159090909093,
  'Wiener_Sachtextformel': 8.101225},
 {'filename': 'sent_2-18-1-22-or

In [150]:
import statistics

def safe_mode(lst):
    try:
        return statistics.mode(lst)
    except statistics.StatisticsError:
        # If multiple modes, return the smallest one (arbitrary choice)
        return min(statistics.multimode(lst))
    except:
        return None

def describe_and_compare(og_scores, sys_scores, metric):
    og_vals = [d[metric] for d in og_scores]
    sys_vals = [d[metric] for d in sys_scores]
    
    if not og_vals or not sys_vals:
        print(f"\n==== {metric} ====")
        print(f"Not enough data for {metric}: original={len(og_vals)}, system={len(sys_vals)}")
        return


    mean_og = statistics.mean(og_vals)
    median_og = statistics.median(og_vals)
    mode_og = safe_mode(og_vals)

    mean_sys = statistics.mean(sys_vals)
    median_sys = statistics.median(sys_vals)
    mode_sys = safe_mode(sys_vals)

    abs_diff_mean = abs(mean_og - mean_sys)
    abs_diff_median = abs(median_og - median_sys)
    abs_diff_mode = abs(mode_og - mode_sys)

    pct_diff_mean = (abs_diff_mean / mean_og * 100) if mean_og != 0 else float('nan')
    pct_diff_median = (abs_diff_median / median_og * 100) if median_og != 0 else float('nan')
    pct_diff_mode = (abs_diff_mode / mode_og * 100) if mode_og != 0 else float('nan')

    print(f"\n==== {metric} ====")
    print(f"Original: mean={mean_og:.2f}, median={median_og:.2f}, mode={mode_og:.2f}")
    print(f"System:   mean={mean_sys:.2f}, median={median_sys:.2f}, mode={mode_sys:.2f}")
    print(f"Difference in means:   abs={abs_diff_mean:.2f} | pct={pct_diff_mean:.2f}%")
    print(f"Difference in medians: abs={abs_diff_median:.2f} | pct={pct_diff_median:.2f}%")
    print(f"Difference in modes:   abs={abs_diff_mode:.2f} | pct={pct_diff_mode:.2f}%")

def report_stats(og_scores, sys_scores):
    for metric in ["LIX", "Flesch", "Wiener_Sachtextformel"]:
        describe_and_compare(og_scores, sys_scores, metric)


In [151]:


report_stats(og_scores, sys_scores)


==== LIX ====
Original: mean=50.13, median=50.66, mode=44.74
System:   mean=41.39, median=42.28, mode=46.89
Difference in means:   abs=8.74 | pct=17.43%
Difference in medians: abs=8.38 | pct=16.55%
Difference in modes:   abs=2.15 | pct=4.81%

==== Flesch ====
Original: mean=53.98, median=54.20, mode=59.78
System:   mean=62.95, median=61.96, mode=57.06
Difference in means:   abs=8.97 | pct=16.61%
Difference in medians: abs=7.77 | pct=14.33%
Difference in modes:   abs=2.72 | pct=4.55%

==== Wiener_Sachtextformel ====
Original: mean=7.37, median=7.12, mode=5.88
System:   mean=6.04, median=6.11, mode=7.64
Difference in means:   abs=1.33 | pct=18.03%
Difference in medians: abs=1.01 | pct=14.15%
Difference in modes:   abs=1.76 | pct=29.87%


In [2]:
import language_tool_python

tool = language_tool_python.LanguageTool('de')

# def grammar_score(texts):
#     # texts: list of sentences
#     scores = []
#     for sent in texts:
#         matches = tool.check(sent)
#         num_errors = len(matches)
#         scores.append({
#             "sentence": sent,
#             "errors": num_errors,
#             "score": 1 - num_errors / max(1, len(sent.split()))  # crude normalized "correctness"
#         })
#     return scores

# sentences = [
#     "Ich habe ein Hund gesehen.",
#     "Heute ist das Wetter schön.",
#     "Sie gehen in die Schule morgen."
# ]
# results = grammar_score(sentences)
# for r in results:
#     print(f"{r['sentence']} - Errors: {r['errors']}, Score: {r['score']:.2f}")

In [3]:
print(tool.check("Ich habe ein Hund gesehen."))

LanguageToolError: Error: Internal Error: java.lang.RuntimeException: Could not activate rules, detected: de-DE