Assess simplification output using BERTScore

In [None]:
import pandas as pd, numpy as np, json, ast
from collections import defaultdict, Counter
from bert_score import score

In [None]:
# Per-sentence BERTScore results. The merge further down reads the columns
# uid, bertscore_precision, bertscore_recall and bertscore_f1 from this table.
df_scores = pd.read_csv("bert_score_results.csv")  # contains P, R, F1 per sentence
# Simplification outputs with the rules applied to each one. Later cells read
# the uid, applied_rules and final_simplification columns from this table.
df_rules  = pd.read_csv("master_data/output_assessment/ordered_simplifications_with_rules_clean.csv")

In [None]:
# Print a summary of the rules table (columns, dtypes, non-null counts).
df_rules.info()

In [None]:
print("Before cleaning:", len(df_rules))


# Identify & remove faulty rows that only contain a number.
# A row is faulty when the word-to-number rule was applied AND the final
# simplification consists of digits only.
# na=False makes a missing cell count as "no match" explicitly instead of
# relying on how NaN propagates through the `&` below; regex=False because
# the rule name is a literal, not a pattern.
has_number_rule = df_rules["applied_rules"].str.contains(
    "convert_word_to_number", regex=False, na=False
)
is_digits_only = df_rules["final_simplification"].str.match(r"^\d+$", na=False)
faulty_mask = has_number_rule & is_digits_only

faulty = df_rules[faulty_mask]
print("Faulty rows:", len(faulty))


# Keep everything that is not faulty; original index labels are preserved.
df_counter2 = df_rules[~faulty_mask]


print("After cleaning:", len(df_counter2))


In [None]:
# Merge BERTScore results with the cleaned rules dataframe on 'uid'.
# The merge starts from df_counter2 (the frame with the faulty number-only
# rows removed), not from df_rules; merging df_rules would silently bring the
# faulty rows back into every statistic computed below.
# how="left" keeps every cleaned row; a row whose uid has no entry in
# df_scores gets NaN in the three score columns.
df = df_counter2.merge(
    df_scores[["uid", "bertscore_precision", "bertscore_recall", "bertscore_f1"]],
    on="uid", how="left"
)

In [None]:
# removed 88 faulty rows
# NOTE(review): confirm this against the entry count printed below -- it
# should equal the "After cleaning" figure printed by the cleaning cell,
# provided the merged frame was built from the cleaned rows and each uid
# occurs once in the score table.
df.info()

In [None]:
# Bootstrap helper
def mean_ci(x, n_boot=2000, alpha=0.05, seed=42):
    """Return mean, standard deviation and a bootstrap percentile CI of *x*.

    Args:
        x: Sequence of numbers; converted to a float array.
        n_boot: Number of bootstrap resamples (each the size of *x*, drawn
            with replacement).
        alpha: Total tail mass left outside the interval; 0.05 gives the
            2.5th and 97.5th percentiles of the resampled means.
        seed: Seed for the random generator, so repeated calls with the same
            input give the same interval.

    Returns:
        Tuple ``(mean, std, ci_low, ci_high)`` of floats. ``std`` is NumPy's
        default (population) standard deviation. All four are NaN when *x*
        is empty.
    """
    rng = np.random.default_rng(seed)
    sample = np.array(x, dtype=float)
    n_obs = len(sample)
    if n_obs == 0:
        return (np.nan,) * 4

    # One resample per iteration; the draw order determines the result for a
    # given seed, so resamples are generated one at a time.
    boot_means = []
    for _ in range(n_boot):
        resample = rng.choice(sample, size=n_obs, replace=True)
        boot_means.append(resample.mean())

    tail_percentiles = [100*alpha/2, 100*(1-alpha/2)]
    ci_low, ci_high = np.percentile(boot_means, tail_percentiles)
    return float(sample.mean()), float(sample.std()), float(ci_low), float(ci_high)

# Parsing applied rules
def parse_rules(x):
    """Convert one ``applied_rules`` cell into a list of rules.

    Args:
        x: The cell value. Either an already-parsed list, a string holding a
            JSON array or a Python literal (list/tuple/set), or a missing
            value.

    Returns:
        The list itself when *x* is a list; the decoded collection as a list
        when *x* is a string that decodes to a list, tuple or set; otherwise
        an empty list (missing values, unparseable text, or text that decodes
        to something that is not a collection of rules, such as a number).
    """
    # Check for a list first: calling pd.isna() on a list returns an array,
    # whose truth value is ambiguous and raises for multi-element lists.
    if isinstance(x, list):
        return x
    # NaN / None / any other non-string value carries no rules.
    if not isinstance(x, str):
        return []

    try:
        parsed = json.loads(x)
    except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
        # Not valid JSON (e.g. single-quoted Python repr) -- try a literal.
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []

    # Callers iterate the result rule by rule, so a bare scalar or string
    # must not leak through (a string would be iterated char by char).
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

# Replace each serialised applied_rules cell with its parsed list of rules.
df["applied_rules"] = df["applied_rules"].map(parse_rules)

In [None]:
# Collect scores per rule: every scored row contributes its precision,
# recall and F1 to each rule listed in its applied_rules.
rule_to_scores = defaultdict(lambda: {"P": [], "R": [], "F1": []})

# Walk the four needed columns in lockstep instead of using iterrows(),
# which builds a full Series object for every row.
scored_rows = zip(
    df["applied_rules"],
    df["bertscore_precision"],
    df["bertscore_recall"],
    df["bertscore_f1"],
)
for rules, precision, recall, f1 in scored_rows:
    # Skip rows without a complete score triple (e.g. no match in the left
    # merge). Checking all three keeps the P/R/F1 lists the same length and
    # free of NaN, which would otherwise turn a rule's mean into NaN.
    if pd.isna(precision) or pd.isna(recall) or pd.isna(f1):
        continue
    for rule in rules:
        scores = rule_to_scores[rule]
        scores["P"].append(precision)
        scores["R"].append(recall)
        scores["F1"].append(f1)
# Aggregate: one record per rule with the mean and bootstrap CI bounds of
# each metric (mean_ci's default alpha of 0.05 gives the ci95_* bounds).
results = []
for rule, vals in rule_to_scores.items():
    record = {"rule": rule, "N": len(vals["F1"])}
    # (score list key, mean column, lower-bound column, upper-bound column)
    for key, mean_col, lo_col, hi_col in (
        ("P", "mean_precision", "ci95_lo_P", "ci95_hi_P"),
        ("R", "mean_recall", "ci95_lo_R", "ci95_hi_R"),
        ("F1", "mean_f1", "ci95_lo_F1", "ci95_hi_F1"),
    ):
        # The standard deviation returned by mean_ci is not reported.
        mean_val, _std, lo_val, hi_val = mean_ci(vals[key])
        record[mean_col] = mean_val
        record[lo_col] = lo_val
        record[hi_col] = hi_val
    results.append(record)

# Best-scoring rules (by mean F1) first.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values("mean_f1", ascending=False)

In [None]:
# Rule counts: how many times each rule occurs across all rows of df
all_rules = []
for row_rules in df["applied_rules"]:
    all_rules.extend(row_rules)
rule_counts = Counter(all_rules)
df_counts = pd.DataFrame(list(rule_counts.items()), columns=["rule", "count"])

# Attach the counts to the per-rule scores and order by mean F1, best first
df_final = (
    df_results
    .merge(df_counts, on="rule", how="left")
    .sort_values("mean_f1", ascending=False)
)


In [None]:
# Columns shown in the report. The interval bounds (ci95_lo_P/ci95_hi_P,
# ci95_lo_R/ci95_hi_R, ci95_lo_F1/ci95_hi_F1) are left out; add them after
# the matching mean column to display them.
report_columns = [
    "rule", "count",
    "mean_precision",
    "mean_recall",
    "mean_f1",
]

print("\n=== Average BERTScore aggregated per rule ===")
report_table = df_final[report_columns]
print(report_table.to_string(index=False, float_format="%.4f"))