In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns
from os import path, makedirs
import json
from scipy.stats import wilcoxon, shapiro, normaltest, ttest_rel
import numpy as np

In [None]:
# Ordered categorical dtypes for the "system" and "correction_style" columns.
systems_type = CategoricalDtype(["UAM-CSI", "Viking-7B", "Viking-13B"], ordered=True)
styles_type = CategoricalDtype(["minimal", "fluency"], ordered=True)

In [None]:
# Load the automatic-metric scores; rows are identified by essay_id,
# correction_style and system, with one column per metric.
scores_csv_file = "scores.csv"
auto_df = pd.read_csv(scores_csv_file)


# Reshape to long format: one row per (essay, style, system, metric) with the
# value in "score", matching the layout used for the manual evaluation scores.
auto_df_long = auto_df.melt(
    id_vars=["essay_id", "correction_style", "system"],
    value_vars=["gleu", "precision", "recall", "f0.5", "scribendi_score"],
    var_name="metric",
    value_name="score",
)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
auto_df_long.info()

# Distinct correction styles and systems, in order of first appearance.
styles = auto_df_long["correction_style"].unique().tolist()
teams = auto_df_long["system"].unique().tolist()

# Display name for every metric key (used for axis labels and table labels).
# Insertion order is kept identical to the metric listing below.
output_metrics = dict(
    [
        ("grammaticality", "SOME: Grammaticality"),
        ("fluency", "SOME: Fluency"),
        ("meaning_preservation", "SOME: Meaning Preservation"),
        ("manual_evaluation", "SOME: Total"),
        ("gleu", "GLEU"),
        ("precision", "ERRANT: Precision"),
        ("recall", "ERRANT: Recall"),
        ("f0.5", "ERRANT: $ \\text{F}_{0.5} $-Score"),
        ("scribendi_score", "Scribendi Score"),
    ]
)

In [None]:
manual_evaluation_dir = "manual_evaluation/"
d_key = "evaluations"

# The manually scored criteria; their unweighted mean is stored as the extra
# metric "manual_evaluation".
manual_metrics = ["grammaticality", "fluency", "meaning_preservation"]

manual_eval_dicts = []

for team in teams:
    for style in styles:
        # Files are laid out as <manual_evaluation_dir>/<system>/<style>.json
        style_file_path = path.join(manual_evaluation_dir, team, f"{style}.json")
        # JSON text is UTF-8 by specification; do not depend on the platform's
        # default encoding when reading it.
        with open(style_file_path, encoding="utf-8") as f:
            evaluations = json.load(f)[d_key]

        for evaluation in evaluations:
            # Identifying fields shared by every record of this essay.
            record_key = {
                "essay_id": evaluation["id"],
                "correction_style": style,
                "system": team,
            }
            for metric in manual_metrics:
                manual_eval_dicts.append(
                    {**record_key, "metric": metric, "score": evaluation[metric]}
                )
            total = sum(evaluation[metric] for metric in manual_metrics)
            manual_eval_dicts.append(
                {
                    **record_key,
                    "metric": "manual_evaluation",
                    # Divide by the number of criteria instead of a literal 3
                    # so the mean stays correct if the list above changes.
                    "score": total / len(manual_metrics),
                }
            )

# Long-format frame with the same columns as the automatic scores.
manual_df_long = pd.DataFrame(manual_eval_dicts)

In [None]:
# Stack automatic and manual scores into a single long table.
dfs = [auto_df_long, manual_df_long]
# Both inputs carry their own 0..n index, so a plain concat would leave
# duplicate index labels; ignore_index=True gives the result a fresh index.
df_long = pd.concat(dfs, ignore_index=True)
# Values that are not listed in the categorical dtypes become NaN here.
df_long["system"] = df_long["system"].astype(systems_type)
df_long["correction_style"] = df_long["correction_style"].astype(styles_type)
# info() prints by itself and returns None; print() would add a "None" line.
df_long.info()

In [None]:
essay_ids = df_long["essay_id"].unique().tolist()


# Replace the original essay identifiers with consecutive integers, numbered
# from 1 in order of first appearance.
essay_id_subs = {essay_id: i for i, essay_id in enumerate(essay_ids, 1)}

df_long["essay_id"] = df_long["essay_id"].map(essay_id_subs)
# info() prints by itself and returns None; print() would add a "None" line.
df_long.info()

# Every metric key present in the combined table, in order of first appearance.
metrics = df_long["metric"].unique().tolist()
print(metrics)

In [None]:
def format_legend_label(label):
    match label:
        case "minimal":
            return "Minimal Edits"
        case "fluency":
            return "Fluency Edits"
        case _:
            return output_metrics.get(label, label).replace("_", " ").title()


def normalize_file_name(file_name):
    """Lower-case ``file_name``, drop every dot and turn spaces into underscores."""
    char_map = str.maketrans({".": None, " ": "_"})
    return file_name.lower().translate(char_map)


# Output directory for the saved figures; created here, before any plot is written.
plots_dir = "plots/"
makedirs(name=plots_dir, exist_ok=True)

# Metric keys grouped by the kind of values they take. "discrete" metrics get
# integer axis ticks in the plots and score-count tables later on.
metric_types = dict(
    continuous=["gleu", "precision", "recall", "f0.5", "manual_evaluation"],
    discrete=["scribendi_score", "grammaticality", "fluency", "meaning_preservation"],
)


def get_minmax(v):
    """Return ``(v.min(), v.max())`` for any object exposing those two methods."""
    lowest = v.min()
    highest = v.max()
    return lowest, highest


def is_even(n):
    """Return whether ``n`` leaves no remainder when divided by two."""
    remainder = n % 2
    return remainder == 0


# Distance between a system's tick position and each of its two hue-dodged violins.
# NOTE(review): 0.2 presumably mirrors seaborn's dodge for two hue levels at the
# default violin width — re-check if the number of styles or the width changes.
offset = 0.2


# One violin plot per metric: systems on the y axis, scores on the x axis, one
# violin per correction style, with mean and median markers drawn on top.
for metric in metrics:

    metric_df = df_long[df_long["metric"] == metric]

    fig, ax = plt.subplots(figsize=(9, 6))
    # Draw on the axes created above instead of relying on pyplot's
    # "current axes" state.
    sns.violinplot(
        data=metric_df,
        y="system",
        x="score",
        cut=0,
        hue="correction_style",
        inner=None,
        density_norm="area",
        ax=ax,
    )
    # Deliberately not named `grouped`: the statistics cells build and query a
    # notebook-level `grouped` object, and re-running this cell must not
    # overwrite it.
    score_groups = metric_df.groupby(
        ["system", "correction_style"], observed=False
    )["score"]
    means = score_groups.mean()
    medians = score_groups.median()

    # Rows of `means` are ordered system-major with two styles per system, so
    # i // 2 is the system's tick position and the parity of i selects which
    # side of the tick the marker goes on.
    ys = [i // 2 - offset if is_even(i) else i // 2 + offset for i in range(len(means))]
    ax.scatter(
        x=means,
        y=ys,
        marker="s",
        color="black",
        edgecolors="white",
        zorder=3,
        label="Mean",
    )
    ax.scatter(
        x=medians,
        y=ys,
        marker="o",
        color="white",
        edgecolors="black",
        zorder=3,
        label="Median",
    )

    if metric in metric_types["discrete"]:
        # Ensure axes show integers
        # NOTE(review): the y axis holds the categorical "system" variable, so
        # the second locator may be unnecessary — confirm before removing.
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

    ax.set(xlabel=output_metrics[metric], ylabel="System")

    handles, labels = ax.get_legend_handles_labels()
    # Pass a real list rather than a one-shot map iterator.
    ax.legend(handles, [format_legend_label(label) for label in labels])
    sns.move_legend(
        ax,
        "lower center",
        ncol=2,
        bbox_to_anchor=(0.5, 1),
        frameon=True,
    )
    file_name = f"{normalize_file_name(metric)}.png"
    file_path = path.join(plots_dir, file_name)
    # A single tight_layout() call is sufficient; nothing changes the layout
    # between saving and showing.
    fig.tight_layout()
    fig.savefig(file_path)
    plt.show()
    # Release the figure so that looping over many metrics does not keep every
    # figure alive.
    plt.close(fig)

In [None]:
# info() prints its report itself and returns None; print() would add a "None" line.
df_long.info()

In [None]:
# One line plot per metric: score per essay, coloured by system and with the
# line style encoding the correction style.
for metric in metrics:
    metric_df = df_long[df_long["metric"] == metric]
    # Create a fresh figure for every metric; relying on the implicit current
    # axes would draw all metrics onto the same axes whenever plt.show() does
    # not discard the figure.
    fig, ax = plt.subplots()
    sns.lineplot(
        data=metric_df,
        x="essay_id",
        y="score",
        hue="system",
        style="correction_style",
        ax=ax,
    )
    ax.set(xlabel="Essay ID", ylabel=output_metrics[metric])
    handles, labels = ax.get_legend_handles_labels()
    # Pass a real list rather than a one-shot map iterator.
    ax.legend(handles, [format_legend_label(label) for label in labels])
    # Anchor the legend to the right of the axes.
    sns.move_legend(
        ax,
        "upper center",
        bbox_to_anchor=(1.35, 1),
        ncol=1,
        frameon=True,
    )
    plt.show()
    plt.close(fig)

In [None]:
# One grid of histograms per metric: a row per system, a column per correction style.
for metric in metrics:
    metric_df = df_long[df_long["metric"] == metric]
    g = sns.FacetGrid(
        metric_df,
        col="correction_style",
        row="system",
        margin_titles=True,
        sharex=True,
        sharey=True,
    )
    g.map_dataframe(sns.histplot, x="score", bins=8)

    # Set axis labels and titles
    g.set_axis_labels("Score", "Count")
    g.set_titles(row_template="{row_name}", col_template="{col_name}")

    # Title the whole figure. plt.title() would only set the title of the
    # current axes, i.e. a single facet of the grid.
    g.figure.suptitle(metric)

    g.figure.tight_layout()
    plt.show()

In [None]:
# Show the metric keys that the plotting and table cells iterate over.
print(metrics)

In [None]:
def get_spread(x):
    """Return the range of ``x``: its largest value minus its smallest value."""
    highest, lowest = x.max(), x.min()
    return highest - lowest


# Descriptive statistics of the scores for every (style, system, metric) cell.
# observed=False is spelled out, as in the other groupby calls of this
# notebook: two of the grouping columns are categorical, and leaving the
# argument implicit triggers a pandas FutureWarning and would change the
# result once the default flips.
summary = (
    df_long.groupby(["correction_style", "system", "metric"], observed=False)
    .agg(
        mean=("score", "mean"),
        median=("score", "median"),
        min=("score", "min"),
        max=("score", "max"),
        spread=("score", get_spread),
    )
    .reset_index()
)


# Display names for the correction styles and for the two index columns.
output_styles = {"minimal": "Minimal", "fluency": "Fluency"}
output_headers = {"correction_style": "Edit Style", "system": "System"}

mu = r"\( \mu \)"
sigma = r"\( \sigma \)"

# One summary table per metric, sorted by style and system, with readable
# header names and indexed by ("Edit Style", "System").
metric_dfs = {}
for metric, group in summary.groupby("metric"):
    table = (
        group.drop(columns="metric")
        .sort_values(by=["correction_style", "system"], ascending=[True, True])
        .reset_index(drop=True)
        .rename(columns=output_headers)
    )
    table["Edit Style"] = table["Edit Style"].map(output_styles)
    metric_dfs[metric] = table.set_index(["Edit Style", "System"])

# Keyword arguments shared by every Styler.to_latex() call in this notebook.
latex_args = dict(
    sparse_index=True,
    convert_css=True,
    clines="skip-last;data",
    hrules=True,
    column_format=None,
    siunitx=True,
    multicol_align="c",
)

# Output directories for the generated LaTeX tables; both are created up front.
tables_dir = "tables/"
summary_dir = path.join(tables_dir, "summary")
for out_dir in (tables_dir, summary_dir):
    makedirs(out_dir, exist_ok=True)

# CSS properties handed to Styler.highlight_max / highlight_min.
highlight = dict(
    max="background-color: kth-lightblue40",
    min="background-color: kth-lightred40",
)


def float_formatter(x):
    """Format ``x`` with two decimals, wrapped in a LaTeX ``\\num{...}`` command."""
    return "\\num{{{:.2f}}}".format(x)


# Render every per-metric summary table to LaTeX, save it under summary_dir
# and echo it to the cell output.
for metric, metric_df in metric_dfs.items():
    print(metric)

    # Column-wise (axis=0) highlighting of the smallest and largest value.
    styler = metric_df.style.highlight_min(props=highlight["min"], axis=0)
    styler = styler.highlight_max(props=highlight["max"], axis=0)
    styler = styler.format(formatter=float_formatter)
    latex = styler.to_latex(**latex_args)

    file_path = path.join(summary_dir, f"{normalize_file_name(metric)}.tex")
    with open(file_path, "w+") as f:
        f.write(latex)

    print(latex)

In [None]:
# Rows of the long table that belong to one of the discrete metrics.
discrete_metrics = metric_types["discrete"]

is_discrete = df_long["metric"].isin(discrete_metrics)
discrete_df = df_long.loc[is_discrete]


def int_formatter(x):
    """Wrap ``x`` unchanged in a LaTeX ``\\num{...}`` command."""
    return "\\num{{{}}}".format(x)


# Build one LaTeX table per discrete metric: the mean, the median and a count
# of every score value for each (edit style, system) pair.
# NOTE(review): the files written here use the same directory and file names as
# the summary tables written from metric_dfs, so for the discrete metrics they
# replace those files — confirm that this is intended.
for metric in discrete_metrics:
    # Keep only this metric's rows and the columns needed for the table.
    metric_df = (
        discrete_df[discrete_df["metric"] == metric]
        .drop(columns=["metric", "essay_id"])
        .reset_index(drop=True)
    )
    metric_df.rename(columns=output_headers, inplace=True)
    metric_df["Edit Style"] = metric_df["Edit Style"].map(output_styles)
    # Scores are cast to int so that they become integer column labels below.
    metric_df["score"] = metric_df["score"].astype(int)
    mean = metric_df.groupby(["Edit Style", "System"], observed=False)["score"].mean()
    median = metric_df.groupby(["Edit Style", "System"], observed=False)[
        "score"
    ].median()
    # Count how often each score value occurs per (edit style, system).
    metric_df = metric_df.pivot_table(
        index=["Edit Style", "System"],
        columns="score",
        aggfunc="size",
        fill_value=0,
        observed=False,
    )
    # Append the statistics, then move them in front of the score-count columns.
    metric_df["mean"] = mean
    metric_df["median"] = median
    cols = ["mean", "median"] + list(metric_df.columns[:-2])
    metric_df = metric_df[cols]
    # Two-level header: the statistics get an empty group label, the counts are
    # grouped under "Score Count".
    metric_df.columns = pd.MultiIndex.from_tuples(
        [
            ("", col) if col in ["mean", "median"] else ("Score Count", col)
            for col in metric_df.columns
        ]
    )
    print(metric)
    print(metric_df)

    # Highlight column-wise extremes; statistics are formatted with two
    # decimals, counts as plain numbers.
    latex = (
        metric_df.style.highlight_min(props=highlight["min"], axis=0)
        .highlight_max(props=highlight["max"], axis=0)
        .format(formatter=float_formatter, subset=[("", "mean"), ("", "median")])
        .format(formatter=int_formatter, subset=["Score Count"])
        .to_latex(
            **latex_args,
        )
    )

    file_name = f"{normalize_file_name(metric)}.tex"
    file_path = path.join(summary_dir, file_name)
    with open(file_path, "w+") as f:
        f.write(latex)

    print(latex)

In [None]:
# From here on the averaged manual score is treated as a non-continuous metric:
# the normality checks skip it and the paired tests use the Wilcoxon branch.
# Both steps are guarded so that re-running this cell is a no-op instead of
# raising ValueError on remove() and appending a duplicate entry.
if "manual_evaluation" in metric_types["continuous"]:
    metric_types["continuous"].remove("manual_evaluation")
if "manual_evaluation" not in metric_types["discrete"]:
    metric_types["discrete"].append("manual_evaluation")

In [None]:
significance_level = 0.05

dists = []
# Also used by the paired-test cell that follows.
grouped = df_long.groupby(
    ["system", "correction_style", "metric"], observed=False
)

# Run two normality tests on every continuous metric of every system and style.
for metric in metric_types["continuous"]:
    for team in teams:
        for style in styles:
            scores = grouped.get_group((team, style, metric))["score"].to_numpy()

            # Only the p-values are reported, so the test statistics are not
            # kept in separate variables.
            shapiro_p = shapiro(scores).pvalue
            normaltest_p = normaltest(scores).pvalue

            dists.append(
                {
                    "metric": metric,
                    "team": team,
                    "style": style,
                    # sp / np: p-value of the Shapiro-Wilk test / of normaltest.
                    # sn / nn: True when that p-value exceeds the significance
                    # level, i.e. normality is not rejected.
                    "sp": shapiro_p,
                    "sn": shapiro_p > significance_level,
                    "np": normaltest_p,
                    "nn": normaltest_p > significance_level,
                }
            )

dist_df = pd.DataFrame(dists)
print(dist_df)

In [None]:
# `grouped` is the groupby object created in the normality-check cell.
baseline = "UAM-CSI"
vikings = [t for t in teams if t != baseline]


# Paired one-sided tests of every non-baseline system against the baseline,
# for each metric and correction style.
test_results = []
for team in vikings:
    for metric in metrics:
        for style in styles:
            # Both score vectors are ordered by essay_id so that position i
            # refers to the same essay in each of them.
            candidate_scores, baseline_scores = (
                grouped.get_group((system, style, metric))
                .sort_values(by="essay_id")["score"]
                .to_numpy()
                for system in (team, baseline)
            )

            if metric in metric_types["continuous"]:
                stat, p_value = ttest_rel(
                    candidate_scores, baseline_scores, alternative="greater"
                )
            else:
                # The test runs on the paired differences, rounded to three decimals.
                diffs = np.around(candidate_scores - baseline_scores, 3)
                stat, p_value = wilcoxon(diffs, alternative="greater")

            result = {
                "team": team,
                "style": style,
                "metric": metric,
                "statistic": stat,
                "p_value": p_value,
                "significant": p_value < significance_level,
            }
            test_results.append(result)

test_results_df = pd.DataFrame(test_results).sort_values(
    by=["metric", "style", "team"], ascending=[True, False, False]
)

# Slimmed-down copy holding only the columns shown in the results table.
cols = ["metric", "style", "team", "p_value"]
pretty_test_results_df = test_results_df.copy()[cols].reset_index(drop=True)


# Header names for the test-result tables.
# The first two entries repeat the mapping that the summary-table cells rely on
# (they rename with output_headers and then use "Edit Style" / "System").
# Keeping them here means that re-running those cells after this one still
# works; rename() simply ignores keys that do not match a column.
output_headers = {
    "correction_style": "Edit Style",
    "system": "System",
    "style": "Edit Style",
    "team": "System",
    "p_value": "$ p $-value",
    "significant": "Significant",
    "metric": "Metric",
    "statistic": "Statistic",
}


# Apply the display names to the headers, the styles and the metrics.
renamed = test_results_df.rename(columns=output_headers)
renamed = renamed.assign(
    **{
        "Edit Style": renamed["Edit Style"].map(output_styles),
        "Metric": renamed["Metric"].map(output_metrics),
    }
)

# Wide layout: one row per metric, one p-value column per (style, system).
pivot = renamed.pivot_table(
    index=[output_headers["metric"]],
    columns=[output_headers["style"], output_headers["team"]],
    values=[output_headers["p_value"]],
)

latex_args.update(column_format=None, siunitx=True)


def scientific_formatter(x):
    """Format ``x`` in scientific notation inside ``\\num{...}``.

    Missing values are rendered as ``\\text{NaN}``.
    """
    if pd.isnull(x):
        return "\\text{NaN}"
    return "\\num{" + format(x, ".2e") + "}"

# Background colour for p-values below the significance level.
green = "background-color: kth-lightgreen"
# NOTE(review): this `latex` string is neither printed nor written here, and the
# multi-index cell below assigns `latex` again — confirm whether it is still needed.
pivot_styler = pivot.style.map(
    lambda p: (green if float(p) < significance_level else ""),
    subset=[output_headers["p_value"]],
)
pivot_styler = pivot_styler.format(formatter=scientific_formatter)
latex = pivot_styler.to_latex(**latex_args)


## Test Multi Index

In [None]:
pretty_test_results_df = test_results_df.copy()
cols = ["metric", "style", "team", "p_value"]
pretty_test_results_df = pretty_test_results_df[cols].reset_index(drop=True)

renamed = pretty_test_results_df.rename(columns=output_headers)
renamed["Edit Style"] = renamed["Edit Style"].map(output_styles)
renamed["Metric"] = renamed["Metric"].map(output_metrics)

# Display names look like "FAMILY: Submetric". Split at the first colon only
# (n=1), so a name containing a further colon still yields exactly the two
# columns assigned here. Names without a colon get "-" as their submetric.
renamed[["Metric", "Submetric"]] = renamed["Metric"].str.split(
    ":", n=1, expand=True
)
renamed["Submetric"] = renamed["Submetric"].str.strip().fillna("-")

# Columns that form the row index of the table, outermost level first.
index_cols = ["Metric", "Submetric", "Edit Style", "System"]


# Move those columns into a four-level row index.
renamed = renamed.set_index(index_cols)

# Ordered categorical dtypes, one per index level of the result table.
metric_order = CategoricalDtype(
    [output_metrics["gleu"], "ERRANT", output_metrics["scribendi_score"], "SOME"],
    ordered=True,
)

# Metric keys whose display name has the form "FAMILY: Submetric".
submetrics_raw = [
    "precision",
    "recall",
    "f0.5",
    "grammaticality",
    "fluency",
    "meaning_preservation",
    "manual_evaluation",
]

# The part after the colon of each display name, followed by the "-"
# placeholder used for metrics without a submetric.
submetrics = [output_metrics[key].split(":")[1].strip() for key in submetrics_raw]
submetrics.append("-")

submetric_order = CategoricalDtype(submetrics, ordered=True)

style_order = CategoricalDtype(["Minimal", "Fluency"], ordered=True)

system_order = CategoricalDtype(["Viking-7B", "Viking-13B"], ordered=True)


# Re-type every index level with its ordered categorical dtype before sorting;
# presumably this makes sort_index() follow the presentation order rather than
# the alphabetical one — verify against the rendered table.
level_dtypes = [metric_order, submetric_order, style_order, system_order]
for level_number, level_dtype in enumerate(level_dtypes):
    renamed.index = renamed.index.set_levels(
        renamed.index.levels[level_number].astype(level_dtype),
        level=level_number,
    )
renamed = renamed.sort_index()

latex_args.update(column_format=None, siunitx=True)


def scientific_formatter(x):
    """Return ``x`` in two-decimal scientific notation wrapped in ``\\num{...}``.

    Null values (as judged by ``pd.notnull``) become ``\\text{NaN}``.
    """
    is_missing = not pd.notnull(x)
    if is_missing:
        return "\\text{NaN}"
    mantissa_exponent = "{:.2e}".format(x)
    return "\\num{" + mantissa_exponent + "}"


# Background colour for p-values below the significance level.
green = "background-color: kth-lightgreen"
results_styler = renamed.style.map(
    lambda p: (green if float(p) < significance_level else ""),
)
results_styler = results_styler.format(formatter=scientific_formatter)
latex = results_styler.to_latex(**latex_args)


# Echo the table and store it next to the other generated tables.
print(latex)
file_name = "test_results.tex"
file_path = path.join(tables_dir, file_name)
with open(file_path, "w+") as f:
    f.write(latex)