# Plots and Tables

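This notebook reads the long-format score file `scores_long.csv`, plots the score distributions per system and edit style, exports LaTeX summary tables, and runs normality checks and paired significance tests on the scores.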

## Imports

Import relevant packages

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib.lines import Line2D
import seaborn as sns
from os import path, makedirs
from scipy.stats import wilcoxon, shapiro, normaltest, ttest_rel
import numpy as np

## Ordering

Set up the ordering of categories for the tables in the report.

In [None]:
# Ordered categorical dtype for every categorical column; the listed order is
# the category order used whenever these columns are sorted or grouped.
orders = {
    column: CategoricalDtype(categories=levels, ordered=True)
    for column, levels in (
        ("Edit Style", ["Minimal", "Fluency"]),
        ("Metric", ["GLEU", "ERRANT", "Scribendi Score", "SOME"]),
        (
            "Submetric",
            [
                "-",
                "Precision",
                "Recall",
                "$\\text{F}_{0.5}$-Score",
                "Grammaticality",
                "Fluency",
                "Meaning Preservation",
                "Total",
            ],
        ),
        ("System", ["UAM-CSI", "Viking-7B", "Viking-13B"]),
    )
}

## Read File

Read the raw CSV file into a pandas `DataFrame` and set up variables for holding category values.

In [None]:
scores_csv_file = "scores_long.csv"
df = pd.read_csv(scores_csv_file)

# Metrics without a submetric have an empty "Submetric" cell; mark those with
# the "-" category. Only this column is filled so that a missing value in any
# other column (e.g. "Score") stays NaN instead of silently becoming the
# string "-" and turning a numeric column into object dtype.
df["Submetric"] = df["Submetric"].fillna("-")

# Convert the categorical columns to their ordered dtypes.
for col, dtype in orders.items():
    df[col] = df[col].astype(dtype)

df = df.sort_index()


# Category values reused by the cells below. `unique()` keeps the order in
# which the values first appear in the file.
columns = df.columns
essay_ids = df["Essay ID"].unique()
styles = df["Edit Style"].unique()
# Each metric is identified by a (Metric, Submetric) tuple.
metrics = list(
    df[["Metric", "Submetric"]].drop_duplicates().itertuples(index=False, name=None)
)
systems = df["System"].unique()

In [None]:
def format_label(label):
    """Turn a metric tuple or an edit-style name into a display label.

    A ``(Metric, Submetric)`` tuple found in ``metrics`` becomes
    ``"Metric"`` when the submetric is ``"-"`` and ``"Metric: Submetric"``
    otherwise. A value found in ``styles`` gets the suffix ``" Edits"``.
    Anything else is returned unchanged.
    """
    if label in metrics:
        major, minor = label
        return major if minor == "-" else f"{major}: {minor}"
    if label in styles:
        return f"{label} Edits"
    return label


def normalize_metric(metric):
    match metric:
        case ("GLEU", "-"):
            return "gleu"
        case ("ERRANT", "Precision"):
            return "errant_precision"
        case ("ERRANT", "Recall"):
            return "errant_recall"
        case ("ERRANT", "$\\text{F}_{0.5}$-Score"):
            return "errant_f05"
        case ("Scribendi Score", "-"):
            return "scribendi_score"
        case ("SOME", "Grammaticality"):
            return "some_grammaticality"
        case ("SOME", "Fluency"):
            return "some_fluency"
        case ("SOME", "Meaning Preservation"):
            return "some_meaning_preservation"
        case ("SOME", "Total"):
            return "some_total"
        case _:
            raise ValueError(f"Unknown metric: {metric}")


def get_image_file_name(metric):
    """Return the PNG file name for ``metric`` (normalized name + ``.png``)."""
    return f"{normalize_metric(metric)}.png"


def normalize_file_name(file_name):
    """Lower-case ``file_name``, drop every ``.`` and turn spaces into ``_``."""
    replacements = str.maketrans({".": None, " ": "_"})
    return file_name.lower().translate(replacements)


# Directory that receives the saved figures; created on first run.
plots_dir = "plots/"
makedirs(plots_dir, exist_ok=True)

# Metrics split by score type. The values are lists of (Metric, Submetric)
# tuples; they are plain lists because a later cell moves entries between them.
metric_types = dict(
    continuous=[
        ("GLEU", "-"),
        ("ERRANT", "Precision"),
        ("ERRANT", "Recall"),
        ("ERRANT", "$\\text{F}_{0.5}$-Score"),
        ("SOME", "Total"),
    ],
    discrete=[
        ("Scribendi Score", "-"),
        ("SOME", "Grammaticality"),
        ("SOME", "Fluency"),
        ("SOME", "Meaning Preservation"),
    ],
)


def get_minmax(v):
    """Return ``(minimum, maximum)`` of ``v`` using its ``min``/``max`` methods."""
    lowest = v.min()
    highest = v.max()
    return lowest, highest


def is_even(n):
    """Return whether ``n`` leaves no remainder when divided by two."""
    _, remainder = divmod(n, 2)
    return remainder == 0


def get_metric_df(metric):
    """Return the rows of ``df`` that belong to a ``(Metric, Submetric)`` pair.

    When the submetric is missing (NaN) only the metric name is used to
    select rows.
    """
    major, minor = metric
    selected = df["Metric"] == major
    if not pd.isna(minor):
        selected = selected & (df["Submetric"] == minor)
    return df[selected]


# Vertical distance between a system's tick and the centre of each of its two
# violins; the mean/median markers are drawn at tick -/+ offset.
# NOTE(review): presumably matches seaborn's default dodge for two hue levels
# -- confirm if the violin width or the number of edit styles changes.
offset = 0.2

# Legend entries for the summary-statistic markers. They are identical for
# every figure, so they are built once instead of once per metric.
stat_handles = [
    Line2D(
        [0],
        [0],
        marker="s",
        color="black",
        label="Mean",
        markerfacecolor="black",
        markeredgecolor="white",
        linestyle="",
    ),
    Line2D(
        [0],
        [0],
        marker="o",
        color="white",
        label="Median",
        markerfacecolor="white",
        markeredgecolor="black",
        linestyle="",
    ),
]


# One horizontal violin plot per metric: systems on the y axis, scores on the
# x axis, one violin per edit style, with mean and median markers on top.
for metric in metrics:

    metric_df = get_metric_df(metric)

    fig, ax = plt.subplots(figsize=(10, 5))
    # Draw on the axes created above instead of relying on pyplot's notion of
    # the "current" axes.
    sns.violinplot(
        metric_df,
        y="System",
        x="Score",
        cut=0,
        hue="Edit Style",
        inner=None,
        density_norm="area",
        ax=ax,
    )

    # Capture the hue entries before the scatter calls add their own labels.
    violin_handles, violin_labels = ax.get_legend_handles_labels()

    grouped = metric_df.groupby(["System", "Edit Style"], observed=False)["Score"]
    means = grouped.mean()
    medians = grouped.median()

    # The grouped result is ordered system-major, so consecutive pairs of
    # entries belong to the same system (this assumes exactly two edit
    # styles): the first is drawn above the tick, the second below it.
    ys = [i // 2 - offset if is_even(i) else i // 2 + offset for i in range(len(means))]
    ax.scatter(
        y=ys,
        x=means,
        marker="s",
        color="black",
        edgecolors="white",
        zorder=3,
        label="Mean",
    )
    ax.scatter(
        y=ys,
        x=medians,
        marker="o",
        color="white",
        edgecolors="black",
        zorder=3,
        label="Median",
    )

    # Ensure axes show integers for discrete metrics
    if metric in metric_types["discrete"]:
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        ax.yaxis.set_major_locator(MaxNLocator(integer=True))

    # Two-section legend: invisible dummy handles carry the section headers.
    combined_handles = [
        Line2D([], [], linestyle="none"),
        *violin_handles,
        Line2D([], [], linestyle="none"),
        *stat_handles,
    ]
    combined_labels = [
        "Edit Style",
        *violin_labels,
        "Statistics",
        *(h.get_label() for h in stat_handles),
    ]

    # Place the legend outside the axes, to the right of the plot.
    ax.legend(
        combined_handles,
        combined_labels,
        loc="upper left",
        bbox_to_anchor=(1, 1),
        frameon=True,
        ncol=1,
        handletextpad=1,
    )

    ax.set(xlabel=format_label(metric), ylabel="System")
    file_name = get_image_file_name(metric)
    file_path = path.join(plots_dir, file_name)
    # Lay out once, before saving, so the saved and displayed figures agree.
    fig.tight_layout()
    fig.savefig(file_path)
    plt.show()
    # Release the figure so looping over many metrics does not accumulate them.
    plt.close(fig)

In [None]:
# One line plot per metric: score per essay, coloured by system and with the
# line style encoding the edit style.
for metric in metrics:
    metric_df = get_metric_df(metric)

    # Explicit figure/axes instead of pyplot's implicit current axes.
    fig, ax = plt.subplots()
    sns.lineplot(
        metric_df, x="Essay ID", y="Score", hue="System", style="Edit Style", ax=ax
    )
    ax.set(xlabel="Essay ID", ylabel=format_label(metric))

    # Re-label the legend entries. A concrete list is passed because the
    # legend API expects a sequence of labels, not a one-shot iterator.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, [format_label(label) for label in labels])
    # Move the legend to the right of the axes.
    sns.move_legend(
        ax,
        "upper center",
        bbox_to_anchor=(1.35, 1),
        ncol=1,
        frameon=True,
    )
    plt.show()
    plt.close(fig)

In [None]:
# One grid of histograms per metric: a row per system, a column per edit style.
for metric in metrics:
    metric_df = get_metric_df(metric)
    g = sns.FacetGrid(
        metric_df,
        col="Edit Style",
        row="System",
        margin_titles=True,
        sharex=True,
        sharey=True,
    )
    g.map_dataframe(sns.histplot, x="Score", bins=8)

    # Set axis labels and titles
    g.set_axis_labels("Score", "Count")
    g.set_titles(row_template="{row_name}", col_template="{col_name}")

    # Title the whole figure. `plt.title` would only set the title of the
    # current (last drawn) facet rather than label the grid.
    g.figure.suptitle(format_label(metric))

    plt.tight_layout()
    plt.show()

In [None]:
def save_latex_table(latex, file_name, directory=None):
    """Write a LaTeX table to a file, replacing any existing content.

    Args:
        latex: The LaTeX source to write.
        file_name: Name of the file inside the target directory.
        directory: Directory to write into. Defaults to the notebook-level
            ``tables_dir``.
    """
    target_dir = tables_dir if directory is None else directory
    file_path = path.join(target_dir, file_name)
    # Write-only mode is enough, and an explicit encoding keeps the output
    # identical across platforms.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(latex)


def get_spread(x):
    """Return the range of ``x``: its maximum minus its minimum."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest


def get_tex_file_name(metric):
    """Return the LaTeX file name for ``metric`` (normalized name + ``.tex``)."""
    return f"{normalize_metric(metric)}.tex"


# Metrics present in the data that are currently classified as continuous.
continuous_metrics = list(filter(lambda m: m in metric_types["continuous"], metrics))

# Keyword arguments shared by every `Styler.to_latex` call in this notebook.
latex_args = dict(
    sparse_index=True,
    convert_css=True,
    clines="skip-last;data",
    hrules=True,
    column_format=None,
    siunitx=True,
    multicol_align="c",
)


def float_formatter(x):
    """Format ``x`` with two decimals, wrapped in a LaTeX ``\\num{...}``."""
    return "\\num{" + format(x, ".2f") + "}"


# Cell styles used to mark the best ("max") and worst ("min") value per column.
highlight = dict(
    max="background-color: kth-lightblue40",
    min="background-color: kth-lightred40",
)

# Output directories for the LaTeX tables.
tables_dir = "tables/"
makedirs(tables_dir, exist_ok=True)
summary_dir = path.join(tables_dir, "summary")
makedirs(summary_dir, exist_ok=True)

# Column headers carry an arrow: up for higher-is-better statistics, down for
# lower-is-better ones.
uparrow = r"$\uparrow$"
downarrow = r"$\downarrow$"
aggregated_column_names = {
    **{name: name + uparrow for name in ("mean", "median", "min", "max")},
    "spread": "spread" + downarrow,
}

# Renamed columns grouped by the direction in which they are highlighted.
hi_better = [aggregated_column_names[name] for name in ("mean", "median", "min", "max")]
lo_better = [aggregated_column_names[name] for name in ("spread",)]

# For every continuous metric: aggregate the scores per (Edit Style, System),
# highlight the best and worst value in each column and export the table.
for metric in continuous_metrics:
    metric_df = get_metric_df(metric)

    # Row order of the table: every edit style paired with every system.
    index = pd.MultiIndex.from_product(
        [metric_df["Edit Style"].unique(), metric_df["System"].unique()],
        names=["Edit Style", "System"],
    )

    score_groups = metric_df.groupby(["Edit Style", "System"], observed=False)
    summary = score_groups.agg(
        mean=("Score", "mean"),
        median=("Score", "median"),
        min=("Score", "min"),
        max=("Score", "max"),
        spread=("Score", get_spread),
    )
    summary = summary.reindex(index)
    summary = summary.rename(columns=aggregated_column_names)
    print(metric)

    styler = summary.style
    # Higher-is-better columns: smallest value gets the "min" style, largest
    # the "max" style.
    styler = styler.highlight_min(props=highlight["min"], subset=hi_better, axis=0)
    styler = styler.highlight_max(props=highlight["max"], subset=hi_better, axis=0)
    # Lower-is-better columns: the styles are swapped.
    styler = styler.highlight_min(props=highlight["max"], subset=lo_better, axis=0)
    styler = styler.highlight_max(props=highlight["min"], subset=lo_better, axis=0)
    styler = styler.format(formatter=float_formatter)
    latex = styler.to_latex(**latex_args)

    print(latex)
    file_name = get_tex_file_name(metric)
    save_latex_table(latex, file_name)

In [None]:
# Metrics present in the data that are currently classified as discrete.
discrete_metrics = list(filter(lambda m: m in metric_types["discrete"], metrics))


def int_formatter(x):
    """Wrap ``x``, formatted without a format spec, in a LaTeX ``\\num{...}``."""
    return "\\num{" + format(x) + "}"


# For every discrete metric: build a table with the mean, the median and the
# number of occurrences of each score value per (Edit Style, System), then
# export it as LaTeX.
for metric in discrete_metrics:
    print(metric)
    metric_df = get_metric_df(metric)

    scores = metric_df.groupby(["Edit Style", "System"], observed=False)["Score"]
    mean = scores.mean()
    median = scores.median()

    # One column per distinct score value; each cell holds the number of rows
    # with that score (0 where the combination does not occur).
    summary = metric_df.pivot_table(
        index=["Edit Style", "System"],
        columns="Score",
        aggfunc="size",
        fill_value=0,
        observed=False,
    )

    # Append the statistics, then move them in front of the count columns.
    # `summary.columns[:-2]` is every column except the two just appended.
    summary["mean"] = mean
    summary["median"] = median
    columns = ["mean", "median"] + list(summary.columns[:-2])
    summary = summary[columns]

    # Two-level header: the statistics sit under an empty group, the count
    # columns under "Score Count" with the score value cast to int.
    summary.columns = pd.MultiIndex.from_tuples(
        [
            ("", col) if col in ["mean", "median"] else ("Score Count", int(col))
            for col in summary.columns
        ]
    )

    # Highlight the smallest and largest value of every column, format the
    # statistics with two decimals and the counts without a format spec.
    latex = (
        summary.style.highlight_min(props=highlight["min"], axis=0)
        .highlight_max(props=highlight["max"], axis=0)
        .format(formatter=float_formatter, subset=[("", "mean"), ("", "median")])
        .format(formatter=int_formatter, subset=["Score Count"])
        .to_latex(
            **latex_args,
        )
    )

    file_name = get_tex_file_name(metric)
    save_latex_table(latex, file_name)

    print(latex)

In [None]:
# From here on ("SOME", "Total") is treated as a discrete metric.
# The membership checks make the cell safe to re-run: an unconditional
# remove() raises ValueError the second time and append() would add a
# duplicate entry.
some_total = ("SOME", "Total")
if some_total in metric_types["continuous"]:
    metric_types["continuous"].remove(some_total)
if some_total not in metric_types["discrete"]:
    metric_types["discrete"].append(some_total)

In [None]:
# Threshold below which a p-value counts as significant.
significance_level = 0.05

# One record per (metric, system, edit style) with the normality-test results.
dists = []

for metric in metric_types["continuous"]:
    metric_df = get_metric_df(metric)
    for team in systems:
        team_df = metric_df[metric_df["System"] == team]
        for style in styles:
            style_df = team_df[team_df["Edit Style"] == style]
            scores = style_df["Score"].to_numpy()

            # Run two normality tests on the same scores; only the p-values
            # are recorded, the test statistics are not used.
            shapiro_stat, shapiro_p = shapiro(scores)
            normaltest_stat, normaltest_p = normaltest(scores)

            dists.append(
                {
                    "metric": format_label(metric),
                    "team": team,
                    "style": style,
                    # "sp"/"np": p-value of shapiro / normaltest.
                    # "sn"/"nn": True when that p-value is above the
                    # significance level.
                    "sp": shapiro_p,
                    "sn": shapiro_p > significance_level,
                    "np": normaltest_p,
                    "nn": normaltest_p > significance_level,
                }
            )

dist_df = pd.DataFrame(dists)
display(dist_df)

In [None]:
# The system every other system is compared against.
baseline = "UAM-CSI"
# All remaining systems, in the order they appear in `systems`.
vikings = list(filter(lambda team: team != baseline, systems))


def get_alternate_hypothesis(hypothesis):
    """Return the test direction encoded by a hypothesis number.

    Odd numbers map to ``"greater"``, even numbers to ``"less"``.
    """
    return "less" if hypothesis % 2 == 0 else "greater"


def format_hypothesis(hypothesis):
    """Return the LaTeX math label ``$H_{<hypothesis>}$``."""
    return "".join(["$H_{", format(hypothesis), "}$"])


def perform_statistical_test(metric, scores, alternative):
    """Run a paired test on two score arrays.

    Metrics listed in ``metric_types["continuous"]`` use ``ttest_rel``; all
    other metrics use ``wilcoxon`` on the differences rounded to three
    decimals.

    Args:
        metric: The ``(Metric, Submetric)`` tuple the scores belong to.
        scores: Sequence whose first two items are the paired score arrays.
        alternative: Alternative hypothesis passed through to the test.

    Returns:
        The result object of the chosen scipy test.
    """
    first, second = scores[0], scores[1]
    if metric not in metric_types["continuous"]:
        rounded_diffs = np.around(first - second, 3)
        return wilcoxon(rounded_diffs, alternative=alternative)
    return ttest_rel(first, second, alternative=alternative)


# Scores grouped by (System, Edit Style, Metric, Submetric); the significance
# tests below look groups up with tuples in exactly this key order.
grouped = df.groupby(
    by=["System", "Edit Style", "Metric", "Submetric"], observed=False
)


def perform_statistical_tests(hypothesis):
    """Test every non-baseline system against the baseline.

    For each system in ``vikings``, each metric and each edit style, the
    system's scores and the baseline's scores (both ordered by essay ID) are
    passed to ``perform_statistical_test``.

    Args:
        hypothesis: Hypothesis number; selects the test direction via
            ``get_alternate_hypothesis`` and labels the result rows.

    Returns:
        A list with one result dict per (system, metric, edit style).
    """
    alternative = get_alternate_hypothesis(hypothesis)
    rows = []

    for team in vikings:
        for metric in metrics:
            major, minor = metric
            for style in styles:
                # Challenger first, baseline second.
                paired_scores = []
                for system in (team, baseline):
                    group = grouped.get_group((system, style, *metric))
                    ordered = group.sort_values(by="Essay ID")
                    paired_scores.append(ordered["Score"].to_numpy())

                stat, p_value = perform_statistical_test(
                    metric, paired_scores, alternative
                )

                rows.append(
                    {
                        "System": team,
                        "Edit Style": style,
                        "Metric": major,
                        "Submetric": minor,
                        "statistic": stat,
                        "$p$-value": p_value,
                        "Hypothesis": format_hypothesis(hypothesis),
                    }
                )

    return rows

## Test Multi Index

In [None]:
# Hypothesis 1 tests "greater" and hypothesis 2 tests "less" (odd/even rule).
pos, neg = (perform_statistical_tests(number) for number in (1, 2))
test_results = [*pos, *neg]
test_results_df = pd.DataFrame(test_results)
display(test_results_df)

In [None]:
def prepare_test_results(test_results_df, columns):
    """Pivot test results so each hypothesis gets its own p-value column.

    Args:
        test_results_df: Frame with one row per test result.
        columns: Columns to keep; must include ``"$p$-value"`` and
            ``"Hypothesis"``.

    Returns:
        A frame indexed by whichever of Metric, Submetric, Edit Style and
        System are present, sorted by those levels using the category orders
        in ``orders``.
    """
    selected = test_results_df[columns]

    candidate_levels = ("Metric", "Submetric", "Edit Style", "System")
    pivot_index = [name for name in candidate_levels if name in selected.columns]

    def as_ordered(col):
        # Sort by category order where one is defined for the column.
        if col.name in orders:
            return col.astype(orders[col.name])
        return col

    return (
        selected.pivot(index=pivot_index, columns=["Hypothesis"], values=["$p$-value"])
        .reset_index()
        .sort_values(by=pivot_index, key=as_ordered)
        .set_index(pivot_index)
    )


def generate_latex_table(df, latex_args, significance_level, green, formatter):
    return (
        df.style.map(lambda p: (green if float(p) < significance_level else ""))
        .format(formatter=formatter)
        .to_latex(**latex_args)
    )




def scientific_formatter(x):
    """Format ``x`` in scientific notation inside ``\\num{...}``.

    Missing values are rendered as ``\\text{NaN}``.
    """
    if not pd.notnull(x):
        return "\\text{NaN}"
    return "\\num{" + format(x, ".2e") + "}"


# Columns of the test results that end up in the table.
columns = ["Metric", "Submetric", "Edit Style", "System", "$p$-value", "Hypothesis"]


# Pivot and sort the baseline comparison results.
renamed = prepare_test_results(test_results_df=test_results_df, columns=columns)

# Render the table, marking significant p-values.
green = "background-color: kth-lightgreen"
latex = generate_latex_table(
    df=renamed,
    latex_args=latex_args,
    significance_level=significance_level,
    green=green,
    formatter=scientific_formatter,
)

# Write the table to disk and echo it.
file_name = "test_results.tex"
save_latex_table(latex, file_name)

print(latex)

### Compare Viking-Based Systems

In [None]:
# Hypotheses tested for every metric and edit style:
#   H3: Viking-7B > Viking-13B
#   H4: Viking-7B < Viking-13B
# The pair is spelled out explicitly: `vikings` follows the order in which the
# systems first appear in the data, so relying on it could silently swap the
# operands and invert the meaning of H3 and H4.
viking_pair = ("Viking-7B", "Viking-13B")
hypotheses = [3, 4]

viking_results = []
for metric in metrics:
    major, minor = metric
    for style in styles:
        # Paired scores of the two systems, aligned by essay ID.
        scores = [
            grouped.get_group((system, style, *metric))
            .sort_values(by="Essay ID")["Score"]
            .to_numpy()
            for system in viking_pair
        ]
        for hypothesis in hypotheses:
            stat, p_value = perform_statistical_test(
                metric, scores, get_alternate_hypothesis(hypothesis)
            )
            viking_results.append(
                {
                    "Edit Style": style,
                    "Metric": major,
                    "Submetric": minor,
                    "statistic": stat,
                    "$p$-value": p_value,
                    "Hypothesis": format_hypothesis(hypothesis),
                }
            )

viking_results_df = pd.DataFrame(viking_results)
# Prepare the test results DataFrame
columns = ["Metric", "Submetric", "Edit Style", "$p$-value", "Hypothesis"]
renamed = prepare_test_results(viking_results_df, columns)
display(renamed)

# Generate the LaTeX table
green = "background-color: kth-lightgreen"
latex = generate_latex_table(
    renamed, latex_args, significance_level, green, scientific_formatter
)

# Save the LaTeX table to a file
file_name = "viking_pairwise_test_results.tex"
save_latex_table(latex, file_name)

print(latex)

In [None]:
# Hypotheses tested for every metric and Viking system:
#   H5: Minimal > Fluency
#   H6: Minimal < Fluency
# The pair is spelled out explicitly: `styles` follows the order in which the
# edit styles first appear in the data, so relying on it could silently swap
# the operands and invert the meaning of H5 and H6.
style_pair = ("Minimal", "Fluency")
hypotheses = [5, 6]

style_results = []

for metric in metrics:
    major, minor = metric
    for v in vikings:
        # Paired scores of the two edit styles, aligned by essay ID.
        scores = [
            grouped.get_group((v, style, *metric))
            .sort_values(by="Essay ID")["Score"]
            .to_numpy()
            for style in style_pair
        ]
        for hypothesis in hypotheses:
            stat, p_value = perform_statistical_test(
                metric, scores, get_alternate_hypothesis(hypothesis)
            )
            style_results.append(
                {
                    "System": v,
                    "Metric": major,
                    "Submetric": minor,
                    "$p$-value": p_value,
                    "Hypothesis": format_hypothesis(hypothesis),
                }
            )

style_results_df = pd.DataFrame(style_results)
# Prepare the test results DataFrame
columns = ["Metric", "Submetric", "System", "$p$-value", "Hypothesis"]
renamed = prepare_test_results(style_results_df, columns)
display(renamed)

# Generate the LaTeX table
green = "background-color: kth-lightgreen"
latex = generate_latex_table(
    renamed, latex_args, significance_level, green, scientific_formatter
)

# Save the LaTeX table to a file
file_name = "viking_style_test_results.tex"
save_latex_table(latex, file_name)

print(latex)