In [None]:
import pandas as pd
from scipy.stats import ttest_rel, wilcoxon
import numpy as np
import matplotlib.pyplot as plt 

In [None]:
# Read the scores table from disk and print its column names as a quick check
# of what was loaded.
scores_csv = "scores.csv"
df = pd.read_csv(scores_csv)
print(df.columns.tolist())

In [None]:
def extract_unique_values(df, column):
    """Return the distinct values found in ``df[column]`` as a Python list.

    Args:
        df: DataFrame to read from.
        column: Label of the column whose distinct values are wanted.

    Returns:
        A list with one entry per distinct value, in order of first appearance.
    """
    distinct = df[column].unique()
    return distinct.tolist()

In [None]:
# Correction styles (matched against the "Correction Style" column).
MINIMAL, FLUENCY = "minimal", "fluency"
versions = [MINIMAL, FLUENCY]

# Systems under comparison (matched against the "System" column).
VIKING, UAM_CSI = "Viking", "UAM-CSI"
teams = [VIKING, UAM_CSI]

# Metric names; each one is used as a column label of the scores table.
GLEU = "GLEU"
PRECISION = "Precision"
RECALL = "Recall"
F05 = "F0.5"
SCRIBENDI_SCORE = "Scribendi Score"
metrics = [
    GLEU,
    PRECISION,
    RECALL,
    F05,
    SCRIBENDI_SCORE,
]

In [None]:
# Order rows by system, then correction style, then essay, so that every
# (system, style) slice of the table lists its essays in ascending Essay ID.
df = df.sort_values(["System", "Correction Style", "Essay ID"])

In [None]:
def extract_team_version_metric(team, version, metric):
    """Return one metric column for a single system and correction style.

    Reads the module-level ``df``. Rows are kept where ``System`` equals
    ``team`` and ``Correction Style`` equals ``version``; the ``metric``
    column of those rows is returned as a NumPy array in ``df``'s row order.
    """
    is_team = df["System"] == team
    is_version = df["Correction Style"] == version
    return df.loc[is_team & is_version, metric].to_numpy()


def extract_team_version(team, version):
    """Gather the values of every metric for one system and correction style.

    Returns a dict with one key per entry of the module-level ``metrics``
    list; each value is the array produced by ``extract_team_version_metric``.
    """
    per_metric = {}
    for name in metrics:
        per_metric[name] = extract_team_version_metric(team, version, name)
    return per_metric


def extract_team(team):
    """Gather the per-metric values of one system for every correction style.

    Returns a dict with one key per entry of the module-level ``versions``
    list; each value is the dict produced by ``extract_team_version``.
    """
    per_version = {}
    for style in versions:
        per_version[style] = extract_team_version(team, style)
    return per_version


# Nested lookup table: values[team][version][metric] -> NumPy array of scores.
values = {name: extract_team(name) for name in teams}

In [None]:
# Metrics grouped by the paired significance test applied to them.
continuous_metrics = [GLEU, PRECISION, RECALL, F05]
ordinal_metrics = [SCRIBENDI_SCORE]
alpha = 0.05  # a p-value below this threshold is flagged as significant

# Each entry pairs a group of metrics with the test function run on them.
metric_tests = [
    (continuous_metrics, ttest_rel),
    (ordinal_metrics, wilcoxon),
]

# Compare Viking against UAM-CSI for every metric and correction style,
# collecting one record per (metric, version) pair.
#
# The outer loop target is deliberately NOT named `metrics`: a `for` target is
# assigned in the enclosing (module) scope, so that name would overwrite the
# module-level `metrics` list read by `extract_team_version`, and re-running
# the cell that builds `values` afterwards would silently drop metrics.
#
# NOTE(review): both tests pair the two arrays element by element, which relies
# on `df` having been sorted by "Essay ID" before `values` was built and on
# both systems covering the same essays for each style — TODO confirm.
results = []
for metric_group, test_function in metric_tests:
    for metric in metric_group:
        for version in versions:
            test_result = test_function(
                values[VIKING][version][metric], values[UAM_CSI][version][metric]
            )
            p_value = test_result.pvalue
            significant = p_value < alpha
            result_dict = {
                "metric": metric,
                "version": version,
                "p_value": p_value,
                "significant": significant,
            }
            results.append(result_dict)