# Automatic Evaluation

This notebook performs the automatic evaluation of the system outputs.

Make sure you have read the `README` in this directory and installed all required packages.

## Imports


In [None]:
from os import path, makedirs, popen, system
import re
import pandas as pd
import tqdm.notebook as tqdm
from itertools import product
from syntok.tokenizer import Tokenizer
from scribendi import ScribendiScore

In [None]:
# Directory for ERRANT M2 files; compute_errant builds its reference and
# hypothesis M2 paths under it.
M2DIR = "../m2/"
# Base directory against which the data, source, reference, hypothesis and
# system-output directories are resolved.
REPO_ROOT = "../"

### Processing Functions


In [None]:
def pretokenize(txt):
    """
    Tokenize txt with a syntok Tokenizer and return the tokens, each stripped
    of surrounding whitespace, joined by single spaces.
    """
    tokenizer = Tokenizer()
    stripped_tokens = (str(tok).strip() for tok in tokenizer.tokenize(txt))
    return " ".join(stripped_tokens)


NEWLINE = "\n"
SPACE = " "


def convert_essay_to_single_line(essay: str):
    """
    Return essay with every newline ("\\n") turned into a space (" ").
    """
    # Splitting on the newline and re-joining with a space substitutes each
    # newline one-for-one, so consecutive newlines become consecutive spaces.
    lines = essay.split(NEWLINE)
    return SPACE.join(lines)


def md_to_dict(md):
    """
    Turn the content of a shared task Markdown file into a dictionary that
    maps each essay ID to its tokenized, single-line essay text.

    Arguments:

    md --- a string with the content of a shared task Markdown file.
    """
    # Everything before the first "### essay_id = " marker is not an essay.
    sections = md.split("### essay_id = ")[1:]

    parsed = {}
    # The first line of a section is the essay ID, the remainder is the text.
    for header, body in (section.split("\n", maxsplit=1) for section in sections):
        tokenized = pretokenize(body).strip("\n")
        parsed[header] = convert_essay_to_single_line(tokenized)
    return parsed


def write_essay_to_file(output_dir, essay_id, essay_text):
    """
    Write one essay to ``<output_dir>/<essay_id>.tmp`` and return that path.

    Arguments:

    output_dir --- an existing directory in which the file is created.
    essay_id --- the essay identifier, used as the base name of the file.
    essay_text --- the text to write; an existing file is overwritten.
    """
    # The ".tmp" suffix belongs inside the string: `f"{essay_id}".tmp` is an
    # attribute lookup on a str and raises AttributeError.
    file_name = f"{essay_id}.tmp"
    file_path = path.join(output_dir, file_name)
    with open(file_path, "w") as f:
        f.write(essay_text)
    return file_path


def _ensure_directory_exists(directory):
    """
    Create directory, including missing parents; an already existing
    directory is left as it is.
    """
    makedirs(name=directory, exist_ok=True)


def split_file_per_essay(input_file, output_dir):
    """
    Split a shared task Markdown file into one ``.tmp`` file per essay.

    Arguments:

    input_file --- path to a shared task Markdown file.
    output_dir --- directory for the per-essay files; created if missing.

    Returns a dictionary mapping each essay ID to the path of its file.
    """
    _ensure_directory_exists(output_dir)

    # md_to_dict parses Markdown *content*, so the file has to be read first;
    # handing it the path itself yields no essays at all.
    with open(input_file) as f:
        ids_texts = md_to_dict(f.read())

    # write_essay_to_file needs the target directory as its first argument.
    return {
        essay_id: write_essay_to_file(output_dir, essay_id, essay_text)
        for essay_id, essay_text in ids_texts.items()
    }

## Variables


In [None]:
# Correction styles; used as dictionary keys and as sub-directory names for
# the per-essay reference and hypothesis files.
MINIMAL = "minimal"
FLUENCY = "fluency"
# Team names; used as dictionary keys and as directory names under
# SYSTEM_OUTPUT_DIR and HYPOTHESIS_DIR.
VIKING = "Viking"
UAM_CSI = "UAM-CSI"

# Directory holding the shared task Markdown files (sources and references).
DATA_DIR = path.join(REPO_ROOT, "data/swedish/SweLL_gold/")
# Output directory for the per-essay source files.
SOURCE_DIR = path.join(REPO_ROOT, "sources/")
# Output directory for the per-essay reference files, one sub-directory per
# correction style.
REFERENCE_DIR = path.join(REPO_ROOT, "references/")
# Output directory for the per-essay hypothesis files, in team/style
# sub-directories.
HYPOTHESIS_DIR = path.join(REPO_ROOT, "hypotheses/")
# Directory holding each team's submitted Markdown files, one sub-directory
# per team.
SYSTEM_OUTPUT_DIR = path.join(REPO_ROOT, "outputs/")

### Sources


In [None]:
def get_all_source_paths():
    """
    Split "sv-swell_gold-orig-test.md" from DATA_DIR into per-essay files
    under SOURCE_DIR and return the result of split_file_per_essay.
    """
    source_md = path.join(DATA_DIR, "sv-swell_gold-orig-test.md")
    per_essay_paths = split_file_per_essay(source_md, SOURCE_DIR)
    return per_essay_paths


source_paths = get_all_source_paths()

### References


In [None]:
def get_reference_paths(input_file, version):
    """
    Split the reference Markdown file input_file into per-essay files stored
    in the version sub-directory of REFERENCE_DIR.

    Returns the result of split_file_per_essay.
    """
    return split_file_per_essay(input_file, path.join(REFERENCE_DIR, version))


def get_all_reference_paths():
    """
    Split both reference Markdown files in DATA_DIR into per-essay files.

    Returns a dictionary keyed by correction style (MINIMAL, FLUENCY) whose
    values are the results of get_reference_paths for that style.
    """
    markdown_names = (
        (MINIMAL, "sv-swell_gold-ref1-test.md"),
        (FLUENCY, "sv-swell_gold-ref2-test.md"),
    )
    return {
        version: get_reference_paths(path.join(DATA_DIR, md_name), version)
        for version, md_name in markdown_names
    }


reference_paths = get_all_reference_paths()

### Hypotheses


In [None]:
def get_system_version_hypothesis_paths(team, version, md):
    """
    Split the system output Markdown file md into per-essay files stored in
    HYPOTHESIS_DIR/team/version.

    Returns the result of split_file_per_essay.
    """
    return split_file_per_essay(md, path.join(HYPOTHESIS_DIR, team, version))


def get_system_hypothesis_paths(team):
    """
    Split both output files of team (found in SYSTEM_OUTPUT_DIR/team) into
    per-essay files.

    Returns a dictionary keyed by correction style (MINIMAL, FLUENCY) whose
    values are the results of get_system_version_hypothesis_paths.
    """
    markdown_names = (
        (MINIMAL, "sv-swell_gold-hypo-test.md"),
        (FLUENCY, "sv-swell_gold-fluency-hypo-test.md"),
    )
    return {
        version: get_system_version_hypothesis_paths(
            team, version, path.join(SYSTEM_OUTPUT_DIR, team, md_name)
        )
        for version, md_name in markdown_names
    }


def get_all_hypothesis_paths():
    """
    Split the output files of every team into per-essay files.

    Returns a dictionary keyed by team name (VIKING, UAM_CSI) whose values
    are the results of get_system_hypothesis_paths for that team.
    """
    return {team: get_system_hypothesis_paths(team) for team in (VIKING, UAM_CSI)}


hypothesis_paths = get_all_hypothesis_paths()

## GLEU

In [None]:
def compute_gleu(
    source_file, minimal_reference_file, fluency_reference_file, hypothesis_file
):
    """
    Run the ``gleu`` command-line tool for one essay and return its score.

    Arguments:

    source_file --- path to the source essay file.
    minimal_reference_file --- path to the minimal reference file.
    fluency_reference_file --- path to the fluency reference file.
    hypothesis_file --- path to the system output file being scored.

    Returns the score as a float, or negative infinity when the command
    prints fewer than two whitespace-separated fields (for example because
    it is not installed or it failed).
    """
    gleu_command = f"gleu -s {source_file} -r {minimal_reference_file} {fluency_reference_file} -o {hypothesis_file} -d 4 -f -n 4 -t word"

    # popen returns a pipe object, not a string: the output has to be read
    # before it can be inspected. The context manager closes the pipe.
    with popen(gleu_command) as pipe:
        gleu_output = pipe.read()

    gleu_split = gleu_output.split()
    if len(gleu_split) < 2:
        return -float("inf")
    # NOTE(review): assumes the score is the second field of the output;
    # confirm against the gleu version in use.
    return float(gleu_split[1])

## ERRANT

In [None]:
ERRANT_REGEX = re.compile(r"\d\.\d+\s+\d\.\d+\s+\d\.\d+")


def compute_errant(
    source_file,
    minimal_reference_file,
    fluency_reference_file,
    hypothesis_file,
    essay_id,
    version,
    team,
):
    """
    Score one hypothesis with ERRANT and return (precision, recall, F0.5).

    Arguments:

    source_file --- path to the source essay file.
    minimal_reference_file --- path to the minimal reference file.
    fluency_reference_file --- path to the fluency reference file.
    hypothesis_file --- path to the system output file being scored.
    essay_id --- essay identifier, used in the M2 file names.
    version --- correction style, used in the M2 file names.
    team --- system name, used in the hypothesis M2 file name.

    M2 files are written to M2DIR and reused when they already exist.

    Returns three floats, or three times negative infinity when the output
    of ``errant_compare`` contains no precision/recall/F0.5 line.
    """
    no_scores = (-float("inf"),) * 3

    # Make sure the M2 output directory exists before the tools write to it.
    makedirs(M2DIR, exist_ok=True)

    # run ERRANT alignment for reference(s), from one-essay-per-line .tmp file(s) to M2 output (if needed)
    reference_m2 = path.join(M2DIR, f"{essay_id}-{version}-reference.m2")
    if not path.isfile(reference_m2):
        errant_parallel_reference_command = f"errant_parallel -orig {source_file} -cor {minimal_reference_file} {fluency_reference_file} -out {reference_m2} -lang SV"
        system(errant_parallel_reference_command)

    # run ERRANT alignment on hypothesis file, from one-essay-per-line .tmp file to M2 output (if needed)
    hypothesis_m2 = path.join(M2DIR, f"{team}-{version}-{essay_id}.m2")
    if not path.isfile(hypothesis_m2):
        errant_parallel_hypothesis_command = f"errant_parallel -orig {source_file} -cor {hypothesis_file} -out {hypothesis_m2} -lang SV"
        system(errant_parallel_hypothesis_command)

    # run ERRANT scoring
    errant_compare_command = f"errant_compare -hyp {hypothesis_m2} -ref {reference_m2}"

    # The context manager closes the pipe once the output has been read.
    with popen(errant_compare_command) as pipe:
        errant_scores = pipe.read()

    # capture the output which looks like this, extract prec/rec/F0.5
    # =========== Span-Based Correction ============
    # TP      FP      FN      Prec    Rec     F0.5
    # 12      4       6       0.75    0.6667  0.7317
    # ==============================================

    prf_search = ERRANT_REGEX.search(errant_scores)
    if prf_search is None:
        # Empty output (the command failed) or no score line in it.
        return no_scores

    # The pattern accepts any whitespace between the three numbers, so split
    # on whitespace in general rather than on tabs only.
    precision, recall, f05 = (float(value) for value in prf_search.group(0).split())
    return precision, recall, f05

## Scribendi Score

In [None]:
# Scorer instance created once at module level with the library defaults and
# reused by compute_scribendi_score for every essay.
scribendi_scorer = ScribendiScore()


def read_file(file_path):
    """
    Return the complete text content of the file at file_path.
    """
    with open(file_path) as handle:
        contents = handle.read()
    return contents


def compute_scribendi_score(source_file, hypothesis_file):
    """
    Read the source and hypothesis files and score them with the
    module-level scribendi_scorer.

    Each text is passed to the scorer as a one-element list; the scorer's
    return value is returned unchanged.
    """
    sources = [read_file(source_file)]
    hypotheses = [read_file(hypothesis_file)]
    return scribendi_scorer.score(sources, hypotheses)

In [None]:
def compute_scores(
    essay_id,
    version,
    team,
    source_file,
    minimal_reference_file,
    fluency_reference_file,
    hypothesis_file,
):
    """
    Compute all metrics for one combination of essay, correction style and
    system.

    Returns a dictionary holding the three identifying values followed by
    the GLEU score, the ERRANT precision, recall and F0.5, and the
    Scribendi score.
    """
    row = {
        "Essay ID": essay_id,
        "Correction Style": version,
        "System": team,
    }

    # The metrics are computed in the same order as their columns appear.
    row["GLEU"] = compute_gleu(
        source_file, minimal_reference_file, fluency_reference_file, hypothesis_file
    )

    errant_scores = compute_errant(
        source_file,
        minimal_reference_file,
        fluency_reference_file,
        hypothesis_file,
        essay_id,
        version,
        team,
    )
    row["Precision"], row["Recall"], row["F0.5"] = errant_scores

    row["Scribendi Score"] = compute_scribendi_score(source_file, hypothesis_file)
    return row


# Score every (essay, correction style, system) combination and store the
# results as one row each in "scores.csv".
essay_ids = source_paths.keys()
versions = [MINIMAL, FLUENCY]
teams = [VIKING, UAM_CSI]

# product() has no length, so the progress bar is given the total explicitly.
n_iterations = len(essay_ids) * len(versions) * len(teams)

all_scores_list = []

# `tqdm` is bound to the `tqdm.notebook` module by the import at the top of
# the notebook, so the progress-bar class has to be reached as an attribute;
# calling the module itself raises TypeError.
for essay_id, version, team in tqdm.tqdm(
    product(essay_ids, versions, teams), total=n_iterations
):
    source_file = source_paths[essay_id]
    # Both references are always passed on, whatever the hypothesis style.
    minimal_reference_file = reference_paths[MINIMAL][essay_id]
    fluency_reference_file = reference_paths[FLUENCY][essay_id]
    hypothesis_file = hypothesis_paths[team][version][essay_id]
    scores_dict = compute_scores(
        essay_id,
        version,
        team,
        source_file,
        minimal_reference_file,
        fluency_reference_file,
        hypothesis_file,
    )
    all_scores_list.append(scores_dict)

df = pd.DataFrame(all_scores_list)
csv_file_name = "scores.csv"
df.to_csv(csv_file_name)