# Automatic Evaluation

This notebook performs the automatic evaluation of the system outputs.

Make sure you have read the `README` in this directory and installed all required packages.

## Imports

Import all required packages.


In [None]:
from os import path, makedirs, popen, system, listdir
import re
import pandas as pd
import tqdm.notebook as tqdm
from itertools import product
from syntok.tokenizer import Tokenizer
from scribendi import ScribendiScore

## Constants

Set up the repo root and the m2 directory as constants.


In [None]:
# `__file__` is not defined when this code runs as a notebook cell, so fall
# back to the current working directory in that case.
# NOTE(review): the fallback assumes the notebook is started from the
# directory it lives in -- confirm.
try:
    _THIS_DIR = path.dirname(path.abspath(__file__))
except NameError:
    _THIS_DIR = path.abspath(".")

REPO_ROOT = path.join(_THIS_DIR, "..")
M2DIR = path.join(REPO_ROOT, "m2")
makedirs(M2DIR, exist_ok=True)  # Ensure directory exists

### Processing Functions

Define functions to process the data.


In [None]:
def pretokenize(txt):
    """
    Tokenize txt with syntok.tokenizer and return the tokens as a single
    string, each token stripped of surrounding whitespace and separated
    from the next by one space.
    """
    tokens = Tokenizer().tokenize(txt)
    return " ".join([str(token).strip() for token in tokens])


def convert_essay_to_single_line(essay: str):
    """
    Return essay with every newline character turned into a single space.
    """
    # Splitting on "\n" and re-joining with " " swaps each newline for a space
    lines = essay.split("\n")
    return " ".join(lines)


def md_to_dict(md):
    """
    Parse shared task format into a dictionary where keys are essay IDs
    and values are essay texts.

    Each essay starts with a header line "### essay_id = <ID>"; everything
    up to the next header is the essay text. The text is tokenized and
    flattened to a single line.

    Arguments:

    md --- a string with the content of a shared task Markdown file.
    """
    essay_dict = {}
    # Anything before the first header is not an essay, hence [1:]
    for essay in md.split("### essay_id = ")[1:]:
        # partition() also copes with a header that has no text after it
        # (no newline), where unpacking the result of split() would raise.
        essay_id, _, text = essay.partition("\n")
        text_tokenized = pretokenize(text).strip("\n")
        # Strip the ID so trailing spaces or "\r" do not end up in file names
        essay_dict[essay_id.strip()] = convert_essay_to_single_line(text_tokenized)
    return essay_dict


def write_essay_to_file(output_dir, essay_id, essay_text):
    """
    Writes essay text to the file path output_dir/essay_id.tmp and returns the file path.

    Arguments:

    output_dir --- directory the file is written to.
    essay_id --- essay ID, used as the file name (without extension).
    essay_text --- the text to write.
    """
    # The extension belongs inside the string literal
    file_name = f"{essay_id}.tmp"
    file_path = path.join(output_dir, file_name)
    # Plain write mode suffices; the file is never read back through this handle
    with open(file_path, "w") as f:
        f.write(essay_text)
    return file_path


def _ensure_directory_exists(directory):
    """
    Creates directory if it does not exist.
    """
    makedirs(directory, exist_ok=True)


def split_file_per_essay(input_file, output_dir):
    """
    Reads each essay from input_file and writes them to individual files.
    The input file is structured as below:
    ### essay_id = ABC123
    ...
    ### essay_id = XYZ987
    ...

    Each essay is written to a file with the path: output_dir/essay_id.tmp.

    Arguments:

    input_file --- path to the shared task Markdown file.
    output_dir --- directory the per-essay files are written to; created
                   if it does not exist.

    Returns a dict[essay_id] = file_path.
    """
    _ensure_directory_exists(output_dir)

    # md_to_dict expects the file *content*, not the path
    with open(input_file) as f:
        md = f.read()
    ids_texts = md_to_dict(md)

    file_paths = {}

    for essay_id, essay_text in ids_texts.items():
        file_path = write_essay_to_file(output_dir, essay_id, essay_text)
        file_paths[essay_id] = file_path
    return file_paths

## Variables

These variables are used to create file paths for later use in the evaluation process.

### Edit Versions

Set up variables to distinguish minimal edits and fluency edits.


In [None]:
# The two correction styles that are evaluated
MINIMAL, FLUENCY = "minimal", "fluency"
versions = [MINIMAL, FLUENCY]

### Teams

Create a list of all teams, which are all directories under `../models/`.


In [None]:
models_dir = path.join(REPO_ROOT, "models/")

# Every sub-directory of models_dir is a team. The list is sorted because
# listdir() returns entries in arbitrary order, and hidden directories
# (e.g. `.ipynb_checkpoints`) are skipped since they are not teams.
teams = sorted(
    d
    for d in listdir(models_dir)
    if not d.startswith(".") and path.isdir(path.join(models_dir, d))
)

### Directories

Set up constants for the directories used during the evaluation.

In [None]:
def _repo_path(relative):
    """
    Returns relative joined onto REPO_ROOT.
    """
    return path.join(REPO_ROOT, relative)


DATA_DIR = _repo_path("data/swedish/SweLL_gold/")  # shared task Markdown files
SOURCE_DIR = _repo_path("sources/")  # per-essay source files
REFERENCE_DIR = _repo_path("references/")  # per-essay reference files
HYPOTHESIS_DIR = _repo_path("hypotheses/")  # per-essay hypothesis files
SYSTEM_OUTPUT_DIR = _repo_path("outputs/")  # Markdown output of each team

### Sources

Get paths for all source files.


In [None]:
def get_all_source_paths():
    """
    Splits the source essay Markdown file into one file per essay, written
    on the form: `SOURCE_DIR/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    return split_file_per_essay(
        path.join(DATA_DIR, "sv-swell_gold-orig-test.md"), SOURCE_DIR
    )


source_paths = get_all_source_paths()

### References

Get paths for all reference files.


In [None]:
def get_reference_paths(input_file, version):
    """
    Splits the reference Markdown file input_file into one file per essay,
    written on the form: `REFERENCE_DIR/version/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    return split_file_per_essay(input_file, path.join(REFERENCE_DIR, version))


def get_all_reference_paths():
    """
    Writes the reference essays of both versions (minimal-edited and
    fluency-edited) to files on the form: `REFERENCE_DIR/version/essay_id.tmp`.
    Returns a dict[version][essay_id] = file_path.
    """
    # Markdown file holding the references of each version
    reference_md_names = {
        MINIMAL: "sv-swell_gold-ref1-test.md",
        FLUENCY: "sv-swell_gold-ref2-test.md",
    }
    paths_by_version = {}
    for version, md_name in reference_md_names.items():
        reference_md = path.join(DATA_DIR, md_name)
        paths_by_version[version] = get_reference_paths(reference_md, version)
    return paths_by_version


reference_paths = get_all_reference_paths()

### Hypotheses

Get paths for all hypothesis files.


In [None]:
def get_system_version_hypothesis_paths(team, version, md):
    """
    Splits the hypothesis Markdown file md of one team and version into one
    file per essay, written on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    return split_file_per_essay(md, path.join(HYPOTHESIS_DIR, team, version))


def get_system_hypothesis_paths(team):
    """
    Writes the hypotheses of both versions (minimal-edited and fluency-edited)
    of one team to files on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[version][essay_id] = file_path.
    """
    # Markdown file in the team's output directory for each version
    hypothesis_md_names = {
        MINIMAL: "sv-swell_gold-hypo-test.md",
        FLUENCY: "sv-swell_gold-fluency-hypo-test.md",
    }
    paths_by_version = {}
    for version, md_name in hypothesis_md_names.items():
        hypothesis_md = path.join(SYSTEM_OUTPUT_DIR, team, md_name)
        paths_by_version[version] = get_system_version_hypothesis_paths(
            team, version, hypothesis_md
        )
    return paths_by_version


def get_all_hypothesis_paths():
    """
    Writes the minimal-edited and fluency-edited hypotheses of every team in
    `teams` to files on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[team][version][essay_id] = file_path.
    """
    paths_by_team = {}
    for team in teams:
        paths_by_team[team] = get_system_hypothesis_paths(team)
    return paths_by_team


hypothesis_paths = get_all_hypothesis_paths()

## GLEU

Compute GLEU with the implementation by Shota Koyama ([https://github.com/shotakoyama/gleu](https://github.com/shotakoyama/gleu)).


In [None]:
def compute_gleu(
    source_file, minimal_reference_file, fluency_reference_file, hypothesis_file
):
    """
    Computes the GLEU score of a hypothesis by running the `gleu`
    command-line tool against both references.

    Arguments:

    source_file --- path to the source essay file.
    minimal_reference_file --- path to the minimal-edited reference file.
    fluency_reference_file --- path to the fluency-edited reference file.
    hypothesis_file --- path to the system hypothesis file.

    Returns the score as a float, or -inf if `gleu` could not be started or
    did not print a score.
    """
    import subprocess

    # The arguments are passed as a list (no shell), so file paths that
    # contain spaces or shell metacharacters reach `gleu` unchanged.
    gleu_command = [
        "gleu",
        "-s", source_file,
        "-r", minimal_reference_file, fluency_reference_file,  # Use both references
        "-o", hypothesis_file,
        "-d", "4",  # Number of decimal places
        "-f",  # Fixed seed
        "-n", "4",  # Maximum n-gram length
        "-t", "word",  # Word-level tokenization
    ]

    try:
        completed = subprocess.run(gleu_command, stdout=subprocess.PIPE, text=True)
    except OSError:
        # The tool could not be started (e.g. it is not installed)
        return -float("inf")

    # The score is the second whitespace-separated field of the output
    gleu_split = completed.stdout.split()
    if len(gleu_split) < 2:
        return -float("inf")
    return float(gleu_split[1])

## ERRANT

Compute ERRANT with the implementation by Andrew Caines ([https://github.com/cainesap/errant](https://github.com/cainesap/errant)).


In [None]:
# Used for extracting precision, recall, and F0.5 from ERRANT output
ERRANT_REGEX = re.compile(r"\d\.\d+\s+\d\.\d+\s+\d\.\d+")


def _run_errant_tool(arguments, capture=False):
    """
    Runs an ERRANT command given as an argument list (no shell).

    Returns the command's standard output if capture is true, otherwise an
    empty string. An empty string is also returned when the command cannot
    be started (e.g. it is not installed).
    """
    import subprocess

    try:
        completed = subprocess.run(
            arguments,
            stdout=subprocess.PIPE if capture else None,
            text=True,
        )
    except OSError:
        return ""
    return completed.stdout or ""


def compute_errant(
    source_file,
    minimal_reference_file,
    fluency_reference_file,
    hypothesis_file,
    essay_id,
    version,
    team,
):
    """
    Computes span-based precision, recall and F0.5 with ERRANT.

    The source is aligned with the references and with the hypothesis by
    `errant_parallel`; the resulting m2 files are stored in M2DIR and reused
    if they already exist. The two m2 files are then compared with
    `errant_compare`.

    Arguments:

    source_file --- path to the source essay file.
    minimal_reference_file --- path to the minimal-edited reference file.
    fluency_reference_file --- path to the fluency-edited reference file.
    hypothesis_file --- path to the system hypothesis file.
    essay_id, version, team --- used to name the m2 files.

    Returns a tuple (precision, recall, f05) of floats, or three times -inf
    if no scores could be read from the `errant_compare` output.
    """
    # ERRANT align source and reference files, from md to m2 (if needed)
    reference_m2 = path.join(M2DIR, f"{essay_id}-{version}-reference.m2")
    if not path.isfile(reference_m2):
        _run_errant_tool(
            [
                "errant_parallel",
                "-orig", source_file,
                "-cor", minimal_reference_file, fluency_reference_file,  # Use both references
                "-out", reference_m2,
                "-lang", "SV",  # Set language to Swedish
            ]
        )

    # ERRANT align source and hypothesis files, from md to m2 (if needed)
    hypothesis_m2 = path.join(M2DIR, f"{team}-{version}-{essay_id}.m2")
    if not path.isfile(hypothesis_m2):
        _run_errant_tool(
            [
                "errant_parallel",
                "-orig", source_file,
                "-cor", hypothesis_file,
                "-out", hypothesis_m2,
                "-lang", "SV",  # Set language to Swedish
            ]
        )

    errant_scores = _run_errant_tool(
        [
            "errant_compare",
            "-hyp", hypothesis_m2,  # Aligned hypothesis file
            "-ref", reference_m2,  # Aligned reference file
        ],
        capture=True,
    )

    # The captured output looks like this; prec/rec/F0.5 are extracted from it
    # =========== Span-Based Correction ============
    # TP      FP      FN      Prec    Rec     F0.5
    # 12      4       6       0.75    0.6667  0.7317
    # ==============================================

    prf_search = ERRANT_REGEX.search(errant_scores)
    if prf_search is None:
        # No output at all, or output without a score row
        return (-float("inf"),) * 3

    # The regex allows any whitespace between the columns, so split on any
    # whitespace rather than on tabs only
    precision, recall, f05 = prf_search.group(0).split()
    return float(precision), float(recall), float(f05)

## Scribendi Score

Compute the Scribendi Score with the implementation by Robert Östling ([https://github.com/robertostling/scribendi_score](https://github.com/robertostling/scribendi_score)).


In [None]:
# Use ScribendiScore's API to compute the Scribendi score.
# The scorer is created once at module level; compute_scribendi_score
# uses this shared instance on every call.
scribendi_scorer = ScribendiScore()


def read_file(file_path):
    """
    Helper function that opens file_path in text mode and returns its
    entire content as a string.
    """
    with open(file_path) as handle:
        content = handle.read()
    return content


def compute_scribendi_score(source_file, hypothesis_file):
    """
    Reads the source and hypothesis files and returns the result of
    scoring them with the module-level scribendi_scorer.
    """
    # The Scribendi-Score API requires dicts as input; source and hypothesis
    # are passed under the same dummy key
    dummy_id = "1"
    source_input, hypothesis_input = (
        {dummy_id: read_file(file_path)}
        for file_path in (source_file, hypothesis_file)
    )
    return scribendi_scorer.score(source_input, hypothesis_input)

## Compute All Scores

Compute all scores for one essay and return them as a dict. Each dict is then used as a row in a pandas DataFrame that contains all scores.

The final DataFrame is saved to `scores.csv`.


In [None]:
def compute_scores(
    essay_id,
    version,
    team,
    source_file,
    minimal_reference_file,
    fluency_reference_file,
    hypothesis_file,
):
    """
    Computes GLEU, ERRANT (precision, recall, F0.5) and the Scribendi Score
    for one hypothesis file.

    Returns a dict that represents a row in the final CSV.
    """
    # GLEU and ERRANT both take these four files, in this order
    evaluation_files = (
        source_file,
        minimal_reference_file,
        fluency_reference_file,
        hypothesis_file,
    )

    gleu = compute_gleu(*evaluation_files)
    precision, recall, f05 = compute_errant(*evaluation_files, essay_id, version, team)
    scribendi_score = compute_scribendi_score(source_file, hypothesis_file)

    row = {
        "Essay ID": essay_id,
        "Correction Style": version,
        "System": team,
    }
    row["GLEU"] = gleu
    row["Precision"] = precision
    row["Recall"] = recall
    row["F0.5"] = f05
    row["Scribendi Score"] = scribendi_score
    return row


essay_ids = source_paths.keys()

n_iterations = len(essay_ids) * len(versions) * len(teams)

all_scores_list = []

# The name `tqdm` is bound to the `tqdm.notebook` module by the import at the
# top of the file, so the progress bar class is `tqdm.tqdm`; calling the
# module itself raises a TypeError.
for essay_id, version, team in tqdm.tqdm(
    product(essay_ids, versions, teams), total=n_iterations
):
    # Retrieve file paths
    source_file = source_paths[essay_id]
    minimal_reference_file = reference_paths[MINIMAL][essay_id]
    fluency_reference_file = reference_paths[FLUENCY][essay_id]
    hypothesis_file = hypothesis_paths[team][version][essay_id]

    # Compute scores
    scores_dict = compute_scores(
        essay_id,
        version,
        team,
        source_file,
        minimal_reference_file,
        fluency_reference_file,
        hypothesis_file,
    )

    # Save scores to list
    all_scores_list.append(scores_dict)


## Create and Save Pandas Dataframe

In [None]:
# Collect all score rows in one DataFrame and write it to a CSV file
csv_file_name = "scores.csv"
df = pd.DataFrame(data=all_scores_list)
df.to_csv(path_or_buf=csv_file_name)