# Automatic Evaluation

This notebook performs the automatic evaluation of the system outputs.

Make sure you have read the `README` in this directory and installed all required packages.

## Imports

Import all required packages


In [None]:
from os import path, makedirs, listdir, environ
import subprocess
import errant
from scribendi import ScribendiScore
from syntok.tokenizer import Tokenizer
from itertools import product
import spacy_udpipe
import spacy
from tqdm.notebook import tqdm
import pandas as pd
import re

## Constants

Set up the repository root and the M2 directory as constants.


In [None]:
# Root of the repository; the directories used below are derived from it.
REPO_ROOT = path.join("/home/jovyan/da231x")
print(REPO_ROOT)
# Directory under the repository root in which the M2 files are stored.
M2DIR = path.join(REPO_ROOT, "m2")
makedirs(M2DIR, exist_ok=True)  # Ensure directory exists

# Dictionary key and file stem used for the file that contains all essays.
FULL_CORPUS = "full_corpus"

### Processing Functions

Define functions to process the data.


In [None]:
def read_file(file_path):
    """
    Reads the file at file_path as UTF-8 text and returns its whole content.

    Arguments:

    file_path --- path of the text file to read.
    """
    # Decode explicitly as UTF-8 so that the result does not depend on the
    # locale of the machine that runs the notebook.
    with open(file_path, encoding="utf-8") as f:
        return f.read()


def pretokenize(txt):
    """
    Splits txt into tokens with syntok.tokenizer and returns the tokens,
    each stripped of surrounding whitespace, joined by single spaces.
    """
    tokenizer = Tokenizer()
    stripped_tokens = (str(token).strip() for token in tokenizer.tokenize(txt))
    return " ".join(stripped_tokens)


def convert_essay_to_single_line(essay):
    """
    Returns essay with every newline character turned into a space.
    """
    lines = essay.split("\n")
    return " ".join(lines)


def md_to_dict(md):
    """
    Turn the content of a shared task Markdown file into a dictionary that
    maps each essay ID to the tokenized essay text on a single line.

    Arguments:

    md --- a string with the content of a shared task Markdown file.
    """
    header = "### essay_id = "
    # Whatever precedes the first header is not an essay and is skipped.
    sections = md.split(header)[1:]

    essay_dict = {}
    for section in sections:
        # The rest of the header line is the ID, everything after it the text.
        essay_id, body = section.split("\n", maxsplit=1)
        tokenized_body = pretokenize(body).strip("\n")
        essay_dict[essay_id] = convert_essay_to_single_line(tokenized_body)
    return essay_dict


def write_essay_to_file(output_dir, essay_id, essay_text):
    """
    Writes essay text to the file path output_dir/essay_id.tmp and returns the file path.

    Arguments:

    output_dir --- existing directory in which the file is created.
    essay_id --- essay ID, used as the stem of the file name.
    essay_text --- text to write; an existing file is overwritten.
    """
    file_path = path.join(output_dir, f"{essay_id}.tmp")
    # Encode explicitly as UTF-8 so that the file content does not depend on
    # the locale. The file is only written, so plain "w" is sufficient.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(essay_text)
    return file_path


def _ensure_directory_exists(directory):
    """
    Creates directory if it does not exist.
    """
    makedirs(directory, exist_ok=True)


def split_file_per_essay(input_file, output_dir):
    """
    Splits a shared task Markdown file into one file per essay.

    The input file is structured as below:
    ### essay_id = ABC123
    ...
    ### essay_id = XYZ987
    ...

    Each essay is written to output_dir/essay_id.tmp. In addition, all essays
    are joined with newlines and written to output_dir/full_corpus.tmp.

    Returns a dict[essay_id] = file_path, in which the extra key FULL_CORPUS
    maps to the path of the joined file.
    """
    _ensure_directory_exists(output_dir)

    essays = md_to_dict(read_file(input_file))

    file_paths = {
        essay_id: write_essay_to_file(output_dir, essay_id, essay_text)
        for essay_id, essay_text in essays.items()
    }

    # The joined corpus goes through the same writer, with FULL_CORPUS as ID.
    file_paths[FULL_CORPUS] = write_essay_to_file(
        output_dir, FULL_CORPUS, "\n".join(essays.values())
    )

    return file_paths

## Variables

These variables are used to create file paths for later use in the evaluation process.

### Edit Versions

Set up variables to distinguish minimal edits from fluency edits.


In [None]:
# Names of the two correction styles; also used as directory names and keys.
MINIMAL, FLUENCY = "minimal", "fluency"
versions = [MINIMAL, FLUENCY]

In [None]:
# Keys for the kinds of text involved in an evaluation.
SOURCE, HYPOTHESIS, REFERENCE = "source", "hypothesis", "reference"

# Keys of the ERRANT result dictionary.
PRECISION, RECALL, F05 = "precision", "recall", "f05"

# Keys of the argument dictionary handed to the metric functions.
ESSAY_ID = "essay id"
TEAM, VERSION, BATCH_SIZE = "team", "version", "batch size"

# Separator used when joining command-line arguments.
SPACE = " "

### Teams

Create a list of all teams, i.e. the names of all directories under `outputs/` in the repository root.


In [None]:
# Directory with one sub-directory per team.
models_dir = path.join(REPO_ROOT, "outputs/")

# listdir returns entries in arbitrary order; sort the team names so that the
# evaluation order and the row order of the saved scores are reproducible.
teams = sorted(
    d for d in listdir(models_dir)
    if path.isdir(path.join(models_dir, d))
)
print(teams)

In [None]:
# Descriptions shown next to the progress bars of the scoring loops.
TEAM_LABEL, STYLE_LABEL, ESSAY_LABEL = "Team", "Correction Style", "Essay"

### Directories

Set up the directories for the input data, sources, references, hypotheses and system outputs.

In [None]:
# Input data: the Markdown files with the source and reference essays.
DATA_DIR = path.join(REPO_ROOT, "data/swedish/SweLL-gold/")
# Output directories for the per-essay files written by this notebook.
SOURCE_DIR = path.join(REPO_ROOT, "sources/")
REFERENCE_DIR = path.join(REPO_ROOT, "references/")
HYPOTHESIS_DIR = path.join(REPO_ROOT, "hypotheses/")
# Input data: the Markdown files with the system outputs of each team.
SYSTEM_OUTPUT_DIR = path.join(REPO_ROOT, "outputs/")

print(
    DATA_DIR,
    SOURCE_DIR,
    REFERENCE_DIR,
    HYPOTHESIS_DIR,
    SYSTEM_OUTPUT_DIR,
    sep="\n",
)

### Sources

Get paths for all source files.


In [None]:
def get_all_source_paths():
    """
    Splits the source test file in DATA_DIR into files on the form: `SOURCE_DIR/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    source_md = path.join(DATA_DIR, "sv-swell_gold-orig-test.md")
    source_files = split_file_per_essay(source_md, SOURCE_DIR)
    return source_files


# Write the source files and keep their paths, keyed by essay ID.
source_paths = get_all_source_paths()
print(source_paths)

### References

Get paths for all reference files.


In [None]:
def get_reference_paths(input_file, version):
    """
    Splits the reference file input_file into files on the form: `REFERENCE_DIR/version/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    return split_file_per_essay(input_file, path.join(REFERENCE_DIR, version))


def get_all_reference_paths():
    """
    Splits the minimal-edited and the fluency-edited reference files in DATA_DIR into files on the form: `REFERENCE_DIR/version/essay_id.tmp`.
    Returns a dict[version][essay_id] = file_path.
    """
    # Name of the reference file in DATA_DIR for each correction style.
    reference_files = {
        MINIMAL: "sv-swell_gold-ref1-test.md",
        FLUENCY: "sv-swell_gold-ref2-test.md",
    }
    reference_mds = {
        version: path.join(DATA_DIR, file_name)
        for version, file_name in reference_files.items()
    }
    return {
        version: get_reference_paths(md, version)
        for version, md in reference_mds.items()
    }


# Write the reference files and keep their paths, keyed by version and essay ID.
reference_paths = get_all_reference_paths()
print(reference_paths)

### Hypotheses

Get paths for all hypothesis files.


In [None]:
def get_system_version_hypothesis_paths(team, version, md):
    """
    Splits the system output file md into files on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[essay_id] = file_path.
    """
    target_dir = path.join(HYPOTHESIS_DIR, team, version)
    essay_files = split_file_per_essay(md, target_dir)
    return essay_files


def get_system_hypothesis_paths(team):
    """
    Splits the minimal-edited and the fluency-edited system output of team into files on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[version][essay_id] = file_path.
    """
    # Name of the system output file for each correction style. The file is
    # located in SYSTEM_OUTPUT_DIR/team/version/.
    hypothesis_files = {
        MINIMAL: "sv-swell_gold-hypo-test.md",
        FLUENCY: "sv-swell_gold-fluency-hypo-test.md",
    }
    hypothesis_mds = {
        version: path.join(SYSTEM_OUTPUT_DIR, team, version, file_name)
        for version, file_name in hypothesis_files.items()
    }
    return {
        version: get_system_version_hypothesis_paths(team, version, md)
        for version, md in hypothesis_mds.items()
    }


def get_all_hypothesis_paths():
    """
    Splits the minimal-edited and the fluency-edited system outputs of every team in teams into files on the form: `HYPOTHESIS_DIR/team/version/essay_id.tmp`.
    Returns a dict[team][version][essay_id] = file_path.
    """
    all_paths = {}
    for team in teams:
        all_paths[team] = get_system_hypothesis_paths(team)
    return all_paths


# Write the hypothesis files and keep their paths, keyed by team, version and essay ID.
hypothesis_paths = get_all_hypothesis_paths()
print(hypothesis_paths)

## Compute scores

In [None]:
# All keys of source_paths except the entry for the joined corpus file.
essay_ids = list(source_paths)
essay_ids.remove(FULL_CORPUS)


def compute_pairwise_scores(metric_function):
    """
    Applies metric_function to every combination of team, version and essay.

    metric_function is called with a dictionary that holds the file paths of
    the source essay, both reference essays and the hypothesis essay, as well
    as the team, the version, the essay ID and a batch size of 1.

    Returns a dict[team][version][essay_id] = result of metric_function.
    """
    scores = {}
    for team in tqdm(teams, desc=TEAM_LABEL):
        team_scores = scores[team] = {}
        for version in tqdm(versions, leave=False, desc=STYLE_LABEL):
            version_scores = team_scores[version] = {}
            for essay_id in tqdm(essay_ids, leave=False, desc=ESSAY_LABEL):
                version_scores[essay_id] = metric_function(
                    {
                        SOURCE: source_paths[essay_id],
                        MINIMAL: reference_paths[MINIMAL][essay_id],
                        FLUENCY: reference_paths[FLUENCY][essay_id],
                        HYPOTHESIS: hypothesis_paths[team][version][essay_id],
                        TEAM: team,
                        VERSION: version,
                        ESSAY_ID: essay_id,
                        BATCH_SIZE: 1,
                    }
                )

    return scores

In [None]:
def compute_corpus_scores(metric_function):
    """
    Applies metric_function to the joined corpus files of every combination
    of team and version.

    metric_function is called with a dictionary that holds the paths of the
    joined source, reference and hypothesis files, as well as the team, the
    version, FULL_CORPUS as essay ID, a batch size of 4 and the flag
    FULL_CORPUS set to True.

    Returns a dict[team][version][FULL_CORPUS] = result of metric_function.
    """
    scores = {}
    for team in tqdm(teams, desc=TEAM_LABEL):
        scores[team] = {}
        for version in tqdm(versions, leave=False, desc=STYLE_LABEL):
            scores[team][version] = {}
            args = {
                SOURCE: source_paths[FULL_CORPUS],
                MINIMAL: reference_paths[MINIMAL][FULL_CORPUS],
                FLUENCY: reference_paths[FLUENCY][FULL_CORPUS],
                HYPOTHESIS: hypothesis_paths[team][version][FULL_CORPUS],
                TEAM: team,
                VERSION: version,
                ESSAY_ID: FULL_CORPUS,
                BATCH_SIZE: 4,
                # Tells the metric function that the files hold the whole
                # corpus rather than a single essay.
                FULL_CORPUS: True,
            }
            scores[team][version][FULL_CORPUS] = metric_function(args)

    return scores

## Run Shell Commands

In [None]:
# Copy of the notebook's environment variables, passed on to every subprocess.
current_env = environ.copy()


def run_command(command_args):
    """
    Joins command_args with spaces, runs the resulting command line through
    the shell and returns what the command wrote to stdout, decoded as UTF-8.

    Raises subprocess.CalledProcessError if the command exits with a
    non-zero status.
    """
    completed = subprocess.run(
        SPACE.join(command_args),
        shell=True,
        capture_output=True,
        env=current_env,
        check=True,  # Raise if the program did not exit correctly
    )
    return completed.stdout.decode("utf-8")

## GLEU

Compute GLEU with the implementation by Shota Koyama ([https://github.com/shotakoyama/gleu](https://github.com/shotakoyama/gleu)).


In [None]:
def compute_gleu(files):
    """
    Runs the gleu command on the source, reference and hypothesis files in
    files and returns the GLEU score as a float.

    The score is read from the second whitespace-separated field of the
    command output. If the command prints nothing, -inf is returned.
    """
    references = f"{files[MINIMAL]} {files[FLUENCY]}"  # Use both references
    gleu_output = run_command(
        [
            "gleu",
            f"-s {files[SOURCE]}",
            f"-r {references}",
            f"-o {files[HYPOTHESIS]}",
            "-d 4",  # Number of decimal places
            "-f",  # Fixed seed
            "-n 4",  # Maximum n-gram length
            "-t word",  # Word-level tokenization
        ]
    )

    if gleu_output == "":
        return -float("inf")
    return float(gleu_output.split()[1])

In [None]:
# GLEU score of every essay, keyed by team, version and essay ID.
pairwise_gleu_scores = compute_pairwise_scores(compute_gleu)

In [None]:
# GLEU score of the joined corpus files, keyed by team and version.
corpus_gleu_scores = compute_corpus_scores(compute_gleu)

In [None]:
# Show the corpus-level GLEU scores.
print(corpus_gleu_scores)

## ERRANT

Compute ERRANT with the implementation by Andrew Caines ([https://github.com/cainesap/errant](https://github.com/cainesap/errant)).

Begin by defining some helper functions.

Create the M2 files for the references and the hypotheses with `errant_parallel`.


In [None]:
# Language code passed to errant_parallel with the -lang option.
language = "sv"


def create_m2_file(original, corrected, output):
    """
    Runs the errant_parallel command to create an M2 file.

    Arguments:

    original --- path of the file passed as -orig.
    corrected --- list of paths of the files passed as -cor.
    output --- path of the M2 file passed as -out.
    """
    corrected_files = SPACE.join(corrected)  # Use all provided references
    # The text printed by the command is of no interest and is discarded.
    run_command(
        [
            "errant_parallel",
            f"-orig {original}",
            f"-cor {corrected_files}",
            f"-out {output}",
            f"-lang {language}",
        ]
    )


def prepare_m2_files(args):
    """
    Creates the M2 files needed to score the essay described by args.

    The source-reference M2 file is written to M2DIR/reference/essay_id.m2
    unless it exists already. The source-hypothesis M2 file is written to
    M2DIR/team/version/essay_id.m2 on every call.

    Returns None.
    """
    # Source-reference m2: does not change, so it is created only once
    reference_m2_dir = path.join(M2DIR, REFERENCE)
    _ensure_directory_exists(reference_m2_dir)
    m2_file_name = f"{args[ESSAY_ID]}.m2"
    reference_m2_path = path.join(reference_m2_dir, m2_file_name)

    reference_exists = path.isfile(reference_m2_path)
    if not reference_exists:
        both_references = [args[MINIMAL], args[FLUENCY]]
        create_m2_file(args[SOURCE], both_references, reference_m2_path)

    # Source-hypothesis m2: created anew for each team and version
    hypothesis_m2_dir = path.join(M2DIR, args[TEAM], args[VERSION])
    _ensure_directory_exists(hypothesis_m2_dir)
    hypothesis_m2_path = path.join(hypothesis_m2_dir, m2_file_name)
    create_m2_file(args[SOURCE], [args[HYPOTHESIS]], hypothesis_m2_path)

In [None]:
# Create the M2 files for every team, version and essay. prepare_m2_files
# returns None, so the resulting dictionary is assigned only to avoid printing.
dummy = compute_pairwise_scores(prepare_m2_files)

In [None]:
# Create the M2 files for the joined corpus files of every team and version.
dummy = compute_corpus_scores(prepare_m2_files)

In [None]:
# Matches three whitespace-separated decimal numbers, each with a single digit
# before the decimal point, e.g. "0.75    0.6667  0.7317".
ERRANT_REGEX = re.compile(r"\d\.\d+\s+\d\.\d+\s+\d\.\d+")
# Same value as the earlier assignment to language in this file.
language = "sv"


def compute_errant(args):
    """
    Runs errant_compare on the hypothesis and reference M2 files of the essay
    described by args and returns precision, recall and F0.5 as percentages.

    The M2 files are expected at M2DIR/team/version/essay_id.m2 and
    M2DIR/reference/essay_id.m2.

    Returns a dict with the keys PRECISION, RECALL and F05. All three values
    are -inf if errant_compare prints nothing.

    Raises ValueError if the output of errant_compare is not empty but does
    not contain the three scores.
    """
    reference_m2_path = path.join(M2DIR, REFERENCE, f"{args[ESSAY_ID]}.m2")

    hypothesis_m2_path = path.join(
        M2DIR, args[TEAM], args[VERSION], f"{args[ESSAY_ID]}.m2"
    )

    errant_compare_command = [
        "errant_compare",
        f"-hyp {hypothesis_m2_path}",
        f"-ref {reference_m2_path}",
    ]
    scores = run_command(errant_compare_command)

    # Capture the output, which looks like this. Then extract precision, recall and F0.5
    # =========== Span-Based Correction ============
    # TP      FP      FN      Prec    Rec     F0.5
    # 12      4       6       0.75    0.6667  0.7317
    # ==============================================

    if scores == "":
        missing = -float("inf")
        return {
            PRECISION: missing,
            RECALL: missing,
            F05: missing,
        }

    regex_match = ERRANT_REGEX.search(scores)
    if regex_match is None:
        raise ValueError(
            "Could not find precision, recall and F0.5 in the output of "
            f"errant_compare:\n{scores}"
        )

    # ERRANT_REGEX accepts any whitespace between the numbers, so split on
    # any whitespace as well instead of on tabs only.
    precision, recall, f05 = (
        100 * float(value) for value in regex_match.group(0).split()
    )

    return {
        PRECISION: precision,
        RECALL: recall,
        F05: f05,
    }

In [None]:
# ERRANT precision, recall and F0.5 of every essay, keyed by team, version and essay ID.
pairwise_errant_scores = compute_pairwise_scores(compute_errant)

In [None]:
# ERRANT precision, recall and F0.5 of the joined corpus files, keyed by team and version.
corpus_errant_scores = compute_corpus_scores(compute_errant)

In [None]:
# Show the corpus-level ERRANT scores.
print(corpus_errant_scores)

## Scribendi Score

Compute the Scribendi Score with the implementation by Robert Östling ([https://github.com/robertostling/scribendi_score](https://github.com/robertostling/scribendi_score)).

Begin by setting up the model. Use the same `Llama-3.1-8B` model ([Hugging Face](https://huggingface.co/meta-llama/Llama-3.1-8B)) as in the shared task.


In [None]:
scribendi_model = "meta-llama/Llama-3.1-8B"

# The Hugging Face access token is a secret and must not be stored in the
# notebook. It is read from the HF_TOKEN environment variable instead.
# NOTE(review): any token that was previously committed in this cell should be
# revoked on Hugging Face.
scribendi_access_token = environ.get("HF_TOKEN")
if not scribendi_access_token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access "
        f"token that is allowed to download {scribendi_model}."
    )

scribendi_scorer = ScribendiScore(
    model_id=scribendi_model, access_token=scribendi_access_token
)

In [None]:
def compute_scribendi_score(args: dict):
    """
    Computes the Scribendi Score of the hypothesis file in args with respect
    to the source file in args.

    If args has a true value for the key FULL_CORPUS, each line of the two
    files becomes a separate entry keyed by its line index. Otherwise the
    whole content of each file becomes a single entry.

    Returns the result of scribendi_scorer.score, called with the batch size
    given in args.
    """
    source_text = read_file(args[SOURCE])
    hypothesis_text = read_file(args[HYPOTHESIS])
    is_full_corpus = args.get(FULL_CORPUS, False)

    def as_scorer_input(text):
        # One single-element list per entry, keyed by a string ID.
        if is_full_corpus:
            return {
                str(i): [line] for i, line in enumerate(text.splitlines())
            }
        return {"DUMMY": [text]}

    return scribendi_scorer.score(
        as_scorer_input(source_text),
        as_scorer_input(hypothesis_text),
        batch_size=args[BATCH_SIZE],
    )

In [None]:
# Scribendi Score of every essay, keyed by team, version and essay ID.
pairwise_scribendi_scores = compute_pairwise_scores(compute_scribendi_score)

In [None]:
# Scribendi Score of the joined corpus files, keyed by team and version.
corpus_scribendi_scores = compute_corpus_scores(compute_scribendi_score)

In [None]:
# Show the corpus-level Scribendi Scores.
print(corpus_scribendi_scores)

Remove the Scribendi-Score object to save VRAM.

In [None]:
# Remove the name so that the scorer object can be freed; it is not used below.
del scribendi_scorer

# Save Scores

Combine all scores into a single dataframe and save it as a CSV file.

In [None]:
# One row per team, version and essay with the scores of all metrics.
all_pairwise_scores = []

for team, version, essay_id in product(teams, versions, essay_ids):
    gleu = pairwise_gleu_scores[team][version][essay_id]
    errant_scores = pairwise_errant_scores[team][version][essay_id]
    precision, recall, f05 = (
        errant_scores[key] for key in ("precision", "recall", "f05")
    )
    scribendi_score = pairwise_scribendi_scores[team][version][essay_id]

    row = {
        "essay_id": essay_id,
        "correction_style": version,
        "system": team,
        "gleu": gleu,
        "precision": precision,
        "recall": recall,
        "f0.5": f05,
        "scribendi_score": scribendi_score,
    }
    all_pairwise_scores.append(row)

# Round to two decimals and save without the index column.
pairwise_df = pd.DataFrame(all_pairwise_scores)
pairwise_df = pairwise_df.round(2)
pairwise_df.to_csv("scores.csv", index=False)

In [None]:
# One row per team and version with the corpus-level scores of all metrics.
all_corpus_scores = []

for team, version in product(teams, versions):
    gleu = corpus_gleu_scores[team][version][FULL_CORPUS]
    errant_scores = corpus_errant_scores[team][version][FULL_CORPUS]
    precision, recall, f05 = (
        errant_scores[key] for key in ("precision", "recall", "f05")
    )
    scribendi_score = corpus_scribendi_scores[team][version][FULL_CORPUS]

    row = {
        "correction_style": version,
        "system": team,
        "gleu": gleu,
        "precision": precision,
        "recall": recall,
        "f0.5": f05,
        # NOTE(review): 50 is presumably the number of essays in the corpus,
        # and this column is named "scribendi score" whereas scores.csv uses
        # "scribendi_score" - confirm both.
        "scribendi score": scribendi_score / 50,
    }
    all_corpus_scores.append(row)

# Round to two decimals and save without the index column.
corpus_df = pd.DataFrame(all_corpus_scores)
corpus_df = corpus_df.round(2)
corpus_df.to_csv("corpus_scores.csv", index=False)