In [40]:
# Can be run with python -m pie.scripts.evaluate
import os.path
from pie import utils


from pie.models import BaseModel
from pie.data import Dataset, Reader
from pie.settings import load_default_settings, settings_from_file


def run(model_path, test_path, train_path=None, # data
        # decoding
        settings=None, batch_size=16, buffer_size=100000, use_beam=False, beam_width=2, device="cuda", 
        model_info=False):  # report

    model = BaseModel.load(model_path).to(device)
    if model_info:
        print(model)

    # settings
    if hasattr(model, '_settings'):  # new models should all have _settings
        settings = model._settings
    elif settings:
        print("Using user specified settings file: {}".format(settings))
        with utils.shutup():
            settings = settings_from_file(settings)
    else:
        print("Warning! Using default settings")
        with utils.shutup():
            settings = load_default_settings()

    # overwrite defaults
    settings.batch_size = batch_size
    settings.buffer_size = buffer_size
    settings.device = device
    settings.shuffle = False    # avoid shuffling

    # read datasets
    trainset = None
    if train_path:
        trainset = Dataset(
            settings, Reader(settings, train_path), model.label_encoder)
    elif hasattr(settings, "input_path") and \
         settings.input_path and os.path.exists(settings.input_path):
        print("--- Using train set from settings")
        trainset = Dataset(
            settings, Reader(settings, settings.input_path), model.label_encoder)
    else:
        print("--- Not using trainset to evaluate known/unknown tokens")

    if not len(test_path) and hasattr(settings, "test_path"):
        print("--- Using test set from settings")
        test_path = (settings.test_path, )

    testset = Dataset(settings, Reader(settings, *test_path), model.label_encoder)

    return {task:score for task, score in model.evaluate(testset, trainset,
                               use_beam=use_beam, beam_width=beam_width).items()}

In [46]:
results = run(
    model_path="/home/thibault/dev/latin-lasla-models/model.tar",
    test_path=["/home/thibault/dev/thesis-lasla-pipeline/test.tsv"],
    train_path="/home/thibault/dev/thesis-lasla-pipeline/train.tsv"
    #test_path=["/home/thibault/dev/thesis-lasla-pipeline/mood-tense-voice-pft-clitics-uppercase/test.tsv"]
)

643it [01:09,  9.19it/s]


In [62]:
from typing import List

full_results: List[bool] = [
    True for _ in range(len(results["lemma"].preds))    
]
full_results_unknown: List[bool] = [   
]
lemma = []

tasks = {
    task: []
    for task in results if task not in ("Dis", "Entity", "lemma")
}

known_tokens = set(results["lemma"].known_tokens)
for task, scorer in results.items():
    if task in ("Dis", "Entity"):
        #print(scorer.__dict__.keys())
        continue
        
    for idx, (p, t, cur, tok) in enumerate(zip(scorer.preds, scorer.trues, full_results, scorer.tokens)):
        full_results[idx] = cur and p == t
        
        if task == "lemma":
            lemma.append(p == t)
        elif t != "_":
            tasks[task].append(p == t)
        if tok not in known_tokens:
            full_results_unknown.append(full_results[idx])
    # scorer.print_summary()


In [74]:
from tabulate import tabulate
table = [
    ("Mesure", "Score")
]
#data.append(("Lemme", f"{lemma.count(True)*100/len(lemma):.2f}"))
table.append(("Tâches Agrégées", f"{full_results.count(True)*100/len(full_results):.2f}"))
table.append(("Tâches Agrégées (Formes inconnues)",
             f"{full_results_unknown.count(True)*100/len(full_results_unknown):.2f}"))
for task, task_score in tasks.items():
    table.append((f"{task}", f"{task_score.count(True)*100/len(task_score):.2f}"))
    
print(tabulate(table[1:], headers=table[0], tablefmt="latex"))

\begin{tabular}{lr}
\hline
 Mesure                             &   Score \\
\hline
 Tâches Agrégées                    &   85.86 \\
 Tâches Agrégées (Formes inconnues) &   76.68 \\
 pos                                &   96.49 \\
 Gend                               &   89.98 \\
 Numb                               &   96.44 \\
 Case                               &   87.84 \\
 Deg                                &   93.37 \\
 Mood\_Tense\_Voice                   &   94.44 \\
 Person                             &   98.49 \\
\hline
\end{tabular}
