In [24]:
# Can be run with python -m pie.scripts.evaluate
import os.path
from pie import utils
from typing import Dict, List


from pie.models import BaseModel
from pie.data import Dataset, Reader
from pie.settings import load_default_settings, settings_from_file
from pie.tagger import Tagger
from pie.utils import model_spec



def run(model_path, test_path, train_path=None, # data
        # decoding
        settings_path=None, batch_size=16, buffer_size=100000, use_beam=False, beam_width=2, device="cuda", 
        main_setting_task = None):  # report
    
    scorers = {}
    
    for model_no, (model, tasks) in enumerate(model_spec(model_path)):
        print(" - model: {}".format(model))
        print(" - tasks: {}".format(", ".join(tasks)))
        
        model = BaseModel.load(model).to(device)
        # settings
        if hasattr(model, '_settings'):  # new models should all have _settings
            settings = model._settings
        elif settings_path:
            print("- Using user specified settings file: {}".format(settings_path))
            with utils.shutup():
                settings = settings_from_file(settings_path)
        else:
            print("- Warning! Using default settings")
            with utils.shutup():
                settings = load_default_settings()
        
        settings.batch_size = batch_size
        settings.buffer_size = buffer_size
        settings.device = device
        settings.shuffle = False    # avoid shuffling

        # read datasets
        if train_path:
            trainset = Dataset(settings, Reader(settings, train_path), model.label_encoder)
        elif hasattr(settings, "input_path") and \
             settings.input_path and os.path.exists(settings.input_path):
            print("--- Using train set from settings")
            trainset = Dataset(
                settings, Reader(settings, settings.input_path), model.label_encoder)
        else:
            print("--- Not using trainset to evaluate known/unknown tokens")

        if not len(test_path) and hasattr(settings, "test_path"):
            print("--- Using test set from settings")
            test_path = (settings.test_path, )

        testset = Dataset(settings, Reader(settings, *test_path), model.label_encoder)

        scorers.update(model.evaluate(testset, trainset=trainset))
        
    return scorers

In [28]:
results, tasks = run(
    model_path="</home/thibault/dev/latin-lasla-models/Case.tar,Case></home/thibault/dev/latin-lasla-models/POS.tar,pos>",
    test_path=["/home/thibault/dev/thesis-lasla-pipeline/test.tsv"],
    train_path="/home/thibault/dev/thesis-lasla-pipeline/train.tsv"
    #test_path=["/home/thibault/dev/thesis-lasla-pipeline/mood-tense-voice-pft-clitics-uppercase/test.tsv"]
)

 - model: /home/thibault/dev/latin-lasla-models/Case.tar
 - tasks: Case


643it [00:11, 57.39it/s] 


 - model: /home/thibault/dev/latin-lasla-models/POS.tar
 - tasks: pos


643it [00:10, 58.92it/s] 


In [74]:
import matplotlib.pyplot as plt

"""def dict(scorer):
    return {
        "known_tokens": self.known_tokens,
        "amb_tokens": self.amb_tokens,
        "preds": self.preds,
        "trues": self.trues,
        "tokens": self.tokens
    }
"""

from sklearn.metrics import precision_recall_fscore_support, classification_report, balanced_accuracy_score

elements = []

def normalize_nom(value):
    if value[:3] == "NOM":
        return "NOM"
    return value
CLEAN_POS = True

for task, scorer in results.items():
    # (y_true, y_pred,
    if task == "pos" and CLEAN_POS is True:
        data = classification_report(
            results[task].trues,
            [normalize_nom(x) for x in results[task].preds],
            output_dict=True
        )
    else:
        data = classification_report(results[task].trues, results[task].preds, output_dict=True)
    elements.append({
        **{"task": task,
         "accuracy": data["accuracy"]*100}, 
        **{
            f"{key[:4]}-{category[:3].strip('-')}": value*100
            for key in ('macro avg', "weighted avg")
            for category, value in data[key].items()
            if category != "support"
        }
    })
    
from tabulate import tabulate
print(tabulate(elements, floatfmt=".2f", headers="keys"))

# Précision: 
#  quand un lemme est choisi, on compare les bonnes réponses aux réponses bonnes et mauvaises pour tel lemme
# Rappel
#  Quand un lemme est choisi, on compare les bonnes réponses aux bonnes réponses + bonnes réponses manquantes
# Macro:
#  Calculate metrics for each label, and find their unweighted mean. 
#      This does not take label imbalance into account.
#  -> Sur-représente de potentielles rares ou uniques formes.
# Weighted:
#  Calculate metrics for each label, and find their average weighted by support 
#   -> Avantage les formes très courantes

  _warn_prf(average, modifier, msg_start, len(result))


task      accuracy    macr-pre    macr-rec    macr-f1    weig-pre    weig-rec    weig-f1
------  ----------  ----------  ----------  ---------  ----------  ----------  ---------
Case         94.64       90.38       88.82      89.56       94.63       94.64      94.63
pos          96.97       88.51       88.09      88.27       96.96       96.97      96.96


In [7]:
from typing import List

full_results: List[bool] = [
    True for _ in range(len(results["lemma"].preds))    
]
full_results_unknown: List[bool] = [   
]
lemma = []

tasks = {
    task: []
    for task in results if task not in ("Dis", "Entity", "lemma")
}

known_tokens = set(results["lemma"].known_tokens)
for task, scorer in results.items():
    if task in ("Dis", "Entity"):
        #print(scorer.__dict__.keys())
        continue
        
    for idx, (p, t, cur, tok) in enumerate(zip(scorer.preds, scorer.trues, full_results, scorer.tokens)):
        full_results[idx] = cur and p == t
        
        if task == "lemma":
            lemma.append(p == t)
        elif t != "_":
            tasks[task].append(p == t)
        if tok not in known_tokens:
            full_results_unknown.append(full_results[idx])
    # scorer.print_summary()


In [8]:
from tabulate import tabulate
table = [
    ("Mesure", "Score")
]
#data.append(("Lemme", f"{lemma.count(True)*100/len(lemma):.2f}"))
table.append(("Tâches Agrégées", f"{full_results.count(True)*100/len(full_results):.2f}"))
table.append(("Tâches Agrégées (Formes inconnues)",
             f"{full_results_unknown.count(True)*100/len(full_results_unknown):.2f}"))
for task, task_score in tasks.items():
    table.append((f"{task}", f"{task_score.count(True)*100/len(task_score):.2f}"))
    
print(tabulate(table[1:], headers=table[0], tablefmt="latex"))

\begin{tabular}{lr}
\hline
 Mesure                             &   Score \\
\hline
 Tâches Agrégées                    &   61.72 \\
 Tâches Agrégées (Formes inconnues) &   61.97 \\
 pos                                &   68.38 \\
 Gend                               &   89.95 \\
 Numb                               &   96.74 \\
 Case                               &   87.69 \\
 Deg                                &   92.54 \\
 Mood\_Tense\_Voice                   &   94.26 \\
 Person                             &   98.62 \\
\hline
\end{tabular}
