Analyse des erreurs du lemmatiseur
==============================

## Étapes

1. Parsage
2. Comparaison Synchronie vs. Diachronie (Ablatif comme adverbe, Participe comme adjectif, etc.)
3. Décompte du nombre de lemmes nouveaux

## 1. Parsage

In [10]:
# Parse the "lemma Confusion Matrix" table of the report.
#   errors: list of (true, pred, count) tuples, one per table row
#   preds:  every predicted lemma as written in the table (one per row)
#   trues:  every expected lemma as written in the table (one per non-empty cell)
errors = []
preds = []
trues = []
# Explicit encoding: the report contains the non-ASCII separator "界", so the
# platform default encoding must not be relied upon.
with open("LemmatisationReport.md", encoding="utf-8") as f:
    started = False
    # Expected lemma carried over to rows whose first cell is empty.
    # Starts as "" (not None) so the "界" membership test below always works.
    last_true = ""
    for line in f:
        if not line.strip():
            continue
        if line.endswith("lemma Confusion Matrix\n"):
            # Marker printed when the section header has been found.
            print("Heelo")
            started = True
            continue
        elif line.startswith("#"):
            # Any other heading closes the section of interest.
            started = False
            continue
        elif "Expected" in line:
            # Table header row.
            continue
        elif "|---" in line:
            # Table separator row.
            continue

        if not started:
            continue

        # We are now dealing only with the table rows.
        # Empty strings produced by the leading/trailing "|" are dropped;
        # whitespace-only cells survive the filter and are stripped to "".
        true, _, pred, cnt = [x.strip() for x in line.strip().split("|") if x]
        preds.append(pred)
        if true:
            last_true = true
            trues.append(true)
        else:
            # Continuation row: reuse the last expected lemma seen.
            true = last_true

        # When both lemmas contain "界", keep only the part before it.
        # Note that preds/trues above keep the untrimmed values.
        if "界" in true and "界" in pred:
            true = true.split("界")[0]
            pred = pred.split("界")[0]

        errors.append((true, pred, int(cnt)))
print(errors)

Heelo
[('qui', 'quod', 111), ('qui', 'quis', 103), ('qui', 'quam', 39), ('qui', 'quo', 24), ('qui', 'qua', 11), ('qui', 'quiuis', 1), ('quis', 'qui', 156), ('quis', 'quo', 8), ('quis', 'quam', 5), ('quis', 'quod', 3), ('quis', 'qua', 1), ('quis', 'cuius', 1), ('quod', 'qui', 103), ('quod', 'quis', 5), ('multus', 'multum', 37), ('multus', 'multi', 19), ('bonus', 'bonum', 14), ('bonus', 'bene', 8), ('bonus', 'boni', 3), ('bonus', 'Bonus', 1), ('primus', 'primum', 21), ('primus', 'primo', 4), ('sui', 'suus', 24), ('sui', 'suum', 1), ('malus', 'malum', 21), ('malus', 'mala', 2), ('malus', 'mali', 1), ('quam', 'qui', 22), ('quam', 'quis', 1), ('facio', 'factum', 20), ('facio', 'facies', 1), ('facio', 'sum', 1), ('facio', 'fax', 1), ('quo', 'qui', 22), ('bonum', 'bonus', 18), ('bonum', 'boni', 4), ('factum', 'facio', 19), ('multi', 'multus', 17), ('meus', 'ego', 13), ('meus', 'meum', 2), ('meus', 'mei', 1), ('tantus', 'tantum', 16), ('paratus', 'paro', 15), ('boni', 'bonus', 9), ('boni', 'bo

In [24]:
# Parse the "lemma Classification report" table of the report.
# singles maps a target lemma to (precision, recall, f1, support); rows whose
# support cell is "0" are left out.
singles = {}
# Explicit encoding: the report contains the non-ASCII separator "界", so the
# platform default encoding must not be relied upon.
with open("LemmatisationReport.md", encoding="utf-8") as f:
    started = False
    for line in f:
        if not line.strip():
            continue
        if line.endswith("lemma Classification report\n"):
            started = True
            continue
        elif line.startswith("#"):
            # Any other heading closes the section of interest.
            started = False
            continue
        elif "target" in line:
            # Table header row.
            continue
        elif "|---" in line:
            # Table separator row.
            continue

        if not started:
            continue

        # We are now dealing only with the table rows; extra trailing
        # columns, if any, are ignored.
        tgt, precision, recal, f1, support, *_ = [x.strip() for x in line.strip().split("|") if x]
        if support != "0":
            singles[tgt] = (float(precision), float(recal), float(f1), int(support))
            

## 2. Comparaison Synchronie vs. Diachronie

### 2.1 Chargement du lemmatiseur

In [2]:
from pycollatinus import Lemmatiseur
# Single Collatinus lemmatiser instance; section 2.2 calls its
# `lemmatise` method on every lemma collected from the report.
lemmatizer = Lemmatiseur()

### 2.2 Lemmatisation de tous les lemmes

In [12]:
from collections import defaultdict
# Map every distinct lemma string seen in the confusion matrix (expected or
# predicted) to the list of analyses yielded by the lemmatiser for it.
lemmatisations = {
    form: list(lemmatizer.lemmatise(form))
    for form in set(preds + trues)
}

### 2.3 Comparaison suivant certaines catégories

In [13]:
def is_particip_parf(lemma):
    """Return the lemma behind `lemma` when it is a perfect passive participle.

    The cached analyses in `lemmatisations[lemma]` are scanned in order; the
    first one whose "morph" is the nominative or accusative masculine
    singular perfect passive participle provides the returned "lemma".

    Returns -1 when no analysis matches.
    Raises KeyError when `lemma` has no entry in `lemmatisations`.
    """
    participle_morphs = (
        "nominatif masculin singulier participe parfait passif",
        "accusatif masculin singulier participe parfait passif",
    )
    return next(
        (
            analysis["lemma"]
            for analysis in lemmatisations[lemma]
            if analysis["morph"] in participle_morphs
        ),
        -1,
    )

def is_particip_pres(lemma):
    """Return the lemma behind `lemma` when it is a present active participle.

    Only the nominative masculine singular analysis is considered. The first
    matching entry of `lemmatisations[lemma]` provides the returned "lemma".

    Returns -3 when no analysis matches.
    Raises KeyError when `lemma` has no entry in `lemmatisations`.
    """
    wanted_morph = "nominatif masculin singulier participe présent actif"
    return next(
        (
            analysis["lemma"]
            for analysis in lemmatisations[lemma]
            if analysis["morph"] == wanted_morph
        ),
        -3,
    )

def is_verb(lemma):
    """Return the lemma behind `lemma` when it is a 1st person present verb.

    The first entry of `lemmatisations[lemma]` whose "morph" is the first
    person singular present active indicative provides the returned "lemma".

    Returns -2 when no analysis matches.
    Raises KeyError when `lemma` has no entry in `lemmatisations`.
    """
    wanted_morph = "1ère singulier indicatif présent actif"
    return next(
        (
            analysis["lemma"]
            for analysis in lemmatisations[lemma]
            if analysis["morph"] == wanted_morph
        ),
        -2,
    )

def share_nominatif(lemma):
    """Return the lemma of the first nominative singular analysis of `lemma`.

    Analyses in `lemmatisations[lemma]` are scanned in order; an analysis
    matches when its "morph" is a nominative singular (bare, neuter,
    feminine or masculine).

    Returns `lemma` itself, unchanged, when no analysis matches.
    Raises KeyError when `lemma` has no entry in `lemmatisations`.
    """
    nominative_morphs = (
        "nominatif singulier",
        "nominatif neutre singulier",
        "nominatif féminin singulier",
        "nominatif masculin singulier",
    )
    for analysis in lemmatisations[lemma]:
        if analysis["morph"] not in nominative_morphs:
            continue
        return analysis["lemma"]
    return lemma

# Error counts per confusion category, plus the grand total of all errors.
errors_repartition = dict.fromkeys(
    (
        "part-parf-as-verb",
        "verb-as-part-parf",
        "part-pres-as-verb",
        "verb-as-part-pres",
        "nom-as-something-else",
        "something-else-as-nom",
        "passif-as-actif",
        "actif-as-passif",
        "same-nominatif",
        "total",
    ),
    0,
)


def _error_category(true, pred):
    """Return the `errors_repartition` key for a (true, pred) confusion.

    Checks run in a fixed order and the first match wins. Returns None when
    the prediction is one of the skipped pronoun forms or when no category
    applies. Both arguments must be keys of `lemmatisations`.
    """
    verb_of_pred = is_verb(pred)
    verb_of_true = is_verb(true)

    # Confusions whose prediction is one of these forms are never categorised.
    if pred in ("qui", "quod", "quis", "quam", "qua"):
        return None
    # Lemmas differing only by a final "r".
    if true + "r" == pred:
        return "actif-as-passif"
    if true == pred + "r":
        return "passif-as-actif"
    # The helpers return different negative sentinels on failure (-1, -2, -3),
    # so two failed lookups never compare equal here.
    if is_particip_parf(true) == verb_of_pred:
        return "part-parf-as-verb"
    if verb_of_true == is_particip_parf(pred):
        return "verb-as-part-parf"
    if is_particip_pres(true) == verb_of_pred:
        return "part-pres-as-verb"
    if verb_of_true == is_particip_pres(pred):
        return "verb-as-part-pres"
    if share_nominatif(true) == pred:
        return "nom-as-something-else"
    if true == share_nominatif(pred):
        return "something-else-as-nom"
    if share_nominatif(true) == share_nominatif(pred):
        return "same-nominatif"
    return None


for true, pred, cnt in errors:
    # Every error counts towards the total, categorised or not.
    errors_repartition["total"] += cnt

    # Trimmed lemmas may have no cached analyses; those are not categorised.
    if not (true in lemmatisations and pred in lemmatisations):
        continue

    category = _error_category(true, pred)
    if category is not None:
        errors_repartition[category] += cnt

print(errors_repartition)

{'part-parf-as-verb': 240, 'verb-as-part-parf': 245, 'part-pres-as-verb': 51, 'verb-as-part-pres': 42, 'nom-as-something-else': 158, 'something-else-as-nom': 297, 'passif-as-actif': 22, 'actif-as-passif': 29, 'same-nominatif': 58, 'total': 4395}


### 2.4 Affichage relatif

In [14]:
# Print each error category as a percentage of all errors, then the
# cumulated share of every category.
print(errors_repartition)

# NOTE(review): presumably the number of evaluated tokens; it is only
# referenced by the disabled support-relative display below — confirm.
support = 169822
# 169822-0.9752*169822
total_errors = errors_repartition["total"]
total_diachronical = 0
for type_error, cnt_errors in errors_repartition.items():
    if type_error == "total":
        continue
    #print("{}: {:.4}".format(type_error, cnt_errors/support))
    print("{}: {:.2f} % of errors".format(type_error, cnt_errors*100/total_errors))
    total_diachronical += cnt_errors

print("---")
# The label is spelled out explicitly: the previous code reused the loop
# variable, which only read "total" because that key is last in the dict.
print("{}: {:.2f} % of errors".format("total", total_diachronical*100/total_errors))

{'part-parf-as-verb': 240, 'verb-as-part-parf': 245, 'part-pres-as-verb': 51, 'verb-as-part-pres': 42, 'nom-as-something-else': 158, 'something-else-as-nom': 297, 'passif-as-actif': 22, 'actif-as-passif': 29, 'same-nominatif': 58, 'total': 4395}
part-parf-as-verb: 5.46 % of errors
verb-as-part-parf: 5.57 % of errors
part-pres-as-verb: 1.16 % of errors
verb-as-part-pres: 0.96 % of errors
nom-as-something-else: 3.59 % of errors
something-else-as-nom: 6.76 % of errors
passif-as-actif: 0.50 % of errors
actif-as-passif: 0.66 % of errors
same-nominatif: 1.32 % of errors
---
total: 25.98 % of errors


## 3. Lemmes existants, lemmes inexistants

### 3.1 Chargement de la base de lemmes existants

In [6]:
# Load the set of known lemmas: first tab-separated column of each line of
# the LASLA export, lower-cased, without its trailing disambiguation digit.
lemmatas = set()
with open("LASLA-Lemmas/out.txt", encoding="utf-8") as f:
    next(f, None)  # Skip the first (header) line; tolerate an empty file.
    for raw_line in f:
        entry = raw_line.strip().split("\t")[0]
        # Guard against blank lines / empty first column, which previously
        # raised IndexError on entry[-1].
        if entry and entry[-1].isnumeric():  # Remove disambiguation index
            entry = entry[:-1]
        if entry:
            lemmatas.add(entry.lower())

print(len(lemmatas))

25185


### 3.2 Calcul du nombre d'inconnus

In [7]:
# Count errors whose predicted lemma is absent from the LASLA lemma set.
#   unknown:         summed counts of all such errors
#   totally_unknown: the subset whose prediction also has no entry in
#                    `lemmatisations`
#   unknown_pred:    the (true, pred) pairs concerned
# NOTE(review): the second report line is labelled "included in Collatinus"
# but prints `totally_unknown` — confirm the label matches the intent.
unknown = 0
totally_unknown = 0
unknown_pred = []

for true, pred, cnt in errors:
    if pred in lemmatas:
        continue

    unknown += cnt
    if pred not in lemmatisations:
        totally_unknown += cnt
    # The prediction is printed whether or not it has cached analyses.
    print(pred)
    unknown_pred.append((true, pred))

all_errors = errors_repartition["total"]
print("{}: {:.2f} % of errors ({})".format("Unknown", unknown*100/all_errors, unknown))
print("{} (included in Collatinus): {:.2f} % of errors ({})".format(
    "Unknown", totally_unknown*100/all_errors, totally_unknown
))

bons
eumo
utertia
utentia
moenio
ardes
qui界que
quisquisque
si界uolo
bella
nocturne
nocturna
simplicius
simplicus
teucrus
nitus
reuersus
tonde
innoto
insta
promero
impertus
imperto
attinus
attinae
turicrema
myrrhena
caesea
caesio
frix
frigens
hercynia
solutus界sum
sanes
memacenes
memacenus
laris
uiges
clito
cliton
mytilenus
licita
misella
misellis
opificum
narniensis
summa界sumptus
uenumdo
subpar
supara
lano
essurio
rixus
inaris
inarimen
antisto
titia
apelle
facisso
priauo
captiue
lotus
typanus
saepeo
impauidi
orbor
seuiro
excludo界sum
aprilia
quatuordeo
alanis
utrinque
libyci
superincumbus
honoris
carius
lechaeus
hirquus
baii
atrienses
perdelis
accliue
rosetus
cibo
multifarius
aletrinati
internito
oblideo
singlariter
calcio
pulmus
nasut
sterquilinus
commoeto
borestorus
exento
hirrius
suptero
cum界que
philiteus
sallustus
semustus
pleades
sameramis
lai
pammeni
antecellus
carnificus
resico
rhinocer
maedus
melicher
acosmus
dorco
charito
mius
cataplexus
larinus
excutido
cappadocas
duplica
aeeta


## 4. Orthographe du lemme

In [52]:
from collections import defaultdict

# For each character bigram (with "_" marking word start and end), collect
# one `1` per estimated error of every rare lemma containing that bigram.
elem = defaultdict(list)

for lemma, scores in singles.items():
    precision, _recall, _f1, support = scores

    # Keep only lemmas with a non-zero support of at most 10 and no "界".
    if support == 0 or support > 10 or "界" in lemma:
        continue

    # Estimated number of errors, derived from support and precision.
    error_count = round(support * (1 - precision))
    if error_count <= 0:
        continue

    word = lemma.lower()
    # Each bigram is counted once per lemma, however often it occurs in it.
    bigrams = {left + right for left, right in zip("_" + word, word + "_")}
    for bigram in bigrams:
        elem[bigram].extend([1] * error_count)

# (bigram, total estimated errors) pairs.
macro_elem = [(bigram, sum(ones)) for bigram, ones in elem.items()]

# The 100 bigrams carrying the most errors (displayed as the cell result).
sorted(macro_elem, key=lambda pair: pair[1], reverse=True)[:100]

[('s_', 638),
 ('us', 432),
 ('tu', 257),
 ('o_', 250),
 ('um', 231),
 ('er', 221),
 ('m_', 219),
 ('_p', 210),
 ('a_', 183),
 ('_s', 174),
 ('_c', 160),
 ('_a', 158),
 ('ri', 153),
 ('en', 146),
 ('is', 143),
 ('iu', 139),
 ('in', 137),
 ('es', 131),
 ('an', 118),
 ('te', 116),
 ('li', 111),
 ('co', 110),
 ('at', 109),
 ('su', 109),
 ('ar', 108),
 ('ra', 107),
 ('_i', 106),
 ('or', 104),
 ('ti', 103),
 ('re', 101),
 ('la', 100),
 ('pe', 100),
 ('ta', 97),
 ('st', 96),
 ('nu', 95),
 ('_m', 95),
 ('al', 93),
 ('ns', 92),
 ('ni', 91),
 ('it', 89),
 ('ca', 87),
 ('r_', 86),
 ('ic', 85),
 ('ue', 83),
 ('_u', 81),
 ('e_', 80),
 ('se', 80),
 ('_t', 80),
 ('on', 78),
 ('_e', 78),
 ('ae', 77),
 ('pr', 77),
 ('_d', 76),
 ('ui', 75),
 ('i_', 75),
 ('nt', 75),
 ('_l', 75),
 ('ro', 74),
 ('de', 74),
 ('lu', 73),
 ('ia', 73),
 ('di', 71),
 ('le', 66),
 ('il', 63),
 ('ru', 62),
 ('cu', 61),
 ('ul', 61),
 ('ma', 61),
 ('na', 60),
 ('ct', 60),
 ('to', 60),
 ('as', 59),
 ('ex', 59),
 ('si', 58),
 ('ce'