## Objectifs

Analyser les morphologies existantes pour valider la liste des morphologies autorisées établie à la main.

### Étapes

1. Liste des morphologies dans les fichiers train, test, dev + décompte
2. Affichage pour débogage
3. Extraction des informations de la liste manuelle
4. Recherche des cas non présents dans la liste manuelle.
5. Génération de la liste de contrôle

In [1]:
import glob

from collections import Counter, defaultdict


# Nested tally: morph_counter[pos][tag_string] -> number of occurrences.
# Missing POS keys get an empty Counter on first access.
morph_counter = defaultdict(Counter)

def get_morph(dico):
    """Serialise the morphological features of one row as a tag string.

    The features are read from ``dico`` in a fixed order (Case, Numb, Gend,
    Deg, Mood, Tense, Voice, Person).  Features whose value is ``"_"`` are
    left out; the others are rendered as ``Name=value`` and joined with ``|``.

    Every feature key must be present in ``dico`` (``KeyError`` otherwise).
    Returns an empty string when all features are ``"_"``.
    """
    pairs = []
    for feature in ("Case", "Numb", "Gend", "Deg", "Mood", "Tense", "Voice", "Person"):
        value = dico[feature]
        if value == "_":
            continue
        pairs.append(feature + "=" + value)
    return "|".join(pairs)


# Tally, per POS, every morphological tag string found in the corpus TSV files.
for path in glob.glob("/home/thibault/dev/LASLA/mood-tense-voice-pft/*.tsv"):
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.  The explicit
    # encoding makes decoding independent of the platform locale.
    with open(path, encoding="utf-8") as handle:
        for lineno, line in enumerate(handle):
            # Strip only the line terminator: a bare strip() would also eat
            # leading/trailing tabs and thereby shift or drop empty columns.
            fields = line.rstrip("\r\n").split("\t")
            if lineno == 0:
                # The first line of each file holds the column names.
                header = fields
            elif not line.strip():
                # Skip blank (whitespace-only) lines.
                continue
            else:
                line = dict(zip(header, fields))
                # The combined column carries Mood, Tense and Voice separated
                # by "|"; "_" means that none of the three is set.
                if line["Mood_Tense_Voice"] != "_":
                    line["Mood"], line["Tense"], line["Voice"] = line["Mood_Tense_Voice"].split("|")
                else:
                    line["Mood"], line["Tense"], line["Voice"] = "_", "_", "_"
                reformatted = get_morph(line)
                pos = line["pos"]
                morph_counter[pos][reformatted] += 1

In [55]:
# Optional dump of every tag count, grouped by POS; switched off by default.
Debug = False
if Debug:
    for key in sorted(morph_counter):
        print("POS: " + key)
        tag_counts = morph_counter[key]
        for morph in sorted(tag_counts):
            print("\t{cnt} -> {morph_code}".format(cnt=tag_counts[morph], morph_code=morph))

In [95]:
## Étape 3 : Parsage des codes manuels
import csv

# Step 3: collect, per POS, every morphological tag string listed in the
# hand-written code table.
exists = defaultdict(list)
# newline="" is what the csv module expects for the file objects it reads;
# the explicit encoding keeps decoding independent of the platform locale.
with open("1.4.5.X - Codes.csv", encoding="utf-8", newline="") as f:
    # Column names are supplied here, so the file's own header row is skipped
    # below.  Each feature has a human-readable column followed by its code.
    # NOTE(review): the two trailing '_' names look like placeholders for
    # unused columns -- confirm against the spreadsheet.
    reader = csv.DictReader(f, delimiter="\t", fieldnames=[
        'pos-readable', 'pos', 
        'Case-readable', 'Case', 
        'Numb-readable', 'Numb', 
        'Gend-readable', 'Gend', 
        'Deg-readable', 'Deg', 
        'Mood-readable', 'Mood', 
        'Tense-readable', 'Tense', 
        'Voice-readable', 'Voice', 
        'Person-readable', 'Person' , 
        '_', '_'
    ])
    for line_no, line in enumerate(reader):
        if line_no == 0:
            continue  # We ignore the first line because we provide our own header
        # Empty or missing cells are normalised to "_" so that get_morph
        # treats them as unset features.
        morph = get_morph({key: val or "_" for key, val in line.items()})
        pos = line["pos"]
        exists[pos].append(morph)

print("%s found POS" % len(exists) )

28 found POS


In [102]:
## Étape 4: comparaison

class color:
    """ANSI escape sequences used to colour or style terminal output."""

    PURPLE = "\033[95m"
    CYAN = "\033[96m"
    DARKCYAN = "\033[36m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
    # Resets every colour/style attribute set by the codes above.
    END = "\033[0m"
    
# Number of counted occurrences per POS, used to express each tag as a
# percentage of that POS.
total_counter = {
    pos: sum(morph.values())
    for pos, morph in morph_counter.items()
}

# For every POS seen in the corpus, report the tags that are absent from the
# manual list, least frequent first.
for pos in sorted(morph_counter):
    print("POS: "+pos)
    if pos not in exists:
        # Both warning lines are printed in red; the colour is reset at the
        # end of the second one, otherwise all later output would stay red.
        print(color.RED + "---- POS NOT FOUND (%s codes )" % len(morph_counter[pos]))
        print("--------> Codes %s " % ", ".join(morph_counter[pos].keys()) + color.END)
        continue
    print("--- Information : %s manually added morph codes, %s found" % (len(exists[pos]), len(morph_counter[pos])))
    # Build the lookup set once per POS instead of scanning the list for
    # every tag.
    allowed = set(exists[pos])
    # Tags seen only once are reported as well.
    for morph, morph_cnt in sorted(morph_counter[pos].items(), key=lambda x: x[1]):
        if morph not in allowed:
            print(
                "\t{cnt:.2f}% -> {morph_code}".format(
                    cnt=morph_cnt*100/total_counter[pos],
                    morph_code=morph
                )
            )

POS: 
--- Information : 36 manually added morph codes, 2 found
	5.88% -> Gend=MascFem
POS: ADJadv.mul
--- Information : 1 manually added morph codes, 1 found
POS: ADJadv.ord
--- Information : 3 manually added morph codes, 3 found
POS: ADJcar
--- Information : 85 manually added morph codes, 41 found
POS: ADJdis
--- Information : 84 manually added morph codes, 15 found
POS: ADJmul
--- Information : 252 manually added morph codes, 22 found
POS: ADJord
--- Information : 252 manually added morph codes, 61 found
	0.03% -> Deg=Sup
POS: ADJqua
--- Information : 255 manually added morph codes, 136 found
	0.00% -> Deg=Sup
	0.00% -> Deg=Comp
	0.01% -> Deg=Pos
POS: ADV
--- Information : 3 manually added morph codes, 3 found
POS: ADVint
--- Information : 1 manually added morph codes, 3 found
	0.01% -> Gend=Fem
	0.01% -> Gend=MascNeut
POS: ADVint.neg
--- Information : 1 manually added morph codes, 1 found
POS: ADVneg
--- Information : 1 manually added morph codes, 1 found
POS: ADVrel
--- Information

In [113]:
## Étape 5 : Liste de contrôle !
import csv

def get_morph_and_readable(dico):
    """Return the tag string of a row together with its readable label.

    Features are considered in a fixed order (Case, Numb, Gend, Deg, Mood,
    Tense, Voice, Person) and skipped when their value is ``"_"``.

    Returns a 2-tuple:
      * the ``Name=value`` pairs of the kept features joined with ``|``;
      * the matching ``<Name>-readable`` values joined with single spaces.

    Both strings are empty when every feature is ``"_"``.  ``KeyError`` is
    raised if a feature key is missing, or if the ``-readable`` key of a kept
    feature is missing.
    """
    features = ("Case", "Numb", "Gend", "Deg", "Mood", "Tense", "Voice", "Person")
    # Decide first which features are set, then render both views from them.
    kept = [name for name in features if dico[name] != "_"]
    code = "|".join(name + "=" + dico[name] for name in kept)
    label = " ".join(dico[name + "-readable"] for name in kept)
    return code, label

# Step 5: build the control list -- one entry per tag string, holding its
# readable label and the POS it is listed for in the manual code table.
morph_codes = defaultdict(lambda: {"readable": "", "POS": []})
# newline="" is what the csv module expects for the file objects it reads;
# the explicit encoding keeps the accented labels intact whatever the
# platform locale is.
with open("1.4.5.X - Codes.csv", encoding="utf-8", newline="") as f:
    # Column names are supplied here, so the file's own header row is skipped
    # below.  Each feature has a human-readable column followed by its code.
    reader = csv.DictReader(f, delimiter="\t", fieldnames=[
        'pos-readable', 'pos', 
        'Case-readable', 'Case', 
        'Numb-readable', 'Numb', 
        'Gend-readable', 'Gend', 
        'Deg-readable', 'Deg', 
        'Mood-readable', 'Mood', 
        'Tense-readable', 'Tense', 
        'Voice-readable', 'Voice', 
        'Person-readable', 'Person' , 
        '_', '_'
    ])
    for line_no, line in enumerate(reader):
        if line_no == 0:
            continue  # We ignore the first line because we provide our own header
        if not line["pos"]:
            # Rows without a POS code are left out of the control list.
            continue
        # Empty or missing cells are normalised to "_" (= feature not set).
        morph, readable = get_morph_and_readable({key: val or "_" for key, val in line.items()})
        if morph == "":
            # Rows with no feature at all get an explicit placeholder label.
            morph = "MORPH=Empty"
            readable = "Empty"
        pos = line["pos"]
        morph_codes[morph]["readable"] = readable
        if pos in morph_codes[morph]["POS"]:
            # Diagnostic: this POS is already listed for this tag.  It is
            # still appended below, as before.
            print(morph_codes[morph]["POS"], morph)
        morph_codes[morph]["POS"].append(pos)


# One tab-separated line per tag: "<label>\t<readable> - <POS,POS,...>".
lines = [
    "{code}\t{name} - {pos}".format(
        code=morph_code,
        name=morph_info["readable"],
        pos=",".join(morph_info["POS"])
    )
    for morph_code, morph_info in morph_codes.items()
]

print(len(lines))
print("label\treadable")
print("\n".join(lines))

1518
label	readable
Case=Nom|Numb=Sing	Nominatif Singulier - NOM,PROper
Case=Voc|Numb=Sing	Vocatif Singulier - NOM,PROper
Case=Acc|Numb=Sing	Accusatif Singulier - NOM,PROper,PROref
Case=Gen|Numb=Sing	Génitif Singulier - NOM,PROper,PROref
Case=Dat|Numb=Sing	Datif Singulier - NOM,PROper,PROref
Case=Abl|Numb=Sing	Ablatif Singulier - NOM,PROper,PROref
Case=Loc|Numb=Sing	Locatif Singulier - NOM
Case=Ind	Indéclinable - NOM,ADJcar,PROdem,PROrel,PROint,PROind
Case=Nom|Numb=Plur	Nominatif Pluriel - NOM,PROper
Case=Voc|Numb=Plur	Vocatif Pluriel - NOM,PROper
Case=Acc|Numb=Plur	Accusatif Pluriel - NOM,PROper,PROref
Case=Gen|Numb=Plur	Génitif Pluriel - NOM,PROper,PROref
Case=Dat|Numb=Plur	Datif Pluriel - NOM,PROper,PROref
Case=Abl|Numb=Plur	Ablatif Pluriel - NOM,PROper,PROref
Case=Loc|Numb=Plur	Locatif Pluriel - NOM
Case=Nom|Numb=Sing|Gend=Com|Mood=Par|Tense=Pres|Voice=Act	Nominatif Singulier Commun Participe Présent Actif - VER
Case=Voc|Numb=Sing|Gend=Com|Mood=Par|Tense=Pres|Voice=Act	Vocatif Sing