In [1]:
import glob
import re
from collections import defaultdict, Counter

## Étape 1: Parsage de l'OCR

In [2]:
def get_lemmas(debug=False):
    lemma = re.compile("^(?P<pos>[0-9\&LJMNT\-KG]{2})\s+(?P<lemma>[A-Z 0-9]+[A-Z0-9])(?![a-z'’é])(?P<desc>.*)")

    lemmas = [

    ]
    for file in glob.glob("LASLA-Lemmas/LASLA_Lemmes.txt"):
        with open(file) as f:
            for line in f:
                if "LaslaDic" in line or "2008" in line:
                    continue
                elif not line.strip():
                    continue
                elif line.strip().isnumeric():
                    continue
                line = line.strip()

                if lemma.match(line):
                    lemmas.append(lemma.match(line).groupdict())
                else:
                    # We update the last lemma
                    if debug:
                        print("(%s) Updated %s with `%s`" % (file, lemmas[-1]["lemma"], line))
                    lemmas[-1]["desc"] += " " + line

    # Disambiguation
    dis = re.compile("^(?P<lemma>[A-Z ]+) (?P<Dis>[0-9]?[ANC0-9])$")
    for lemma in lemmas:
        lemma["Dis"] = ""
        if dis.match(lemma["lemma"]):
            lemma.update(dis.match(lemma["lemma"]).groupdict())

    # Remove spaces
    for lemma in lemmas:
        lemma["lemma"] = lemma["lemma"].replace(" ", "")

    print(len(lemmas))

    real_lemmas = defaultdict(dict)
    for lemma in lemmas:
        real_lemmas[lemma["lemma"]][lemma["Dis"]] = (lemma["pos"], lemma["desc"])
    return real_lemmas

real_lemmas = get_lemmas(debug=True)

(LASLA-Lemmas/LASLA_Lemmes.txt) Updated AB SCIS VS with `TRVNCVS)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ABSOLVTVS with `=COMPLETV S)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ACCEPTVS with `de la langue Financière)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ACCVRATVS with `l'ORATIO)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ADVENTICIVM with `ADVENTICIO)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ADVLTVS with `de personnes)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated AEQVVM with `locutions avec Prép.);=RECTVM (Sg.,Pf)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated AGENS 2 with `(Grammaire)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ALIENVM with `(PI.)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated APERTVM with `locutions avec Prép.;Sg.,Pl.)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated ASTRICTVS with `terme de rhétorique : Concis`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated AVERSVM with `(Sg.);Sous-entendu LOCA (PI.)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updated BREVE 1 with `(PI.)`
(LASLA-Lemmas/LASLA_Lemmes.txt) Updat

## Étape 2: Comparaison avec le contenu du PDF

### 2.1 Obtention des données

In [3]:
raw_lemmas = defaultdict(set)
_l = re.compile("^(?:[0-9\&LJMNT\-KG]{2}\s+)?([A-Z]{2,})(?![a-z])(?: ([0-9]))?(?![a-z])")
with open("LASLA-Lemmas/rawtext_LASLA.txt") as f:
    for line in f:
        for lemma, dis in _l.findall(line):
            raw_lemmas[lemma].add(dis)
print(len(raw_lemmas))

16993


### 2.2 Comparaison et affichage des manquants

In [4]:
# Check lemmas that would not have been found
IGNORE = [x for x in """
    ADVENTICIO
    CONJVNCTVS SEQVITVR DECVMANI ATQUE COMBVRITVR INDITVS RERVM PENSIS EXHIBETVR PROMISCVO
    REM VIRTVTIBVS OPINANS QVOQVEVERSVS ABHORRENS RVGA IVDICES TEMPORE SVBDITVS CONTEXTVM
    
""".split() if x]
real_lemmas = get_lemmas()
#print(list(raw_lemmas.keys())[:50])
#print(list(real_lemmas.keys())[:50])
count = 0
for lemma in (list(raw_lemmas.keys())):
    dis = raw_lemmas[lemma]
    if lemma not in real_lemmas and lemma not in IGNORE:
        count += 1
        print(lemma, dis)

print("%s missing" % count)

18175
GRATA {''}
1 missing


## Étape 3: Fonctions de mise en forme

In [5]:
def compute_genitif(form, lemma):
    if form == lemma:
        return ", " + lemma[-2:]
    if len(form) <= 5:
        return ", " + form
    return ", " + "".join([
        fc
        for fi, (fc, lc) in enumerate(zip(form, lemma+" "*len(form)))
        # Take into account previous difference and current difference
        if lemma[:fi] != form[:fi] or fc != lc
    ])

def disambiguate(cur_lemma, dis):
    if dis:
        if "N" in dis:
            cur_lemma = cur_lemma.capitalize()
            dis = dis.replace("N", "")
        elif "A" in dis:
            cur_lemma = cur_lemma.capitalize()
            dis = dis.replace("A", "")
        if dis:
            if dis.isnumeric():
                cur_lemma = cur_lemma+dis
            else:
                cur_lemma = cur_lemma+"_"+dis
    return cur_lemma

CODES = {
    "11": ", ae",
    "12": ", i",
    "14": ", us",
    "15": ", ei",
    "21": ", a, um",
    "51": ", as, are"
}
from cltk.stem.latin.declension import CollatinusDecliner
decliner = CollatinusDecliner()


def get_secondary_form(lemma, pos):
    if pos in CODES:
        return CODES[pos]
    if lemma[-1].isnumeric():
        return ""
    if pos.startswith("1") or pos.startswith("2"):
        try:
            info = decliner.decline(lemma)#[3][0]
            if len(info) == 12:
                form = info[3][0]
                return compute_genitif(form, lemma)
            allowed = []
            for form, code in info:
                if code == '--s---mg-':
                    allowed.append(form)
            if lemma == "abstinax":
                print(allowed)
                print(sorted(allowed, key=len, reverse=True)[0])
            if allowed:
                return compute_genitif(sorted(allowed, key=len, reverse=True)[0], lemma)
        except:
            return ""
    elif pos.startswith("5"):
        try:
            info = decliner.decline(lemma)#[3][0]
            keep = []
            for form, code in info:
                if "code" == "v2spia---": # 2eme
                    keep.append(form[-2:])
                elif code == 'v--pna---': # Inf
                    keep.append(form[-3:])
                if len(keep) == 2:
                    return "".join(keep)
        except:
            return ""
    return ""

def get_desc(texte, code):
    sec = ""
    if code.endswith("7") and (code.startswith("1") or code.startswith("2")):
        sec = "(Greek)"
    return " ".join([texte, sec])

## Étape 4: Récupération des formes existantes dans les données d'entraînement

### 4.1 Récupération des données

In [6]:
# Calcul des formes les plus courantes
forms = defaultdict(Counter)
dict_forms = defaultdict(lambda: "")
found_pos = {}
for file in glob.glob("/home/thibault/dev/LASLA/protogenie_classic/*.tsv"):
    with open(file) as f:
        for lineno, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            line = line.split("\t")
            if lineno == 0:
                header = line
            else:
                line = {key: val for key, val in zip(header, line) if val != "_"}
                lemma = disambiguate(line["lemma"], line.get("Dis", ""))
                lemma_no_dis = disambiguate(line["lemma"], "")
                forms[lemma][line["token"]] += 1
                form = ""
                found_pos[lemma] = line["pos"]
                
                if line["pos"].startswith("VER"):
                    # Inf
                    if line.get("Mood") == "Inf" and line.get("Tense") == "Pres" \
                        and line.get("Voice") != "Pass":
                        form = line["token"][-3:]
                        if form not in dict_forms[lemma]:
                            dict_forms[lemma] += ", "+form
                        continue
                    # 2eme personne
                    elif line.get("Mood") == "Ind" and line.get("Tense") == "Pres"\
                        and line.get("Voice") != "Pass" and line.get("Person") == "2"\
                        and line.get("Numb") == "Sing":
                        form = line["token"][-2:]
                        if form.endswith("ris"):
                            form = form[-4:]
                        else:
                            form = form[-2:]
                        if form not in dict_forms[lemma]:
                            dict_forms[lemma] = ", "+form + dict_forms[lemma]
                        continue
                    # 3eme personne
                    elif line.get("Mood") == "Ind" and line.get("Tense") == "Pres"\
                        and line.get("Voice") != "Pass" and line.get("Person") == "3"\
                        and line.get("Numb") == "Sing":
                        form = line["token"]
                        if form.endswith("t"):
                            form = form.replace("t", "s")[-2:]
                        else:
                            form = form.replace("tur", "ris")[-4:]
                        if form not in dict_forms[lemma]:
                            dict_forms[lemma] = ", "+form + dict_forms[lemma]
                        continue
                        
                elif line["pos"].startswith("ADJ"):
                    if line.get("Case") == "Gen" and line.get("Gend") == "Masc" and line.get("Numb") == "1":
                        form = line["token"]
                    elif line.get("Case") in ("Dat", "Abl", "Acc") and not lemma_no_dis.endswith("us"):
                        if line["token"].endswith("e"):
                            form = line["token"][:-1]+"is"
                        if line["token"].endswith("ia"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("i"):
                            form = line["token"][:-1]+"is"
                        elif line["token"].endswith("es"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("um"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("ibus"):
                            form = line["token"][:-4]+"is"
                        elif line["token"].endswith("em"):
                            form = line["token"][:-2]+"is"
                        
                elif line["pos"].startswith("NOM"):
                    if line.get("Case") == "Gen" and line.get("Numb") == "1":
                        form = line["token"]
                    elif line.get("Case") in ("Dat", "Abl", "Acc") and line.get("pos", "").endswith("3"):
                        
                        if line["token"].endswith("e"):
                            form = line["token"][:-1]+"is"
                        if line["token"].endswith("ia"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("i"):
                            form = line["token"][:-1]+"is"
                        elif line["token"].endswith("es"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("um"):
                            form = line["token"][:-2]+"is"
                        elif line["token"].endswith("ibus"):
                            form = line["token"][:-4]+"is"
                        elif line["token"].endswith("em"):
                            form = line["token"][:-2]+"is"

                if form:
                    # Ne marche pas pour abdomen, abdominis
                    dict_forms[lemma] = compute_genitif(form, lemma)

### 4.2 Filtre des formes les plus courantes

In [7]:
most_commons = {
    lemma: sorted(list(form.items()), key=lambda x: x[1], reverse=True)[:5]
    for lemma, form in forms.items()
}

## Étape 5 : Lemmes absents du PDF mais présent dans LASLA

### 5.1 Récupération

In [8]:
# Génération de la liste des lemmes connus
parsed_lemmas = []
for lemma, values in real_lemmas.items():
    lemma = lemma.lower().replace("v", "u")
    for dis, (pos, texte) in values.items():
        parsed_lemmas.append(disambiguate(lemma, dis).lower())
# Check lemma that chould be there but are not ?
missing = []
for lemma in most_commons:
    if lemma not in parsed_lemmas:
        missing.append(lemma)

### 5.2 To dedisambiguated list

In [9]:
def dedisambiguate(lem):
    dis = ""
    if lem[0].isupper():
        dis = "N"
        lem = lem
    if lem[-1].isnumeric():
        dis = lem[-1]+dis
        lem = lem[:-1]
    return lem.upper().replace("U", "V"), dis
        
missing = [(lemma, dedisambiguate(lemma)) for lemma in missing]
print("%s missing lemmas" % len(missing))

8316 missing lemmas


### 5.3 Génération de dictionnaire proche de Étape 1

In [10]:
ps = []
reverse_natures = {
    value: key
    for key, value in {
        "1": "NOM",
        "2": "ADJ",
        "3": "NUM",
        "4": "PRO",
        "5": "VER",
        "6": "ADV",
        "7": "PRE",
        "8": "CON",
        "9": "INJ",
        "N": "AUX"
    }.items()
}

new_lemmas = {}
for cur_lemma, (lemma, dis) in sorted(missing):
    if not cur_lemma.strip():
        continue
    pos = found_pos.get(cur_lemma, "")
    code = ""
    ps.append(pos)
    if pos[-1].isnumeric():
        code = reverse_natures[pos[:-1]]+pos[-1]
        pos = pos[:-1]
    else:
        code = reverse_natures[pos[:3]]
    if lemma not in real_lemmas or dis not in real_lemmas[lemma]:
        real_lemmas[lemma][dis] = (code, "Auto.")

## Étape 6 : Génération du TSV Final

### 6.1 Lowercasing keys

In [11]:
real_lemmas = {
    lemma: value
    for lemma, value in real_lemmas.items()
}

### 6.2 Write !

In [12]:
parsed_lemmas = []

with open("LASLA-Lemmas/out.txt", "w") as f:
    f.write("{}\t{}\t{}\t{}\t{}\t{}\n".format("Lemma", "Mark", "Nature", "Description", "Formes communes", "Code"))
    natures = {
        "1": "NOM",
        "2": "ADJ",
        "3": "NUM",
        "4": "PRO",
        "5": "VER",
        "6": "ADV",
        "7": "PRE",
        "8": "CON",
        "9": "INJ",
        "N": "AUX"
    }
    for lemma in sorted(list(real_lemmas.keys())):
        values = real_lemmas[lemma]
        lemma = lemma.lower().replace("v", "u")
        for dis, (pos, texte) in values.items():
            cur_lemma = disambiguate(lemma, dis)
            written_lemma = cur_lemma
            if False not in [x[0].isupper() for x, _ in most_commons.get(cur_lemma, [("a", "")])]:
                written_lemma = cur_lemma.capitalize()
            f.write("{lemma}\t{mark}\t{pos}\t{desc}\t{forms}\t{code}\n".format(
                lemma=written_lemma,
                code=pos, 
                pos=natures[pos[0]], 
                desc=get_desc(texte.replace("\t", ""), pos),
                forms=", ".join([x for x, _ in most_commons.get(cur_lemma, [])[:2]]),
                mark=(
                    dict_forms.get(cur_lemma, "") or get_secondary_form(cur_lemma, pos)
                )
            ))

['abstinacis', 'abstiniis']
abstinacis


In [13]:
!cp LASLA-Lemmas/out.txt /home/thibault/dev/forcellini-lemmas/dictionnary.tsv

In [10]:
with open("LASLA-Lemmas/out.txt") as f:
    with open("LASLA-Lemmas/lemma_list.txt", "w") as out:
        added = []
        for lineno, line in enumerate(f):
            if lineno == 0:
                continue
            added.append(line.strip().split("\t")[0])
        added.extend(""">
{
~
_
]
-
/
+
=
<
"
@
}
$
.
%
&
^
!
(
,
)
[
|
;
`
?
#
:
\
*
'
Maria
Stilicho
Honorius
Cypris
Serena""".split("\n"))
        out.write("\n".join(sorted(list(set(added)))))