In [35]:
CORPUS_DIR = "/home/thibault/dev/latin-lemmatized-texts/lemmatized/xml/"

import glob
import lxml.etree as ET
from typing import List
import random
import tqdm

In [49]:
def has_other_than_punc(sentence: List[ET.Element]) -> bool:
    """Tell whether a sentence holds at least one non-punctuation token.

    Args:
        sentence: Token elements; each inspected element must carry a
            ``pos`` attribute.

    Returns:
        ``True`` if some token has a ``pos`` other than ``"PUNC"``,
        ``False`` otherwise (including for an empty sentence).

    Raises:
        KeyError: If an inspected token has no ``pos`` attribute.
    """
    # any() stops at the first match instead of building a throwaway list.
    return any(tok.attrib["pos"] != "PUNC" for tok in sentence)

def debug_print(sentence: List[ET.Element]):
    """Print the stripped text of every token on one space-separated line.

    Args:
        sentence: Token elements whose ``text`` is printed.

    An element without text (``text is None``) is printed as an empty
    string instead of raising ``AttributeError``.
    """
    print(" ".join(
        # ``text`` is None for an element with no text content.
        (w.text or "").strip()
        for w in sentence
    ))

def write_sample(sent: List[ET.Element], file_id: str, idx: int):
    """Write one sentence to disk as a TEI ``#negative-example`` fragment.

    The output path is ``negative-examples/{prefix}--{idx}.xml``, relative to
    the current working directory. ``prefix`` is made of the ``:``-separated
    parts of ``file_id`` from the third one onwards, joined with ``_``.

    Args:
        sent: Token elements of the sentence, in order. The ``n`` attribute
            of the first and last element is used to build the ``source``
            reference range.
        file_id: Identifier of the source text; written into ``<idno>`` and
            the ``source`` attribute of ``<quote>``.
        idx: Index used to distinguish the samples taken from one file.

    Raises:
        IndexError: If ``sent`` is empty.
        KeyError: If the first or last element has no ``n`` attribute.
    """
    # Local import so this block does not depend on the file's import cell.
    import os

    prefix = "_".join(file_id.split(":")[2:])

    # Everything that can fail is computed before the file is opened, so a
    # bad sentence no longer leaves an empty file behind.
    ref1, ref2 = sent[0].attrib["n"], sent[-1].attrib["n"]
    # Each token is serialized on its own, so it carries its own namespace
    # declaration; that declaration is removed from the serialized text.
    joined_w = "".join([
        ET.tostring(w, encoding=str).replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
        for w in sent
    ])

    os.makedirs("negative-examples", exist_ok=True)
    # Explicit UTF-8: the platform default encoding may not cover the corpus.
    with open(f"negative-examples/{prefix}--{idx}.xml", "w", encoding="utf-8") as out:
        out.write(f"""<div type="fragment" ana="#negative-example">
    <bibl type="source">
        <idno type="CTS_URN">{file_id}</idno>
    </bibl>
    <quote xml:lang="lat" source="{file_id}:{ref1}-{ref2}">
        {joined_w}
    </quote>
 </div>""")

# Namespace map shared by every XPath query below.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}
# Punctuation lemmas that close a sentence.
SENTENCE_ENDERS = {"?", "!", "...", ".", ";", ")"}

# Counts the corpus files processed so far.
sample = 0
for file in tqdm.tqdm(glob.glob(CORPUS_DIR + "*.xml")):
    x = ET.parse(file)
    sentences = [[]]
    file_id = x.xpath("//tei:teiHeader/@n", namespaces=TEI_NS)[0]

    # Files whose id contains "greekLit" are split on changes of the "n"
    # attribute instead of on punctuation (original note: "Work Around for
    # Vulgate" -- presumably the Vulgate is filed under greekLit; confirm).
    split_on_ref = "greekLit" in file_id

    for w in x.xpath("//tei:w", namespaces=TEI_NS):
        current = sentences[-1]

        if split_on_ref:
            # A new "n" value starts a new sentence.
            if current and current[-1].attrib["n"] != w.attrib["n"]:
                current = []
                sentences.append(current)
            current.append(w)
            continue

        current.append(w)
        is_terminator = (
            w.attrib["pos"] == "PUNC"
            and w.attrib["lemma"] in SENTENCE_ENDERS
        )
        # Only close the sentence once it holds something besides
        # punctuation, so runs of punctuation stay attached to it.
        if is_terminator and has_other_than_punc(current):
            sentences.append([])

    # Keep sentences of at least 5 tokens (punctuation tokens included).
    sentences = [tokens for tokens in sentences if len(tokens) >= 5]

    # Shuffle, then write out at most the first 5 sentences of this file.
    random.shuffle(sentences)
    for idx, sent in enumerate(sentences[:5]):
        write_sample(sent, file_id, idx)

    sample += 1

100%|██████████| 853/853 [01:18<00:00, 10.85it/s]


Multifariam multisque {multisque} modis ago gratias , Fortunate mihi in Domino carissime , diuinae benignitati , quod et tuam pietatem excitauit ad exstimulandum officium meum , ut unico libello commilitonibus nostris materiam subministrarem , unde sibi gratia spiritus opitulante munimenta telaque {telaque} fabricarentur aduersus persecutionum incursus , et mihi quod petebas conanti non dedignata est adesse , et quod utriusque nostrum operam non passa est esse sterilem , hoc est quod mei Fortunati pium desiderium bene fortunauit , et ministri sui officium uoluit esse gregis sui beneficium .
scribis enim ex eo libro multum roboris et alacritatis accessisse strenuis , timidioribus autem et infirmis tantum animorum esse additum , ut ex his quoque per Dei gratiam certam uictoriam nobis promittere debeamus .
eoque {eoque} libentius obtemperaui posterioribus tuis litteris , quibus orat tua caritas , ut quoniam hoc auspiciis Domini nostri Iesu Christi bene successit , addam uolumen alterum de