In [24]:
CORPUS_DIR = "/home/thibault/dev/latin-lemmatized-texts/lemmatized/xml/"

import glob
import lxml.etree as ET
from typing import List
import random
import tqdm

In [25]:
def get_forbidden_passages(directory: str = "/home/thibault/dev/these-corpus/data/*.xml"):
    """Collect the references and token text of already-known passages.

    Args:
        directory: Despite its name, a glob pattern (it is passed straight to
            ``glob.glob``) matching the XML files to read.

    Returns:
        A ``(refs, sentences)`` tuple:

        * ``refs`` is the set of ``"{urn}:{ref}"`` strings, built from the text
          of the first ``idno[@type='CTS_URN']`` and the first
          ``biblScope[@unit='ref']`` element of each file.
        * ``sentences`` is a single string holding each distinct passage text
          (the text of every ``w`` element whose ``pos`` is not ``PUNC``,
          joined by spaces), one passage per line. It is meant to be queried
          with ``candidate in sentences``.

    Raises:
        IndexError: if a file has no matching ``idno`` or ``biblScope`` element.
    """
    refs = set()
    passages = set()
    for file in tqdm.tqdm(glob.glob(directory)):
        xml = ET.parse(file)
        urn = xml.xpath("//idno[@type='CTS_URN']")[0].text
        ref = xml.xpath("//biblScope[@unit='ref']")[0].text
        refs.add(f"{urn}:{ref}")
        passages.add(" ".join(xml.xpath("//w[@pos!='PUNC']/text()")))
    # Passages are separated by a newline, not a space: with a space, a
    # substring lookup could match text straddling two unrelated passages, and
    # since set iteration order is arbitrary that spurious match would change
    # from one run to the next. Sorting keeps the string itself stable.
    return refs, "\n".join(sorted(passages))
known_refs, known_sentences = get_forbidden_passages()


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2516/2516 [00:00<00:00, 13337.13it/s]


In [26]:
def has_other_than_punc(sentence: List[ET.Element]) -> bool:
    """Tell whether ``sentence`` holds at least one non-punctuation token.

    A token counts as punctuation only when its ``pos`` attribute equals
    ``"PUNC"``. A token without a ``pos`` attribute is treated as a word, the
    same way ``get_simplified_sentence`` treats it, instead of raising
    ``KeyError``.

    Args:
        sentence: The tokens (``w`` elements) to inspect.

    Returns:
        ``True`` if any token is not punctuation, ``False`` otherwise
        (including for an empty sentence).
    """
    # any() stops at the first word found; no intermediate list is built.
    return any(tok.attrib.get("pos") != "PUNC" for tok in sentence)

def debug_print(sentence: List[ET.Element]):
    """Print the tokens of ``sentence`` on one line, separated by spaces.

    Surrounding whitespace is stripped from each token's text before joining.
    """
    stripped_words = []
    for token in sentence:
        stripped_words.append(token.text.strip())
    print(" ".join(stripped_words))

def get_urn(sent: List[ET.Element], file_id: str):
    """Build the reference string for ``sent`` within the text ``file_id``.

    The ``n`` attributes of the first and last tokens delimit the passage:
    ``"{file_id}:{n}"`` when both are equal, ``"{file_id}:{first}-{last}"``
    otherwise.

    Raises:
        IndexError: if ``sent`` is empty.
        KeyError: if the first or last token has no ``n`` attribute.
    """
    start, end = (tok.attrib["n"] for tok in (sent[0], sent[-1]))
    passage = start if start == end else f"{start}-{end}"
    return f"{file_id}:{passage}"
    
def get_simplified_sentence(sent: List[ET.Element]):
    """Return the text of ``sent`` with punctuation tokens removed.

    Tokens whose ``pos`` attribute is ``"PUNC"`` are dropped; every other
    token (including one with no ``pos`` attribute) contributes its text,
    and the texts are joined with single spaces.
    """
    words = []
    for token in sent:
        if token.attrib.get("pos") == "PUNC":
            continue
        words.append(token.text)
    return " ".join(words)
    
def write_sample(sent: List[ET.Element], file_id: str, idx: int):
    """Write one sentence as a negative-example XML fragment.

    The output goes to ``dataset/negative-examples/{prefix}--{idx}.xml``, where
    ``prefix`` is made of the colon-separated parts of ``file_id`` from the
    third one on, joined with underscores. The directory must already exist.

    Args:
        sent: Tokens (``w`` elements) of the sentence, in order.
        file_id: Identifier of the source text, written into the ``idno``
            element and used to build the ``source`` attribute via ``get_urn``.
        idx: Index distinguishing the samples taken from the same text.
    """
    prefix = "_".join(file_id.split(":")[2:])
    # Serialize before opening the file, so that a serialization error does
    # not leave a truncated, empty sample on disk. The TEI namespace
    # declaration is stripped from each serialized token.
    joined_w = "".join(
        ET.tostring(w, encoding=str).replace('xmlns="http://www.tei-c.org/ns/1.0"', "")
        for w in sent
    )

    # Explicit UTF-8: without it open() falls back to the locale's encoding,
    # which may be unable to represent non-ASCII corpus text.
    with open(f"dataset/negative-examples/{prefix}--{idx}.xml", "w", encoding="utf-8") as out:
        out.write(f"""<div type="fragment" ana="#negative-example">
    <bibl type="source">
        <idno type="CTS_URN">{file_id}</idno>
    </bibl>
    <quote xml:lang="lat" source="{get_urn(sent, file_id)}">
        {joined_w}
    </quote>
 </div>""")

# Number of corpus files processed so far (incremented once per file).
sample = 0
# Maximum number of negative examples written per corpus file.
KEEP = 30
# Sentences with fewer tokens than this (punctuation tokens included) are dropped.
MIN_TOKENS = 5
# Fixed seed so that re-running the cell draws the same negative examples.
SEED = 42
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}
# Lemmas of the punctuation tokens that close a sentence.
SENTENCE_FINAL = {"?", "!", "...", ".", ";", ")"}

# Dedicated generator: seeding it does not disturb the global `random` state.
rng = random.Random(SEED)

# Files are visited in sorted order so that the seeded draws line up with the
# same files on every run (glob order is not guaranteed).
for file in tqdm.tqdm(sorted(glob.glob(CORPUS_DIR+"*.xml"))):
    x = ET.parse(file)
    file_id = x.xpath("//tei:teiHeader/@n", namespaces=TEI_NS)[0]

    # Split the tokens of the file into sentences; the last list is the one
    # currently being filled.
    sentences = [[]]
    for w in x.xpath("//tei:w", namespaces=TEI_NS):
        if "greekLit" in file_id:
            # Workaround for the Vulgate (original author's note): these files
            # are segmented on changes of the `n` attribute, not on punctuation.
            if sentences[-1] and sentences[-1][-1].attrib["n"] != w.attrib["n"]:
                sentences.append([])
            sentences[-1].append(w)
        else:
            sentences[-1].append(w)

            # A sentence-final punctuation token closes the sentence, unless
            # everything collected so far is punctuation.
            if w.attrib["pos"] == "PUNC" \
                    and w.attrib["lemma"] in SENTENCE_FINAL \
                    and has_other_than_punc(sentences[-1]):
                sentences.append([])

    # Keep sentences of at least MIN_TOKENS tokens (punctuation counts).
    sentences = [s for s in sentences if len(s) >= MIN_TOKENS]

    # Shuffle, then draw from the end until KEEP samples are accepted or the
    # file runs out of candidates.
    rng.shuffle(sentences)

    kept_samples = []
    while len(kept_samples) < KEEP and sentences:
        sent = sentences.pop()
        urn = get_urn(sent, file_id)
        full_text = get_simplified_sentence(sent)
        # Reject candidates whose reference is already known or whose text
        # occurs inside the known passages.
        if urn in known_refs or full_text in known_sentences:
            continue
        kept_samples.append(sent)

    for idx, sent in enumerate(kept_samples):
        write_sample(sent, file_id, idx)
    sample += 1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 853/853 [01:00<00:00, 14.05it/s]
