# Constantes

In [1]:
# CSV of previously recorded checksums; loaded later with read_checksum_csv().
CHECKSUM_PATH = "./checksum-xmls.csv"
# XSL stylesheet whose MD5 checksum is computed later and written into each generated TEI header.
XSL = "../../helpers/reader/passage.transform.xsl"

# Récupération des éléments du corpus

In [2]:
import glob
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver

# Collect the entries located two levels below the corpora directory.
# With recursive=False, "**" behaves like a plain "*" wildcard.
corpora_pattern = "../../data/raw/corpora/**/*"
repositories = glob.glob(corpora_pattern, recursive=False)

# Hand the collected paths to the local CTS resolver used by the generation cell.
resolver = CtsCapitainsLocalResolver(repositories)

../../data/raw/corpora/lascivaroma_additional-texts/lascivaroma_additional-texts/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng4.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi001/phi0448.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi002/phi0448.phi002.perseus-eng2.xml is 

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0690/phi003/phi0690.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1017/phi011/phi1017.phi011.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0660/phi003/phi0660.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1002/phi001/phi1002.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0914/phi001/phi0914.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0917/phi001/phi0917.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0119/phi004/phi0119.phi

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi004/phi1351.phi004.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi002/phi1351.phi002.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0550/phi001/phi0550.phi001.perseus-eng1.xml is not present


# Génération des fichiers XML

## Nettoyage

In [3]:
!rm ./lemmatized/xml/*.xml
!rm ./lemmatized/*.xml

rm: cannot remove './lemmatized/*.xml': No such file or directory


## Récupération des données des dépôts

In [4]:
# Map each repository name ("Name" column) to its version ("Version" column),
# read from the semicolon-separated corpora listing. The first non-blank row
# holds the column names.
REPOS = {}
with open("../../data/raw/corpora.csv") as f:
    headers = None
    for raw_line in f:
        fields = raw_line.strip().split(";")
        if fields == [""]:
            # Blank line (e.g. stray newline at the end of the file): it would
            # otherwise produce a row without a "Version" key and raise KeyError.
            continue
        if headers is None:
            headers = fields
            continue
        row = dict(zip(headers, fields))
        REPOS[row["Name"]] = row["Version"]

In [5]:
## Chargement des checksums
from hash_compute import md5sum, check_checksum_from_file, read_checksum_csv
import os.path

# Checksums recorded by a previous run, and the checksum of the current stylesheet.
former_checksums = read_checksum_csv(CHECKSUM_PATH)
xsl_checksum = md5sum(XSL)

# Corpus directory name -> "owner/repository" identifier.
repositories_map = {
    "OpenGreekAndLatin_Latin": "OpenGreekAndLatin/Latin",
    "OpenGreekAndLatin_csel-dev": "OpenGreekAndLatin/csel-dev",
    "lascivaroma_additional-texts": "lascivaroma/additional-texts",
    "lascivaroma_priapeia": "lascivaroma/priapeia",
    "PerseusDL_canonical-latinLit": "PerseusDL/canonical-latinLit",
    "ponteineptique_digiliblt": "ponteineptique/digiliblt"
}


def _provenance(info):
    """Build the provenance record (checksums, repository, version) of one checksum entry."""
    # The first path component after the corpora root is the corpus directory name.
    relative_source = info.source.replace("/home/thibault/dev/these/data/raw/corpora/", "")
    repo_name = repositories_map[relative_source.split("/")[0]]
    return {
        "plaintext_checksum": info.checksum,
        "source_checksum": info.source_checksum,
        "repo": repo_name,
        "version": REPOS[repo_name],
    }


# Keyed by the plaintext file name with its last four characters removed;
# only plaintext files that still exist on disk are kept.
file_to_urn = {
    plaintext.split("/")[-1][:-4]: _provenance(info)
    for plaintext, info in former_checksums.items()
    if os.path.exists(plaintext)
}

## Génération

In [6]:
import regex as re
import os.path
import glob
import tqdm
import lxml.etree as etree



from xml.sax.saxutils import escape  # stdlib: XML-escape metadata injected into the template

# Skeleton of every generated document. The placeholders are filled with
# str.format(), so every value must be XML-escaped before being inserted.
TEI = """<TEI xmlns="http://www.tei-c.org/ns/1.0">
    <teiHeader n="{tid}">
    
        <fileDesc>
            <titleStmt>
                <title>{title}</title>
                <author>{author}</author>
            </titleStmt>
            <publicationStmt>
                <publisher>
                    <persName>Thibault Clérice</persName>
                </publisher>
            </publicationStmt>
            <sourceDesc>
                <bibl>
                    <idno>{tid}</idno>
                    <link target="{uri}"/>
                    <dim source="xml" type="md5-checksum">{checksum_source}</dim>
                    <dim source="plaintext-transformation" type="md5-checksum">{checksum_text}</dim>
                    <dim source="xsl" type="md5-checksum">{checksum_xsl}</dim>
                </bibl>
            </sourceDesc>
        </fileDesc>
    </teiHeader>
    <text n="{tid}">
        <body>
            
        </body>
    </text>
</TEI>
"""
# NOTE(review): this pattern is compiled but not used anywhere in this cell.
reference = re.compile(r"(\[REF:[A-Za-z0-9\.]+\])")

TEI_NS = "http://www.tei-c.org/ns/1.0"
# Extra entity needed when an escaped value lands inside a double-quoted attribute.
_ATTRIBUTE_ENTITIES = {'"': "&quot;"}


def _open_segment(body, section, n):
    """Append a new, empty <ab> segment to ``body`` and return it.

    ``section`` becomes the ``type`` attribute and ``n`` the ``n`` attribute.
    The text/tail whitespace only drives the indentation of the serialized file.
    """
    segment = etree.Element(f"{{{TEI_NS}}}ab", type=section, n=n)
    segment.tail = "\n            "
    segment.text = "\n                "
    body.append(segment)
    return segment


# FORCE_XML=True converts every tagged file; otherwise only the files listed
# (whitespace-separated) in new_xml.txt are converted.
FORCE_XML = False
if FORCE_XML:
    tsvs = glob.glob("./lemmatized/tsv/*-pie.txt")
else:
    with open("new_xml.txt") as f:
        tsvs = [x for x in f.read().split() if x]

print(f"{len(tsvs)} file to XML")

for file in tqdm.tqdm(tsvs):
    # Explicit UTF-8: the serialized XML carries no encoding declaration, so
    # both the input and the output must not depend on the locale encoding.
    with open(file, encoding="utf-8") as read:
        xml = file.replace("-pie.txt", ".xml")
        if xml == file:
            # The input name does not end in "-pie.txt": stop rather than
            # overwrite the input file with its own XML.
            print(xml, file)
            break
        text_id = os.path.basename(file)[:-len("-pie.txt")]
        if text_id not in file_to_urn:
            print(f"Ignoring {text_id}")
            continue
        metadata = resolver.getMetadata(text_id)
        title = metadata.parent.get_label()
        author = metadata.parent.parent.get_label()
        version_info = file_to_urn[text_id]

        uri = "https://github.com/{name}/archive/{version}.zip".format(
            name=version_info["repo"], version=version_info["version"]
        )
        # Escape free-text metadata: a "&" or "<" in a title or author name
        # would otherwise make the formatted template unparsable.
        root = etree.fromstring(TEI.format(
            tid=escape(str(text_id), _ATTRIBUTE_ENTITIES),
            title=escape(str(title)),
            author=escape(str(author)),
            checksum_source=version_info["source_checksum"],
            checksum_text=version_info["plaintext_checksum"],
            checksum_xsl=xsl_checksum,
            uri=escape(uri, _ATTRIBUTE_ENTITIES),
        ))

        body = root.xpath("//t:body", namespaces={"t": TEI_NS})[0]
        last_seg = None
        for lineno, line in enumerate(read):
            line = line.strip().split("\t")
            if lineno == 0:
                # First row of the TSV holds the column names.
                headers = line
                continue
            if line == [""]:
                # Blank row: no token to record (would raise KeyError below).
                continue
            line = dict(zip(headers, line))

            # Deal with Segments
            if line["token"].startswith("[REF:"):
                # "[REF:<section>.<id parts...>]" opens a new segment.
                section, *sid = line["token"][len("[REF:"):-1].split(".")
                sid = ".".join(sid)
                last_seg = _open_segment(body, section, f"{text_id}:{sid}")
                w_id = 1
                continue
            elif line["token"] in {"<", ">", "&"}:
                # Bare markup characters are dropped from the output.
                continue
            elif last_seg is None:
                # Token found before any [REF:...] marker: open a placeholder segment.
                section = "auto-fill-type"
                sid = "auto-fill-sid"
                last_seg = _open_segment(body, section, f"{text_id}:unregistered")
                w_id = 1

            new_word = etree.Element(
                f"{{{TEI_NS}}}w",
                rend=section,
                n=sid,
                pos=line["pos"],
                msd=line["morph"],
                lemma=line["lemma"],
            )
            new_word.text = line["token"]
            new_word.tail = "\n                "
            last_seg.append(new_word)
            # NOTE(review): w_id is counted per segment but never written to the XML.
            w_id += 1

        with open(xml, "w", encoding="utf-8") as f:
            f.write(etree.tostring(root, encoding=str, pretty_print=True))
            
            

  0%|          | 0/3 [00:00<?, ?it/s]

3 file to XML


100%|██████████| 3/3 [00:04<00:00,  1.44s/it]


## Déplacement des nouveaux fichiers.

In [7]:
# Make sure the destination directory exists (-p: no error if it already does).
!mkdir -p lemmatized/xml
# Move the XML files found next to the TSV files into the destination directory
# (the generation cell writes each XML file alongside its "-pie.txt" input).
!mv lemmatized/tsv/*.xml lemmatized/xml/

# Zip pour partage des données

In [8]:
# Add the XML files to the archive named "data".
# NOTE(review): zip updates an existing archive in place, so entries added by
# earlier runs are presumably kept — confirm this is the intended behaviour.
!zip data lemmatized/xml/*.xml

  adding: lemmatized/xml/urn:cts:latinLit:phi2349.phi005.perseus-lat1.xml (deflated 92%)
  adding: lemmatized/xml/urn:cts:latinLit:phi2349.phi006.perseus-lat1.xml (deflated 92%)
  adding: lemmatized/xml/urn:cts:latinLit:phi2349.phi007.perseus-lat1.xml (deflated 92%)


In [9]:
!cp lemmatized/xml/* /home/thibault/dev/latin-lemmatized-texts/lemmatized/xml/
!cp lemmatized/xml/* /home/thibault/dev/blacklab-docker/corpora/latin-corpus/
!cd /home/thibault/dev/blacklab-docker/ && docker build -t dockerlab . && cd ~/dev/these/

Sending build context to Docker daemon  2.009GB
Step 1/14 : FROM tomcat:latest
 ---> f796d3d2c195
Step 2/14 : MAINTAINER Thibault Clérice <thibault.clerice@chartes.psl.eu>
 ---> Using cache
 ---> c3060f2949ee
Step 3/14 : ENV AS_VERSION 2.1.0
 ---> Using cache
 ---> 8db6b86464cd
Step 4/14 : ENV BL_VERSION 2.1.0
 ---> Using cache
 ---> 180e55cdedf4
Step 5/14 : ENV BLACKLAB_CONFIG_DIR /etc/blacklab
 ---> Using cache
 ---> 3b1a21457342
Step 6/14 : WORKDIR /jars/blacklab
 ---> Using cache
 ---> f7bb86af9500
Step 7/14 : ADD blacklab /etc/blacklab
 ---> Using cache
 ---> 963a7aaff990
Step 8/14 : ADD blacklab-server-${BL_VERSION}.war ${CATALINA_HOME}/webapps/blacklab-server.war
 ---> Using cache
 ---> a1b9d69b11da
Step 9/14 : ADD corpus-frontend-${AS_VERSION}.war ${CATALINA_HOME}/webapps/corpus-frontend.war
 ---> Using cache
 ---> c70a9003d64d
Step 10/14 : RUN ls /etc/blacklab
 ---> Using cache
 ---> 0ff2eb99751d
Step 11/14 : RUN mkdir -p /data/blacklab/indexes && mkdir -p /jars/blacklab
 --->