# Génération du corpus en texte brut

## Constantes

In [12]:
# CSV file holding the stored checksums of source XMLs and generated texts.
CHECKSUM_PATH = "./checksum-xmls.csv"
# Glob pattern used to locate the corpus repositories handed to the resolver.
CORPORA_PATH = "../../data/raw/corpora/**/*"
# When True, the XSL is actually applied and the plain-text files are written;
# when False, the conversion loop only records source -> output paths.
WRITE_TEXT = True
# When True, every hashed source is regenerated, not only new or changed ones
# (mostly useful after the XSL itself has changed).
FORCE_REGENERATE = True
# When True, the checksum CSV is rebuilt from scratch instead of being updated.
FORCE_GENERATE_CSV_FROM_SCRATCH = True

import tqdm

## Lecture des dépôts

In [2]:
import glob
from MyCapytain.resolvers.cts.local import CtsCapitainsLocalResolver


# glob.glob() already returns a list, so no extra list() wrapper is needed.
# NOTE(review): recursive=False means "**" in CORPORA_PATH matches a single
# directory level (same as "*"); confirm this is the intended depth.
repositories = glob.glob(CORPORA_PATH, recursive=False)

# Resolver built over every repository directory found above.
resolver = CtsCapitainsLocalResolver(repositories)

../../data/raw/corpora/lascivaroma_additional-texts/lascivaroma_additional-texts/data/phi1351/phi005/phi1351.phi005.perseus-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng1.xml is not present
../../data/raw/corpora/lascivaroma_priapeia/lascivaroma_priapeia/data/phi1103/phi001/phi1103.phi001.lascivaroma-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0472/phi001/phi0472.phi001.perseus-eng4.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi001/phi0448.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0448/phi002/phi0448.phi002.perseus-eng2.xml is 

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0690/phi002/phi0690.phi002.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0690/phi003/phi0690.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1017/phi011/phi1017.phi011.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0660/phi003/phi0660.phi003.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1002/phi001/phi1002.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0914/phi001/phi0914.phi001.perseus-eng3.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0917/phi001/phi0917.phi

../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1056/phi001/phi1056.phi001.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi004/phi1351.phi004.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng2.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi001/phi1351.phi001.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi1351/phi002/phi1351.phi002.perseus-eng1.xml is not present
../../data/raw/corpora/PerseusDL_canonical-latinLit/PerseusDL_canonical-latinLit/data/phi0550/phi001/phi0550.phi001.perseus-eng1.xml is not present


## Création des dossiers

In [3]:
import os

# Create the directory that receives the generated plain-text files;
# exist_ok=True avoids an error when the directory already exists.
os.makedirs("./lemmatized/plain-text/", exist_ok=True)

## Ouverture de l'XSL

In [4]:
import lxml.etree as ET

# Load the XSLT stylesheet used to turn each passage into plain text.
# The file is opened in binary mode so that lxml decodes it according to the
# encoding declared in the XML itself, rather than Python decoding it first
# with the platform's default locale encoding.
with open("../../helpers/reader/passage.transform.xsl", "rb") as tr:
    xml = ET.parse(tr)

# Callable transformer: xsl(element) returns the transformed result tree.
xsl = ET.XSLT(xml)

## Vérification des changements de sources

This currently requires installing the following versions:
```shell
pip install --upgrade https://github.com/PonteIneptique/pie/archive/update-pie/1.6.0.zip#egg=nlp_pie 
pip install --upgrade https://github.com/hipster-philology/nlp-pie-taggers/archive/feature/hashlist.zip#egg=pie_extended
```

### Calcul des hashes

In [5]:
from pie_extended.utils.hashcheck import md5sum

# Absolute path of each Latin source XML -> MD5 checksum of that file.
xml_current_hashes = {}

for text in tqdm.tqdm(resolver.texts):
    # Only Latin texts are hashed.
    if text.lang != "lat":
        continue
    try:
        # len() raises when the text has no citation scheme (citation is None),
        # which keeps such texts out of the table; the failure is reported below.
        depth = len(text.citation)
        digest = md5sum(text.path)
        xml_current_hashes[os.path.abspath(text.path)] = digest
    except Exception as E:
        # Best effort: report the problem with the text identifier and move on.
        print(E, text.id)

100%|██████████| 769/769 [00:00<00:00, 2564.52it/s]

object of type 'NoneType' has no len() urn:cts:latinLit:phi0474.phi035.perseus-lat1





### Ouverture des checksums de source enregistrées

In [6]:
from pie_extended.utils.hashcheck import check_checksum_from_file, read_checksum_csv

# Checksum records stored by a previous run.
former_checksums = read_checksum_csv(CHECKSUM_PATH)
# Source paths recorded previously; records without a source are ignored.
former_sources = [infos.source for infos in former_checksums.values() if infos.source]
# Only the second returned value (the changed sources) is used here.
_, changed_sources = check_checksum_from_file(CHECKSUM_PATH)

# Membership is tested once per current source, so use a set for O(1) lookups
# instead of scanning the list each time. The list is kept for the count below.
# (Assumes sources are hashable path strings, as read from the CSV.)
known_sources = set(former_sources)
new_sources = [
    source
    for source in xml_current_hashes
    if source not in known_sources
]

print(f"{len(former_sources)} former source file")
print(f"{len(new_sources)} new source file detected")
print(f"{len(changed_sources)} changed source files detected")

0 former source file
652 new source file detected
0 changed source files detected


### Choix entre tout re-XSLiser ou seulement les éléments demandés

In [7]:
# Source paths to convert: every hashed source when FORCE_REGENERATE is set,
# otherwise only the sources that are new or whose checksum changed.
texts_to_generate = (
    list(xml_current_hashes)
    if FORCE_REGENERATE
    else new_sources + changed_sources
)

## Conversion des XML en plein texte

In [8]:
import tqdm

# Absolute path of each converted source XML -> absolute path of its text file.
sources_to_out = {}

# Set copy of the work list so the per-text membership test is O(1).
pending_sources = set(texts_to_generate)

for text in tqdm.tqdm(resolver.texts):
    # Only Latin texts are converted.
    if text.lang != "lat":
        continue
    try:
        source_path = os.path.abspath(text.path)
        # Skip texts that were not selected for (re)generation.
        if source_path not in pending_sources:
            continue

        # Number of entries in the citation scheme, used as the reference level.
        depth = len(text.citation)
        # Name of the last citation entry, used to label every reference.
        type_citation = [cite.name for cite in text.citation][-1]
        interactive_text = resolver.getTextualNode(textId=text.id)
        out_path = "./lemmatized/plain-text/{}.txt".format(text.id)

        if WRITE_TEXT:
            # Explicit UTF-8 so the output does not depend on the locale encoding.
            with open(out_path, "w", encoding="utf-8") as out:
                for ref in resolver.getReffs(textId=text.id, level=depth):
                    # References containing "index" are left out of the output.
                    if "index" not in str(ref):
                        psg = interactive_text.getTextualNode(subreference=ref)
                        out.write(
                            "[REF:{typ}.{cit}] {tex}\n".format(
                                typ=type_citation,
                                cit=ref,
                                tex=str(xsl(psg.xml)),
                            )
                        )

        # Register the text only once the conversion went through, so a failed
        # conversion is not counted (or checksummed later) as regenerated.
        sources_to_out[source_path] = os.path.abspath(out_path)
    except Exception as E:
        # Best effort: report the failing text and continue with the others.
        print(text, E)

print(f"{len(sources_to_out)} regenerated files")

100%|██████████| 769/769 [00:04<00:00, 188.97it/s]

652 regenerated files





## Compte des mots

In [9]:
# Total number of whitespace-separated tokens across the plain-text files.
cnt = 0
for txt_path in glob.glob("lemmatized/plain-text/*.txt"):
    # Files whose path contains "-pie" are not counted
    # (presumably lemmatizer output - TODO confirm).
    if "-pie" in txt_path:
        continue
    with open(txt_path) as f:
        tokens = f.read().split()
    cnt += len(tokens)
print(cnt)

12797120


## Génération ou mise à jour du fichier sources -> hash

In [11]:
import csv
# Rows of the checksum table, starting with the header line.
rows = [["input", "checksum", "source", "source_checksum"]]

if os.path.isfile(CHECKSUM_PATH) and not FORCE_GENERATE_CSV_FROM_SCRATCH:
    # Incremental update of the existing table.
    # Sources already present in the former table; tracked separately so that
    # sources_to_out is not mutated and the cell can be re-run safely.
    refreshed_sources = set()
    for text_file, checksuminfo in former_checksums.items():
        if checksuminfo.source in sources_to_out:
            # The source was regenerated: update the checksum of the source
            # but NOT the checksum of the text file.
            source_checksum = md5sum(checksuminfo.source)
            refreshed_sources.add(checksuminfo.source)
        else:
            # Untouched source: keep the stored line as is.
            source_checksum = checksuminfo.source_checksum
        rows.append([
            text_file,
            checksuminfo.checksum,
            checksuminfo.source,
            source_checksum,
        ])
    # Finally, add the sources that were not in the former table.
    for source_file, output_file in sources_to_out.items():
        if source_file not in refreshed_sources:
            rows.append([
                output_file,
                "never_lemmatized",
                source_file,
                md5sum(source_file),
            ])
else:
    # From scratch: one line per regenerated source, with fresh checksums of
    # both the plain-text file and its source.
    rows.extend(
        [plaintext, md5sum(plaintext), source, md5sum(source)]
        for source, plaintext in sources_to_out.items()
    )

# newline="" is required by the csv module so that it controls line endings
# itself (otherwise "\r\r\n" is written on platforms that translate "\n").
with open(CHECKSUM_PATH, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(rows)

In [None]:
# Recursively (-r) archive the plain-text directory into lemmatized/plain-text.zip
!zip lemmatized/plain-text.zip lemmatized/plain-text -r