# Constantes

In [1]:
PLAIN_TEXT_FILES_PATH = "lemmatized/plain-text/*.txt"
CHECKSUM_PATH = "./checksum-xmls.csv"
FORCE_RELEMMATIZE = False # Force relemmatization
DOWNLOAD_MODELS = False # Update the model files

# Nettoyage et upgrade

Potentiellement besoin de :

```shell
!pip install https://github.com/PonteIneptique/pie/archive/improvement/AttentionDecoder.predict_max.zip#egg=nlp_pie --upgrade
!pip install --upgrade https://github.com/hipster-philology/nlp-pie-taggers/archive/feature/hashlist.zip#egg
```

In [2]:
!rm ./lemmatized/plain-text/*-pie*.txt
#!pip install --upgrade pie-extended
#!pip install --upgrade https://github.com/PonteIneptique/pie/archive/improvement/AttentionDecoder.predict_max.zip#egg=nlp_pie

rm: cannot remove './lemmatized/plain-text/*-pie*.txt': No such file or directory


In [3]:
#pip install --upgrade --no-cache https://github.com/PonteIneptique/pie/archive/torch/upgrade-fix.zip#egg=nlp_pie

# Récupération des textes à lemmatiser

## Lecture du fichier des checksum

In [4]:
from hash_compute import md5sum, check_checksum_from_file, read_checksum_csv
import os.path
import glob

target_files = glob.glob(PLAIN_TEXT_FILES_PATH)
existing_files = {
    os.path.abspath(target): md5sum(target)
    for target in target_files
}

former_checksums = read_checksum_csv(CHECKSUM_PATH)
plaintext_former = [
    os.path.abspath(element)
    for element in former_checksums.keys()
]
#print(former_checksums)
new_input = [
    target
    for target in existing_files
    if target not in plaintext_former
]
changed_input, _ = check_checksum_from_file(CHECKSUM_PATH)


print(f"{len(former_checksums)} former source file")
print(f"{len(target_files)} target files")
print(f"{len(changed_input)} changed input files detected")
print(f"{len(new_input)} new input files detected")

746 former source file
746 target files
3 changed input files detected
0 new input files detected


## Génération de la liste des fichiers à produire

In [5]:
texts_to_lemmatize = []

if FORCE_RELEMMATIZE:
    texts_to_lemmatize = list(existing_files.keys())
else:
    texts_to_lemmatize = new_input + changed_input
    
print(f"{len(texts_to_lemmatize)} files to lemmatize")

3 files to lemmatize


# Lemmatisation

## Initialisation du tagger

In [6]:
from typing import List
from pie_extended.cli.utils import get_tagger, get_model, download

if DOWNLOAD_MODELS:
    for dl in download("lasla"):
        x = 1

from pie_extended.models.lasla import VERSION
print(f"LASLA Version {VERSION}")
# model_path allows you to override the model loaded by another .tar
model_name = "lasla"
tagger = get_tagger(model_name, batch_size=64, device="cuda", model_path=None)

LASLA Version 0.0.5b


## Taggage

In [7]:
# Get the main object from the model (: data iterator + postprocesor
from pie_extended.models.lasla.imports import get_iterator_and_processor
import glob
import tqdm

for file in tqdm.tqdm(texts_to_lemmatize):
    try:
        if "-pie" not in file:
            iterator, processor = get_iterator_and_processor(max_tokens=64)
            tagger.tag_file(file, iterator=iterator, processor=processor)
    except Exception as E:
        print(file)
        print(E)
        raise E

100%|██████████| 3/3 [03:00<00:00, 60.25s/it] 


## Debug

In [8]:
# Get the main object from the model (: data iterator + postprocesor
from pie_extended.models.lasla.imports import get_iterator_and_processor

iterator, processor = get_iterator_and_processor()
file = "lemmatized/plain-text/urn:cts:latinLit:stoa0275.stoa006.opp-lat1.txt"
#with open(file) as f:
#    print(f.read())

#tagger.tag_file(file, iterator=iterator, processor=processor)


## Déplacement des fichiers dans le dossier TSV

In [9]:
!mkdir -p lemmatized/tsv
!mv lemmatized/plain-text/*-pie.txt lemmatized/tsv/

# Update de l'index des fichiers à lemmatiser

In [10]:
## ToDo When there is a new update of the corpus
from hash_compute import write_csv_checksums

rows, modified = write_csv_checksums(CHECKSUM_PATH, texts_to_lemmatize, _write=True)

print("Modified files")
for mod in modified:
    print(modified)

Modified files
[ModifiedFiles(filename='/home/thibault/dev/these/notebooks/Data Preparation - Corpora/lemmatized/plain-text/urn:cts:latinLit:phi2349.phi005.perseus-lat1.txt', is_source=False, checksum='e9351896b85ff1d91c07c1f1fdf76681'), ModifiedFiles(filename='/home/thibault/dev/these/notebooks/Data Preparation - Corpora/lemmatized/plain-text/urn:cts:latinLit:phi2349.phi007.perseus-lat1.txt', is_source=False, checksum='7916efcf4414fbd6dd4f0d9d3612d44d'), ModifiedFiles(filename='/home/thibault/dev/these/notebooks/Data Preparation - Corpora/lemmatized/plain-text/urn:cts:latinLit:phi2349.phi006.perseus-lat1.txt', is_source=False, checksum='214c6a429c5666d532132a8be46d4629')]
[ModifiedFiles(filename='/home/thibault/dev/these/notebooks/Data Preparation - Corpora/lemmatized/plain-text/urn:cts:latinLit:phi2349.phi005.perseus-lat1.txt', is_source=False, checksum='e9351896b85ff1d91c07c1f1fdf76681'), ModifiedFiles(filename='/home/thibault/dev/these/notebooks/Data Preparation - Corpora/lemmatize

## Liste des fichiers à re-xmliser

In [11]:

with open("new_xml.txt", "w") as f:
    for mod in modified:
        f.write(f"{os.path.relpath(mod.filename.replace('.txt', '-pie.txt').replace('plain-text', 'tsv'))}\n")