# Création d'un corpus équivalent au corpus Perseus Treebank


## Décompte des tokens par oeuvres

In [97]:
from collections import Counter

# Per source text: number of sentences ("chunks") and number of token lines.
chunks = Counter()
tokens = Counter()

# Comment line that opens every sentence; the text identifier is the part of
# the sentence id that precedes the "@".
SENT_ID_PREFIX = "# sent_id = "

# The encoding is given explicitly so that the counts do not depend on the
# locale of the machine running the notebook (CoNLL-U files are UTF-8).
with open("UD_Perseus_train_2.1.txt", encoding="utf-8") as f:
    current_text = None
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines only separate sentences.
            continue
        if line.startswith(SENT_ID_PREFIX):
            text_id = line[len(SENT_ID_PREFIX):].split("@")[0]
            chunks[text_id] += 1
            current_text = text_id
        elif line[0].isnumeric():
            # Any line starting with a digit is counted as one token of the
            # text the current sentence belongs to.
            # NOTE(review): this would also count multiword-token ranges
            # ("1-2") if the file contains any -- confirm against the data.
            tokens[current_text] += 1

print(list(tokens.keys()))

['phi0975.phi001.perseus-lat1.tb.xml', 'phi1221.phi007.perseus-lat1.tb.xml', 'phi1348.abo012.perseus-lat1.tb.xml', 'phi1351.phi005.perseus-lat1.tb.xml', 'tlg0031.tlg027.perseus-lat1.tb.xml', 'phi0448.phi001.perseus-lat1.tb.xml', 'phi0474.phi013.perseus-lat1.tb.xml', 'phi0620.phi001.perseus-lat1.tb.xml', 'phi0631.phi001.perseus-lat1.tb.xml', 'phi0690.phi003.perseus-lat1.tb.xml']


Sur l'ensemble des textes trouvés dans le fichier d'entraînement de Universal Dependencies, 4 oeuvres ne sont pas disponibles dans le corpus du LASLA:

1. Phèdre, Fables
2. Auguste, Res Gestae
3. Suétone, Vie d'Auguste
4. Vulgate

2 oeuvres supplémentaires sont citées sur le dépôt source sans être trouvées dans le fichier:

1. Ovide, Fastes
2. Pétrone, Satyricon

Nous remplaçons les Métamorphoses par les Fastes.

In [98]:
# Align every Perseus treebank document with the LASLA files of the same work.
# Note that Petronius (phi0972) and Ovid (0959) are mentioned in the source
# repository but are not found in the training file:
# https://github.com/PerseusDL/treebank_data/tree/7100a6b86826e121c6205182429ee670db64a392/v2.1/Latin
#
# One entry per work: (Perseus document id, display title, LASLA file names).
# An empty file list marks a work that is not available in LASLA.
_WORKS = [
    ('phi0975.phi001.perseus-lat1.tb.xml', "Phèdre, Fables", []),
    ('phi1221.phi007.perseus-lat1.tb.xml', "Auguste, Res Gestae", []),
    ('phi1348.abo012.perseus-lat1.tb.xml', "Suétone, Vie d'Auguste", []),
    (
        'phi1351.phi005.perseus-lat1.tb.xml',
        "Tacite, Histoires",
        ["Tacitus_TacHistoriae_TacHist{}.tsv".format(book) for book in range(1, 6)],
    ),
    ('tlg0031.tlg027.perseus-lat1.tb.xml', "Vulgate", []),
    (
        'phi0448.phi001.perseus-lat1.tb.xml',
        "Caesar",
        ["Caesar_BellumGallicum_CaesBG{}.tsv".format(book) for book in range(1, 8)],
    ),
    (
        'phi0474.phi013.perseus-lat1.tb.xml',
        "Cicero, In Catilinam",
        ["Cicero_InCatilinam_CicCat{}.tsv".format(speech) for speech in range(1, 5)],
    ),
    (
        'phi0620.phi001.perseus-lat1.tb.xml',
        "Properce",
        ["Propertius_PropertiusElegiae_Propert{}.tsv".format(book) for book in range(1, 5)],
    ),
    (
        'phi0631.phi001.perseus-lat1.tb.xml',
        "Salluste, Catilina",
        ["Sallustius_Catilina_SalCatil.tsv"],
    ),
    (
        'phi0690.phi003.perseus-lat1.tb.xml',
        "Virgile, Énéide",
        # Book numbers are zero-padded to two digits in the file names.
        ["Vergilius_Aeneis_VerAen{:02d}.tsv".format(book) for book in range(1, 13)],
    ),
]

# Perseus document id -> list of LASLA files (possibly empty).
maps = {doc_id: lasla_files for doc_id, _, lasla_files in _WORKS}

# Perseus document id -> human-readable title used in the tables.
titles = {doc_id: label for doc_id, label, _ in _WORKS}

In [100]:
# Print table
from IPython.display import HTML, display
import tabulate

# The first row holds the column labels; one row per work follows.
rows = [["Title", "Chunks", "Tokens"]]

# Works are listed alphabetically by display title (which starts with the author).
titles_rows = sorted(titles.items(), key=lambda item: item[1])

for key, title in titles_rows:
    rows.append([title, chunks[key], tokens[key]])

# Grand totals over all listed works: [chunks, tokens].
totals = [
    sum(chunks[doc_id] for doc_id, _ in titles_rows),
    sum(tokens[doc_id] for doc_id, _ in titles_rows),
]
rows.append(["Total"] + totals)

# Rich HTML rendering for the notebook ...
display(HTML(tabulate.tabulate(rows, tablefmt='html')))

# ... and the LaTeX source of the same table.
print(tabulate.tabulate(rows, tablefmt='latex'))

0,1,2
Title,Chunks,Tokens
"Auguste, Res Gestae",38,708
Caesar,24,352
"Cicero, In Catilinam",137,1897
"Phèdre, Fables",233,2397
Properce,224,2776
"Salluste, Catilina",336,4999
"Suétone, Vie d'Auguste",109,2046
"Tacite, Histoires",64,866
"Virgile, Énéide",15,142


\begin{tabular}{lll}
\hline
 Title                  & Chunks & Tokens \\
 Auguste, Res Gestae    & 38     & 708    \\
 Caesar                 & 24     & 352    \\
 Cicero, In Catilinam   & 137    & 1897   \\
 Phèdre, Fables         & 233    & 2397   \\
 Properce               & 224    & 2776   \\
 Salluste, Catilina     & 336    & 4999   \\
 Suétone, Vie d'Auguste & 109    & 2046   \\
 Tacite, Histoires      & 64     & 866    \\
 Virgile, Énéide        & 15     & 142    \\
 Vulgate                & 154    & 2001   \\
 Total                  & 1334   & 18184  \\
\hline
\end{tabular}


In [101]:
import math

# Substitute files, used for every work whose mapping is empty.
replacements = ["Petronius_PetroniusSatiricon_PetronSa.tsv"] + [
    "Ovidius_Fasti_OvFasti{}.tsv".format(book) for book in range(1, 4)
]

# LASLA file name -> number of chunks to take from it.
use = Counter()

for key, val in maps.items():
    # Fall back on the replacement files when the work has no mapping.
    targets = val or replacements
    # Spread the chunks of the work evenly over its files, rounding up.
    share = math.ceil(chunks[key] / len(targets))
    for file in targets:
        use[file] += share


print(tabulate.tabulate(list(use.items()), tablefmt='latex'))

\begin{tabular}{lr}
\hline
 Petronius\_PetroniusSatiricon\_PetronSa.tsv & 136 \\
 Ovidius\_Fasti\_OvFasti1.tsv                & 136 \\
 Ovidius\_Fasti\_OvFasti2.tsv                & 136 \\
 Ovidius\_Fasti\_OvFasti3.tsv                & 136 \\
 Tacitus\_TacHistoriae\_TacHist1.tsv         &  13 \\
 Tacitus\_TacHistoriae\_TacHist2.tsv         &  13 \\
 Tacitus\_TacHistoriae\_TacHist3.tsv         &  13 \\
 Tacitus\_TacHistoriae\_TacHist4.tsv         &  13 \\
 Tacitus\_TacHistoriae\_TacHist5.tsv         &  13 \\
 Caesar\_BellumGallicum\_CaesBG1.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG2.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG3.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG4.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG5.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG6.tsv         &   4 \\
 Caesar\_BellumGallicum\_CaesBG7.tsv         &   4 \\
 Cicero\_InCatilinam\_CicCat1.tsv            &  35 \\
 Cicero\_InCatilinam\_CicCat2.tsv            &  35 \\
 

In [102]:
import random

# Seed of the shuffle that decides which sentences go to which split, so that
# re-running the notebook produces the same train/dev/test files.
SPLIT_SEED = 42

# Directory holding both the per-book input files (in "train/") and the
# three output files written by this cell.
LASLA_DIR = "../../../LASLA/mood-tense-voice/"


def _read_sentences(path):
    """Read one blank-line-separated TSV file.

    Returns a tuple ``(header_line, header, sentences)`` where ``header_line``
    is the stripped first line (``None`` for an empty file), ``header`` the
    list of column names obtained by splitting it on whitespace, and
    ``sentences`` a list of sentences, each a list of ``{column: value}``
    dicts (one per token line).
    """
    header_line = None
    header = []
    sentences = []
    current = []
    with open(path) as f:
        for line_no, line in enumerate(f):
            line = line.strip()
            if line_no == 0:
                header_line = line
                header = line.split()
                continue
            if not line:
                # A blank line closes the sentence being read, if any.
                if current:
                    sentences.append(current)
                    current = []
                continue
            current.append(dict(zip(header, line.split())))
    # Keep the last sentence even when the file does not end with a blank line.
    if current:
        sentences.append(current)
    return header_line, header, sentences


def _write_sentences(out, sentences, header, used, file):
    """Write ``sentences`` to ``out`` and record token/chunk counts for ``file`` in ``used``."""
    for sentence in sentences:
        for tok in sentence:
            out.write("\t".join([tok[head] for head in header])+"\n")
            used["tokens"][file] += 1
        out.write("\n\n")
        used["chunks"][file] += 1


# Per split: number of tokens and chunks actually written, keyed by input file.
train_used = {"tokens": Counter(), "chunks": Counter()}
test_used = {"tokens": Counter(), "chunks": Counter()}
dev_used = {"tokens": Counter(), "chunks": Counter()}
# Not used in this cell; kept in case other cells rely on these names.
files = {}
factor = 1

# Dedicated generator: reproducible without touching the global random state.
rng = random.Random(SPLIT_SEED)

with open(LASLA_DIR + "perseus-train.tsv", "w") as train, \
        open(LASLA_DIR + "perseus-test.tsv", "w") as test, \
        open(LASLA_DIR + "perseus-dev.tsv", "w") as dev:
    header_written = False
    for file, nb_chunks in use.items():
        header_line, header, sentences = _read_sentences(LASLA_DIR + "train/" + file)

        # The header goes once at the top of each output file, not once per input file.
        if header_line is not None and not header_written:
            for out in (train, test, dev):
                out.write(header_line+"\n")
            header_written = True

        rng.shuffle(sentences)

        # Train: the first nb_chunks sentences (fewer if the file is too short).
        _write_sentences(train, sentences[:nb_chunks], header, train_used, file)

        # Dev: 10% of the expected train size, rounded up (so at least one
        # sentence), written only when the file has enough sentences left.
        nb_chunks_dev = math.ceil(0.1 * nb_chunks)
        test_start = nb_chunks + nb_chunks_dev
        if test_start <= len(sentences):
            _write_sentences(dev, sentences[nb_chunks:test_start], header, dev_used, file)

        # Test: up to 20% of the expected train size (rounded up), taken after
        # the train and dev slices and capped by what is left in the file.
        nb_chunks_test = math.ceil(0.2 * nb_chunks)
        test_end = min(len(sentences), test_start + nb_chunks_test)
        if test_start < len(sentences):
            _write_sentences(test, sentences[test_start:test_end], header, test_used, file)


In [103]:
# Get the table
import re
def get_title(filename):
    """Build a short label from the last part of a ``.tsv`` file name.

    The label joins, with ", ", the non-empty pieces captured just before the
    ``.tsv`` extension: a capitalised prefix, an optional second capitalised
    part and an optional number (one leading zero dropped), e.g.
    ``"Ovidius_Fasti_OvFasti1.tsv"`` -> ``"Ov, Fasti, 1"``.
    Returns ``None`` when the file name does not match.
    """
    match = re.search(r"([A-Z][a-z]+)([A-Z][a-zA-Z]+)?0?(\d+)?\.tsv", filename)
    if match is None:
        return None
    return ", ".join(part for part in match.groups() if part)

# Two header rows: the measure (tokens / chunks), then the split it refers to.
rows = [
    ["File", "Tokens", "", "", "Chunks", "", ""],
    ["", "Train", "Dev", "Test", "Train", "Dev", "Test"],
]
total = {"chunks": {"train": 0, "dev": 0, "test": 0}, "tokens": {"train": 0, "dev": 0, "test": 0}}

# Counters filled while writing the splits, in the column order of the table.
counters_by_corpus = {"train": train_used, "dev": dev_used, "test": test_used}

for file in use:
    row = [get_title(file)]
    for cat in ("tokens", "chunks"):
        for corpus, counter in counters_by_corpus.items():
            value = counter[cat][file]
            row.append(value)
            total[cat][corpus] += value
    rows.append(row)

# Last row: column-wise totals, in the same column order as above.
total_row = ["Total"]
for cat in ("tokens", "chunks"):
    total_row.extend(total[cat].values())
rows.append(total_row)


print(tabulate.tabulate(rows, tablefmt='latex'))

\begin{tabular}{lllllll}
\hline
 File         & Tokens &      &      & Chunks &     &      \\
              & Train  & Dev  & Test & Train  & Dev & Test \\
 Petron, Sa   & 1955   & 176  & 402  & 136    & 14  & 28   \\
 Ov, Fasti, 1 & 2251   & 208  & 455  & 136    & 14  & 28   \\
 Ov, Fasti, 2 & 1901   & 186  & 445  & 136    & 14  & 28   \\
 Ov, Fasti, 3 & 2225   & 154  & 439  & 136    & 14  & 28   \\
 Tac, Hist, 1 & 281    & 72   & 51   & 13     & 2   & 3    \\
 Tac, Hist, 2 & 251    & 70   & 88   & 13     & 2   & 3    \\
 Tac, Hist, 3 & 269    & 40   & 57   & 13     & 2   & 3    \\
 Tac, Hist, 4 & 592    & 30   & 95   & 13     & 2   & 3    \\
 Tac, Hist, 5 & 516    & 126  & 144  & 13     & 2   & 3    \\
 Caes, BG, 1  & 146    & 26   & 54   & 4      & 1   & 1    \\
 Caes, BG, 2  & 110    & 63   & 23   & 4      & 1   & 1    \\
 Caes, BG, 3  & 76     & 32   & 19   & 4      & 1   & 1    \\
 Caes, BG, 4  & 99     & 32   & 30   & 4      & 1   & 1    \\
 Caes, BG, 5  & 73     & 10   & 16   &

In [107]:
# Write the configuration
# Create the configuration files for all files
import json

# Training configuration, written out below as a single JSON file.
BASE_CONFIG = {
    # Model name and paths
    "modelname": "model-perseus-",
    "modelpath": "./models/",
    "run_test": False,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/train-perseus.tsv",
    "test_path": "./protogenie-partial/test-perseus.tsv",
    "dev_path": "./protogenie-partial/dev-perseus.tsv",
    # Input format
    "breakline_ref": "pos",
    "breakline_data": "$.",
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": True,
    "char_bos": True,
    "header": True,
    "sep": "\t",
    # Tasks
    "tasks": [
        {
            "name": "lemma",
            "target": True,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": True,
                "eos": True,
                "lower": True,
                "target": "lemma",
            },
            "layer": -1,
        },
        {"name": "pos"},
        {"name": "Gend"},
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence",
    },
    # Schedules
    "patience": 5,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": True,
    "lm_shared_softmax": True,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min",
    },
    # Optimisation
    "batch_size": 256,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 2,
    "epochs": 100,
    # Network
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    # Reduced Cemb DIM from 300 to 150
    "cemb_dim": 150,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    # Remaining settings
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": True,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": False,
    "word_dropout": 0,
    "shuffle": True,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": False,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": False,
    "custom_cemb_cell": False,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1,
}

# Serialise the configuration next to the other LASLA configs.
with open("../../../LASLA/configs/perseus-train.json", "w") as config_file:
    json.dump(BASE_CONFIG, config_file)