# Extensibilité des résultats en fonction de la taille du corpus d'entraînement

On réduit le corpus d'entraînement à une fraction de sa taille : 1 %, 5 %, 7,5 %, 10 %, 20 %, 40 %, 60 % et 80 %.

En fonction de cette coupe, on analyse le résultat sur trois tâches: 
- la lemmatisation car elle est la tâche centrale du lemmatiseur,
- la POS car elle nécessite une compréhension de la syntaxe et du vocabulaire,
- le genre car il nécessite une approche morphologique.

On obtient les coupes suivantes :

In [1]:
import glob
import os.path
from collections import defaultdict

def count_train_tokens(lines):
    """Sum the token counts per author from the "train" table of a split log.

    The lines are scanned for a line equal to ``train``, then for the
    ``File Chunks Tokens`` header that follows it. Every later row of the
    form ``<file> <chunks> <tokens>`` is accumulated until the
    ``# train's statistics`` marker (or the end of the input) is reached.

    Args:
        lines: Iterable of text lines; an open text file works.

    Returns:
        A dict mapping each author to the sum of its token counts. The
        author is the file name without ``.tsv``, without one trailing
        digit, cut at the first underscore.

    Raises:
        ValueError: If a non-blank table row does not have exactly three
            whitespace-separated fields or its token count is not an int.
    """
    counts = {}
    in_train = False
    in_table = False
    for line in lines:
        stripped = line.strip()
        # First wait for the "train" section, then for its table header.
        if not in_train:
            in_train = stripped == "train"
            continue
        if not in_table:
            in_table = stripped.split() == ["File", "Chunks", "Tokens"]
            continue
        if stripped == "# train's statistics":
            break
        if not stripped:
            # A blank line carries no data; unpacking it below would raise.
            continue
        text, _chunks, tokens = stripped.split()
        text = text.replace(".tsv", "")
        if text[-1].isnumeric():
            text = text[:-1]
        # Reduce by author to have a manageable list
        author = text.split("_")[0]
        counts[author] = counts.get(author, 0) + int(tokens)
    return counts


# toks[author][percent] -> number of training tokens for that author in the
# cut identified by `percent`; `files` keeps the cut labels in sorted order.
toks = defaultdict(lambda: defaultdict(lambda: 0))
files = []
for file in sorted(glob.glob("../../../LASLA/0.*.txt")):
    # The cut label is the log's base name without its extension.
    percent = os.path.basename(file).replace(".txt", "")
    files.append(percent)
    with open(file, encoding="utf-8") as f:
        for author, count in count_train_tokens(f).items():
            toks[author][percent] += count

In [2]:
# Create a table like representation
import tabulate
from IPython.display import HTML, display

# One row per author (alphabetical), one column per training-set cut,
# preceded by a header row and followed by a row of per-cut totals.
keys = sorted(toks)

total = dict.fromkeys(files, 0)
rows = [[""] + list(files)]

for key in keys:
    counts = [toks[key][file] for file in files]
    rows.append([key] + counts)
    for file, count in zip(files, counts):
        total[file] += count

rows.append(["Total"] + [total[file] for file in files])

# Render the same table twice: as HTML in the notebook, as LaTeX on stdout.
html_table = tabulate.tabulate(rows, tablefmt='html')
display(HTML(html_table))

latex_table = tabulate.tabulate(rows, tablefmt='latex')
print(latex_table)

0,1,2,3,4,5,6,7,8
,0.01,0.05,0.075,0.1,0.2,0.4,0.6,0.8
Caesar,827.0,3576.0,5210.0,7023.0,13642.0,28654.0,43006.0,56210.0
Cato,238.0,1076.0,1450.0,1873.0,4138.0,7113.0,10221.0,13426.0
Catullus,171.0,626.0,802.0,1098.0,2233.0,4257.0,6283.0,9206.0
Cicero,6861.0,27012.0,38738.0,49070.0,92026.0,171129.0,250420.0,330028.0
Curtius,652.0,3191.0,4534.0,6190.0,12564.0,24348.0,37070.0,50085.0
Hirtius,82.0,325.0,470.0,630.0,1281.0,2399.0,3539.0,4572.0
Horatius,487.0,2257.0,3356.0,4267.0,8181.0,16072.0,23916.0,32172.0
Juvenalis,311.0,1008.0,1405.0,1953.0,4141.0,8118.0,12599.0,17167.0
Lucretius,846.0,2838.0,4001.0,5074.0,9342.0,18545.0,27496.0,35730.0


\begin{tabular}{lrrrrrrrr}
\hline
               &     0.01 &     0.05 &      0.075 &      0.1 &      0.2 &      0.4 &      0.6 &      0.8         \\
 Caesar        &   827    &  3576    &   5210     &   7023   &  13642   &  28654   &  43006   &  56210           \\
 Cato          &   238    &  1076    &   1450     &   1873   &   4138   &   7113   &  10221   &  13426           \\
 Catullus      &   171    &   626    &    802     &   1098   &   2233   &   4257   &   6283   &   9206           \\
 Cicero        &  6861    & 27012    &  38738     &  49070   &  92026   & 171129   & 250420   & 330028           \\
 Curtius       &   652    &  3191    &   4534     &   6190   &  12564   &  24348   &  37070   &  50085           \\
 Hirtius       &    82    &   325    &    470     &    630   &   1281   &   2399   &   3539   &   4572           \\
 Horatius      &   487    &  2257    &   3356     &   4267   &   8181   &  16072   &  23916   &  32172           \\
 Juvenalis     &   311    &  1008    &

In [23]:
# Create the configuration files for all files
import json

# Training configuration template, kept as JSON text and parsed once.
# "modelname" holds a positional "{}" placeholder and the three "*_path"
# entries hold a "{per}" placeholder; they are filled in per training cut.
_BASE_CONFIG_TEXT = """{
    "modelname": "model-percent-{}-",
    "modelpath": "./models/",
    "run_test": false,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/{per}train.tsv",
    "test_path": "./protogenie-partial/{per}test.tsv",
    "dev_path": "./protogenie-partial/{per}dev.tsv",
    "breakline_ref": "pos",
    "breakline_data": "$.",
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": true,
    "char_bos": true,
    "header": true,
    "sep": "\\t",
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        },
        {
            "name": "pos"
        },
        {
            "name": "Gend"
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    "patience": 10,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "batch_size": 256,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 8,
    "epochs": 100,
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": false,
    "word_dropout": 0,
    "shuffle": true,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "custom_cemb_cell": false,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1
}"""

BASE_CONFIG = json.loads(_BASE_CONFIG_TEXT)

import copy
# Write one JSON configuration file per training-set cut listed in `files`.
for file in files:
    config = copy.deepcopy(BASE_CONFIG)
    # The model name gets the cut label with its dots replaced by commas.
    config["modelname"] = config["modelname"].format(file.replace(".", ","))
    # The train/dev/test paths all take the raw cut label.
    for path_key in ("input_path", "dev_path", "test_path"):
        config[path_key] = config[path_key].format(per=file)
    target = "../../../LASLA/configs/partial-{}.json".format(file)
    with open(target, "w") as f:
        json.dump(config, f)
        

In [24]:
# Create the configuration files for all files
import json

# Training configuration template for the "simpler" models, kept as JSON
# text and parsed once. "modelname" holds a positional "{}" placeholder and
# the three "*_path" entries hold a "{per}" placeholder; they are filled in
# per training cut.
_SIMPLER_CONFIG_TEXT = """{
    "modelname": "model-percent-simpler-{}-",
    "modelpath": "./models/",
    "run_test": false,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/{per}train.tsv",
    "test_path": "./protogenie-partial/{per}test.tsv",
    "dev_path": "./protogenie-partial/{per}dev.tsv",
    "breakline_ref": "pos",
    "breakline_data": "$.",
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": true,
    "char_bos": true,
    "header": true,
    "sep": "\\t",
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        },
        {
            "name": "pos"
        },
        {
            "name": "Gend"
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "batch_size": 64,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 10,
    "epochs": 100,
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    "cemb_dim": 150,
    "cemb_type": "rnn",
    "cemb_layers": 1,
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": false,
    "word_dropout": 0,
    "shuffle": true,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "custom_cemb_cell": false,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1
}"""

BASE_CONFIG = json.loads(_SIMPLER_CONFIG_TEXT)

import copy
# Write one "simpler" JSON configuration file per training-set cut in `files`.
for file in files:
    config = copy.deepcopy(BASE_CONFIG)
    # The model name gets the cut label with its dots replaced by commas.
    config["modelname"] = config["modelname"].format(file.replace(".", ","))
    # The train/test/dev paths all take the raw cut label.
    for path_key in ("input_path", "test_path", "dev_path"):
        config[path_key] = config[path_key].format(per=file)
    target = "../../../LASLA/configs/partial-simpler-{}.json".format(file)
    with open(target, "w") as f:
        json.dump(config, f)
        