# Extensibilité des résultats en fonction de la taille du corpus d'entraînement

On applique une coupe au corpus d'entraînement pour n'en conserver que 1 %, 5 %, 10 %, 20 %, etc.

En fonction de cette coupe, on analyse le résultat sur trois tâches: 
- la lemmatisation car elle est la tâche centrale du lemmatiseur,
- la POS car elle nécessite une compréhension de la syntaxe et du vocabulaire,
- le genre car il nécessite une approche morphologique.

On obtient les coupes suivantes :

In [35]:
import glob
import os.path
from collections import defaultdict

def iter_author_tokens(lines):
    """Yield ``(author, token_count)`` pairs from the train table of a report.

    The table is located by skipping everything up to the first line that is
    exactly ``train``, then everything up to the ``File Chunks Tokens`` header
    that follows it. Rows are read until the ``# train's statistics`` marker
    (or the end of the input).

    Args:
        lines: An iterable of text lines (e.g. an open file).

    Yields:
        Tuples ``(author, tokens)`` where ``author`` is the part of the file
        name before the first underscore and ``tokens`` is an ``int``.

    Raises:
        ValueError: If a non-empty row does not have exactly three
            whitespace-separated fields, or its token count is not an integer.
    """
    stripped_lines = (raw.strip() for raw in lines)

    # The three loops share one iterator, so each resumes where the previous
    # one stopped: section marker -> column header -> data rows.
    for current in stripped_lines:
        if current == "train":
            break
    for current in stripped_lines:
        if current.split() == ["File", "Chunks", "Tokens"]:
            break
    for current in stripped_lines:
        if current == "# train's statistics":
            return
        if not current:
            # Blank separator lines carry no data; unpacking them would raise.
            continue
        text, _chunks, tokens = current.split()
        text = text.replace(".tsv", "")
        # Drop a single trailing digit (e.g. a book number) from the file stem.
        if text and text[-1].isnumeric():
            text = text[:-1]
        # Reduce by author to have a manageable list
        yield text.split("_")[0], int(tokens)


# toks[author][cut] -> number of training tokens for that author in that cut.
toks = defaultdict(lambda: defaultdict(int))
# Cut identifiers (file names without ".txt"), in sorted file order.
files = []
for path in sorted(glob.glob("../../../LASLA/0.*.txt")):
    percent = os.path.basename(path).replace(".txt", "")
    files.append(percent)
    with open(path, encoding="utf-8") as handle:
        for author, tokens in iter_author_tokens(handle):
            toks[author][percent] += tokens

In [38]:
# Create a table like representation
import tabulate
from IPython.display import HTML, display

# Authors in alphabetical order; each one becomes a table row.
keys = sorted(toks)

# Header row: an empty corner cell followed by one column per cut.
rows = [[""] + list(files)]

# One row per author holding its token count for every cut.
rows += [[key] + [toks[key][cut] for cut in files] for key in keys]

# Column-wise sums over all authors, shown as the closing "Total" row.
total = {cut: sum(toks[key][cut] for key in keys) for cut in files}
rows.append(["Total"] + [total[cut] for cut in files])

# Rich rendering inside the notebook...
display(HTML(tabulate.tabulate(rows, tablefmt='html')))

# ...and a LaTeX version printed as plain text.
print(tabulate.tabulate(rows, tablefmt='latex'))

0,1,2,3,4,5,6,7,8
,0.01,0.05,0.1,0.2,0.25,0.3,0.4,0.5
Caesar,536.0,3224.0,6784.0,13525.0,17071.0,20843.0,28460.0,35607.0
Cato,172.0,977.0,1795.0,4107.0,4930.0,5649.0,7098.0,8601.0
Catullus,160.0,611.0,1084.0,2225.0,2825.0,3271.0,4254.0,4899.0
Cicero,5067.0,25296.0,47651.0,90787.0,111373.0,130462.0,170087.0,209291.0
Curtius,404.0,2901.0,5977.0,12330.0,15589.0,18233.0,24140.0,30352.0
Hirtius,59.0,313.0,597.0,1251.0,1525.0,1851.0,2369.0,2960.0
Horatius,257.0,2046.0,4084.0,8016.0,10016.0,12078.0,15874.0,19855.0
Juvenalis,304.0,989.0,1921.0,4126.0,5175.0,6267.0,8100.0,10586.0
Lucretius,614.0,2664.0,4947.0,9255.0,11506.0,13828.0,18419.0,23012.0


\begin{tabular}{lrrrrrrrr}
\hline
               &     0.01 &     0.05 &      0.1 &      0.2 &      0.25 &      0.3 &      0.4 &      0.5 \\
 Caesar        &   536    &  3224    &   6784   &  13525   &  17071    &  20843   &  28460   &  35607   \\
 Cato          &   172    &   977    &   1795   &   4107   &   4930    &   5649   &   7098   &   8601   \\
 Catullus      &   160    &   611    &   1084   &   2225   &   2825    &   3271   &   4254   &   4899   \\
 Cicero        &  5067    & 25296    &  47651   &  90787   & 111373    & 130462   & 170087   & 209291   \\
 Curtius       &   404    &  2901    &   5977   &  12330   &  15589    &  18233   &  24140   &  30352   \\
 Hirtius       &    59    &   313    &    597   &   1251   &   1525    &   1851   &   2369   &   2960   \\
 Horatius      &   257    &  2046    &   4084   &   8016   &  10016    &  12078   &  15874   &  19855   \\
 Juvenalis     &   304    &   989    &   1921   &   4126   &   5175    &   6267   &   8100   &  10586   \\
 Lu

In [48]:
# Create the configuration files for all files
import json

# Template configuration shared by every partial-corpus run, written as a
# native dict. "modelname" and "input_path" contain str.format placeholders
# ("{}" and "{per}") that are filled in for each cut; all other values are
# used as they are.
BASE_CONFIG = {
    # --- Model identity and data locations ---
    "modelname": "model-percent-{}-",
    "modelpath": "./models/",
    "run_test": False,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/{per}train.tsv",
    "test_path": "./protogenie-partial/test.tsv",
    "dev_path": "./protogenie-partial/dev.tsv",
    "breakline_ref": "pos",
    "breakline_data": "$.",
    # --- Vocabulary limits and input file format ---
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": True,
    "char_bos": True,
    "header": True,
    "sep": "\t",
    # --- Tasks: lemma is fully specified, pos and Gend use task_defaults ---
    "tasks": [
        {
            "name": "lemma",
            "target": True,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": True,
                "eos": True,
                "lower": True,
                "target": "lemma",
            },
            "layer": -1,
        },
        {
            "name": "pos",
        },
        {
            "name": "Gend",
        },
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence",
    },
    # --- Scheduling ---
    "patience": 5,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": True,
    "lm_shared_softmax": True,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min",
    },
    # --- Optimisation and architecture ---
    "batch_size": 256,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 2,
    "epochs": 100,
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": True,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": False,
    "word_dropout": 0,
    "shuffle": True,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": False,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": False,
    "custom_cemb_cell": False,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1,
}

import copy
# Write one configuration file per cut, derived from BASE_CONFIG.
import os

CONFIG_DIR = "../../../LASLA/configs"
# Make sure the target directory exists; otherwise the first open() below
# raises FileNotFoundError.
os.makedirs(CONFIG_DIR, exist_ok=True)

for file in files:
    # Deep copy so the template (including its nested task dicts) stays intact.
    config = copy.deepcopy(BASE_CONFIG)
    # The model name uses a comma in place of the dot ("0.01" -> "0,01").
    config["modelname"] = config["modelname"].format(file.replace(".", ","))
    config["input_path"] = config["input_path"].format(per=file)
    with open("{}/partial-{}.json".format(CONFIG_DIR, file), "w", encoding="utf-8") as f:
        json.dump(config, f)