# Objectif

L'objectif de ce notebook est de compiler la liste des œuvres existantes, avec leur nombre de tokens, quelle que soit leur utilisation dans l'entraînement général.

Étapes prévues :

| Numéro | Type | Titre |
| ------ | ---- | ----- |
| 1      | Auto | Lister les œuvres avec leurs tokens |
| 2      | Manu | Répartir les œuvres en corpus dans une feuille CSV |
| 3      | Config | Générer les corpus et les configurations |

## Étape 1

In [1]:
# Column headers meant to be kept when the corpora are generated in step 3.
# NOTE(review): `kept_header` is not referenced by any of the visible cells;
# the step 3 cell defines its own `used_tags` list with the same four values.
kept_header = ["token", "lemma", "pos", "Gend"]

import glob
from collections import Counter
import os.path
import re


# Get the table
def get_title(filename):
    """Build a short title from the abbreviated part of a ``.tsv`` file name.

    The first place in *filename* that matches the pattern is used: a
    capitalised word, an optional second capitalised part, an optional single
    ``0`` that is skipped, an optional number, then ``.tsv``. The parts that
    matched are joined with ``", "``.

    Returns ``None`` when nothing in *filename* matches the pattern.
    """
    found = re.search(r"([A-Z][a-z]+)([A-Z][a-zA-Z]+)?0?(\d+)?\.tsv", filename)
    if found is None:
        return None
    # Optional groups that did not take part in the match are dropped
    parts = filter(None, found.groups())
    return ", ".join(parts)


# Token count of every work, keyed by (title, file name)
nb_tokens = Counter()

# glob.glob is called without recursive=True, so "**" matches exactly like "*":
# only files one directory below mood-tense-voice are listed
for path in sorted(glob.glob("../../../LASLA/mood-tense-voice/**/*.tsv")):
    basename = os.path.basename(path)
    work_title = get_title(basename)
    with open(path) as handle:
        # Each non-blank line counts as one token ...
        non_blank = sum(1 for raw in handle if raw.strip())
    # ... except the first one, which is the header row
    nb_tokens[(work_title, basename)] += non_blank - 1

In [2]:
# Export the token counts gathered in `nb_tokens` as a CSV file with one row
# per (title, file) pair
import csv

# newline="" leaves line endings to the csv module, as its documentation
# requires; without it, platforms that translate "\n" on write end every row
# with "\r\r\n"
with open("1.4.5.d-lemmatisation-impact-liste-corpus-auto.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "file", "tokens"])
    for ((title, file), cnt) in nb_tokens.items():
        writer.writerow([title, file, cnt])

## Étape 2

Travail manuel

## Étape 3

Génération des corpus

In [3]:
# Imported here as well so that this cell does not depend on an earlier cell
# having imported csv
import csv
from collections import defaultdict

# Maps a source file name to the list of corpus names it was assigned to
corpora = defaultdict(list)

# newline="" is how the csv module expects the files it reads to be opened
with open("1.4.5.d-lemmatisation-impact-selection-corpus-manuelle.csv", newline="") as f:
    reader = csv.DictReader(f)
    # Every column after the first three is treated as a corpus name
    # (presumably the first three are title, file and tokens - confirm against
    # the manual sheet). An empty sheet has no header row and yields no corpus.
    need_build = (reader.fieldnames or [])[3:]
    for row in reader:
        for corpus in need_build:
            # Only a cell containing exactly "y" assigns the file to the corpus
            if row[corpus] == "y":
                corpora[row["file"]].append(corpus)

In [4]:
# Columns copied from the source files into the generated corpora
used_tags = ["token", "lemma", "pos", "Gend"]
# Names of the corpora built from a file flagged "test"
tests = []
# Maps a corpus name to its list of sentences (each a list of token dicts)
output_corpus = defaultdict(list)

for path in sorted(glob.glob("../../../LASLA/mood-tense-voice/**/*.tsv")):
    basename = os.path.basename(path)
    if basename not in corpora:
        # File not selected for any corpus
        continue

    with open(path) as handle:
        columns = None
        sentences = [[]]
        for index, raw in enumerate(handle):
            stripped = raw.strip()
            if index == 0:
                # First line holds the column names
                columns = stripped.split("\t")
            elif stripped:
                # Token line: keep only the columns listed in used_tags
                sentences[-1].append({
                    name: value
                    for name, value in zip(columns, stripped.split("\t"))
                    if name in used_tags
                })
            else:
                # Blank line: the current sentence ends, open a new one
                sentences.append([])

    # Consecutive blank lines leave empty sentences behind; skip them
    filled = [sentence for sentence in sentences if sentence]
    for target in corpora[basename]:
        if target == "test":
            # A test file becomes its own corpus, named after the file
            target = basename.replace(".tsv", "")
            tests.append(target)
        output_corpus[target].extend(filled)

In [5]:
print(output_corpus.keys())
import os
import random


def _write_sentences(path, sentences, tags):
    """Write ``sentences`` to ``path`` as a tab-separated file.

    The file starts with a header row made of ``tags``. Each token is written
    as one row holding its value for every tag, in the order of ``tags``, and
    every sentence is followed by an empty line.

    Raises ``KeyError`` if a token has no value for one of ``tags``.
    """
    with open(path, "w") as f:
        f.write("\t".join(tags) + "\n")
        for sentence in sentences:
            for row in sentence:
                f.write("\t".join([row[tag] for tag in tags]) + "\n")
            f.write("\n")


out_dir = "./1.4.5.d-lemmatisation-impact"
# Creates both the output directory and its tests/ sub-directory
os.makedirs(out_dir + "/tests", exist_ok=True)

with open("./1.4.5.d-lemmatisation-impact-information-corpora.csv", "w") as stats:
    stats.write("corpus,train,dev,train_toks,dev_toks\n")
    for corpus in output_corpus:
        sentences = output_corpus[corpus]
        # Shuffled in place and unseeded: the split differs from run to run
        random.shuffle(sentences)
        # The first 95% of the shuffled sentences form train, the rest dev
        train = int(len(sentences) * 0.95)
        dev = len(sentences) - train
        train_toks = sum(len(s) for s in sentences[:train])
        dev_toks = sum(len(s) for s in sentences[train:])
        stats.write(",".join(map(str, [corpus, train, dev, train_toks, dev_toks])) + "\n")

        if corpus in tests:
            # NOTE(review): only the [:train] slice is written for a test
            # corpus, so 5% of its sentences end up in no file. Kept as in the
            # original; confirm whether the whole corpus should be written.
            _write_sentences(out_dir + "/tests/" + corpus + ".tsv", sentences[:train], used_tags)
            continue

        _write_sentences(out_dir + "/" + corpus + "-train.tsv", sentences[:train], used_tags)
        _write_sentences(out_dir + "/" + corpus + "-dev.tsv", sentences[train:], used_tags)

dict_keys(['prose', 'corpus-divers', 'Caesar_BellumGallicum_CaesBG3', 'Catullus_Catullus_Catul', 'cicero-discours', 'Cicero_DeAmicitia_CicAmici', 'Cicero_InCatilinam_CicCat1', 'ciceron-petit', 'Curtius_CurtiusHistoriaeAlexandriMagni_QCurt03', 'vers', 'horace+lucrece', 'Horatius_Epodi_HorEpodi', 'ovide', 'Ovidius_InIbin_OviIbin', 'Petronius_PetroniusSatiricon_PetronSa', 'theatre-plaute-seneque', 'PseudoCaesar1_BellumAfricanum_BAfr', 'Sallustius_Catilina_SalCatil', 'seneque-philo-autres', 'seneque-lucilium', 'Seneca_DeBrevitateVitae_SenBrevi', 'Seneca_Medea_SenMedea', 'tacite', 'Tacitus_TacGermania_TacGerma', 'virgile'])


## Étape 4 : Configurations simples + complexes

In [15]:
# Write one JSON configuration file per non-test corpus into configs/
import copy
import json
import os

# Settings shared by every corpus. "modelname", "input_path" and "dev_path" are
# overwritten for each corpus in the loop below.
BASE_CONFIG = json.loads("""{
    "modelname": "model-{}-",
    "modelpath": "./models/",
    "run_test": false,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/{per}train.tsv",
    "dev_path": "./protogenie-partial/{per}dev.tsv",
    "breakline_ref": "pos",
    "breakline_data": "$.",
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": true,
    "char_bos": true,
    "header": true,
    "sep": "\\t",
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        },
        {
            "name": "pos"
        },
        {
            "name": "Gend"
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    "batch_size": 64,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 10,
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "epochs": 100,
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": false,
    "word_dropout": 0,
    "shuffle": true,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "custom_cemb_cell": false,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1
}""")

# open() does not create missing directories, so make sure configs/ exists
os.makedirs("configs", exist_ok=True)

for corpus in output_corpus:
    if corpus in tests:
        # Corpora listed in `tests` get no configuration file
        continue
    # Deep copy so the nested task settings of BASE_CONFIG are never shared
    config = copy.deepcopy(BASE_CONFIG)
    # "+" in a corpus name is replaced by "_" in the model name only
    config["modelname"] = config["modelname"].format("1.4.5.d-"+corpus.replace("+", "_")+"-")
    config["input_path"] = "./1.4.5.d-lemmatisation-impact/"+corpus+"-train.tsv"
    config["dev_path"] = "./1.4.5.d-lemmatisation-impact/"+corpus+"-dev.tsv"
    with open("configs/1.4.5.4-{}.json".format(corpus), "w") as f:
        json.dump(config, f)
        

In [16]:
# Write one "light" JSON configuration file per non-test corpus into configs/.
# Compared with the non-light configuration of the previous cell, this one uses
# "cemb_dim": 150 instead of 300 and "cemb_layers": 1 instead of 2, and adds a
# "-light" suffix to the model name and to the file name.
import copy
import json
import os

# Settings shared by every corpus. "modelname", "input_path" and "dev_path" are
# overwritten for each corpus in the loop below.
BASE_CONFIG = json.loads("""{
    "modelname": "model-{}-",
    "modelpath": "./models/",
    "run_test": false,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "input_path": "./protogenie-partial/{per}train.tsv",
    "dev_path": "./protogenie-partial/{per}dev.tsv",
    "breakline_ref": "pos",
    "breakline_data": "$.",
    "char_max_size": 500,
    "word_max_size": 20000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    "char_eos": true,
    "char_bos": true,
    "header": true,
    "sep": "\\t",
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        },
        {
            "name": "pos"
        },
        {
            "name": "Gend"
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    "batch_size": 64,
    "dropout": 0.25,
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 10,
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "epochs": 100,
    "cell": "GRU",
    "num_layers": 1,
    "hidden_size": 128,
    "wemb_dim": 100,
    "cemb_dim": 150,
    "cemb_type": "rnn",
    "cemb_layers": 1,
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,
    "minimize_pad": false,
    "word_dropout": 0,
    "shuffle": true,
    "optimizer": "Adam",
    "clip_norm": 5,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "custom_cemb_cell": false,
    "merge_type": "concat",
    "scorer": "general",
    "linear_layers": 1
}""")

# open() does not create missing directories, so make sure configs/ exists
os.makedirs("configs", exist_ok=True)

for corpus in output_corpus:
    if corpus in tests:
        # Corpora listed in `tests` get no configuration file
        continue
    # Deep copy so the nested task settings of BASE_CONFIG are never shared
    config = copy.deepcopy(BASE_CONFIG)
    # "+" in a corpus name is replaced by "_" in the model name only
    config["modelname"] = config["modelname"].format("1.4.5.d-"+corpus.replace("+", "_")+"-light-")
    config["input_path"] = "./1.4.5.d-lemmatisation-impact/"+corpus+"-train.tsv"
    config["dev_path"] = "./1.4.5.d-lemmatisation-impact/"+corpus+"-dev.tsv"
    with open("configs/1.4.5.4-{}-light.json".format(corpus), "w") as f:
        json.dump(config, f)
        