# Objectif

L'objectif de ce notebook est de créer les configurations à évaluer.

On séparera les études de configuration de lemmatisation et de tâches morpho-syntaxiques.

## Configurations Lemmatisation

Options ayant un impact:

- CEMB DIM
- CEMB TYPE
- CEMB LAYERS
- LEARNING RATES

```json
{
    // Output information
    "modelname": "latest-lasla-lat",
    "modelpath": "./models/",
    // Run information
    "run_test": true,
    // Input data information
    "input_path": "./datasets/lat/better-corpus/train.tsv",
    "test_path": "./datasets/lat/better-corpus/test.tsv",
    "dev_path": "./datasets/lat/better-corpus/dev.tsv",
    "header": true,
    "sep": "\t",
    "breakline_ref": "pos", // Not used here because empty lines mark sequence changes
    "breakline_data": "NONE", // Not used here
    // Input metrics
    "char_max_size": 500,
    "word_max_size": 28000,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    // Use EOS and BOS
    "char_eos": true,
    "char_bos": true,
    // Tasks
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        },
        //[{"name": task} for task in tasks]
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.00001,
    "min_weight": 0.2,
    
    // Language Model Information
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "batch_size": 256,
    "epochs": 100,
    "dropout": 0.25,
    "word_dropout": 0,
    // Learning rate update
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 10,
    "optimizer": "Adam",
    "clip_norm": 5,
    
    // Tache linéaires
    "linear_layers": 1,
    // Tache linéaires: Encodeur
    "hidden_size": 128,
    "num_layers": 1,
    "cell": "GRU",
    // Taches linéaires: Word Embedding et Mixer
    "wemb_dim": 100,
    "merge_type": "concat",
    // Lemmatisation : Decoder et CEMB
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "decoder_layers": 3, // Would be nice ?
    "custom_cemb_cell": false,
    // Training options
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,  // Sentence in memory
    "minimize_pad": false,
    "shuffle": true,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "scorer": "general"
}


```

In [36]:
In[len(In)-2]

'import copy\n\nCNN = copy.deepcopy(data)\nRNN = copy.deepcopy(data)\n\nCNN["modelname"] = "cemb_CNN_-"\nRNN["modelname"] = "cemb_RNN_-"\n\nwith open("./configs/1.4.4.a-CNN_vs_RNN-CNN.json", "w") as f:\n    json.dump(CNN, f)\n\nwith open("./configs/1.4.4.a-CNN_vs_RNN-RNN.json", "w") as f:\n    json.dump(RNN, f)'

In [31]:
import json

data = json.loads(
    "\n".join(
        [
            line.split("//")[0]
            for line in """{
            // Output information
            "modelname": "latest-lasla-lat",
            "modelpath": "./models/",
            // Run information
            "run_test": true,
            // Input data information
            "input_path": "./mood-tense-voice-pft/train.tsv",
            "test_path": "./mood-tense-voice-pft/test.tsv",
            "dev_path": "./mood-tense-voice-pft/dev.tsv",
            "header": true,
            "sep": "\\t",
            "breakline_ref": "pos", // Not used here because empty lines mark sequence changes
            "breakline_data": "NONE", // Not used here
            // Input metrics
            "char_max_size": 500,
            "word_max_size": 28000,
            "max_sent_len": 35,
            "max_sents": 1000000,
            "char_min_freq": 1,
            "word_min_freq": 1,
            // Use EOS and BOS
            "char_eos": true,
            "char_bos": true,
            // Tasks
            "tasks": [
                {
                    "name": "lemma",
                    "target": true,
                    "context": "sentence",
                    "level": "char",
                    "decoder": "attentional",
                    "settings": {
                        "bos": true,
                        "eos": true,
                        "lower": true,
                        "target": "lemma"
                    },
                    "layer": -1
                }//,
                //[{"name": task} for task in tasks]
            ],
            "task_defaults": {
                "level": "token",
                "layer": -1,
                "decoder": "linear",
                "context": "sentence"
            },

            "patience": 5,
            "factor": 0.5,
            "threshold": 0,
            "min_weight": 0,

            // Language Model Information
            "include_lm": true,
            "lm_shared_softmax": true,
            "lm_schedule": {
                "patience": 2,
                "factor": 0.5,
                "weight": 0.2,
                "mode": "min"
            },
            "batch_size": 256,
            "epochs": 100,
            "dropout": 0.25,
            "word_dropout": 0,
            // Learning rate update
            "lr": 0.001,
            "lr_factor": 0.75,
            "lr_patience": 2,
            "optimizer": "Adam",
            "clip_norm": 5,

            // Tache linéaires
            "linear_layers": 1,
            // Tache linéaires: Encodeur
            "hidden_size": 128,
            "num_layers": 1,
            "cell": "GRU",
            // Taches linéaires: Word Embedding et Mixer
            "wemb_dim": 100,
            "merge_type": "concat",
            // Lemmatisation : Decoder et CEMB
            "cemb_dim": 300,
            "cemb_type": "rnn",
            "cemb_layers": 2,
            "decoder_layers": 3, // Would be nice ?
            "custom_cemb_cell": false,
            // Training options
            "checks_per_epoch": 1,
            "report_freq": 200,
            "verbose": true,
            "device": "cuda",
            "buffer_size": 10000,  // Sentence in memory
            "minimize_pad": false,
            "shuffle": true,
            "pretrain_embeddings": false,
            "load_pretrained_embeddings": "",
            "load_pretrained_encoder": "",
            "freeze_embeddings": false,
            "scorer": "general"
        }
        """.split("\n") 
                             if not line.strip().startswith("//") and line.strip()
                            ]
                           )
                 )  # json.loads

In [32]:
# Add tasks
with open("/home/thibault/dev/LASLA/mood-tense-voice-pft/test.tsv") as f:
    for line in f:
        break
data["tasks"].extend([
    {"name": task}
    for task in line.split()
    if task not in ("lemma", "token", "Dis", "Entity")
])

## First test: CNN vs RNN


In [35]:
import copy

CNN = copy.deepcopy(data)
RNN = copy.deepcopy(data)

CNN["modelname"] = "cemb_CNN_-"
RNN["modelname"] = "cemb_RNN_-"

with open("./configs/1.4.4.a-CNN_vs_RNN-CNN.json", "w") as f:
    json.dump(CNN, f)

with open("./configs/1.4.4.a-CNN_vs_RNN-RNN.json", "w") as f:
    json.dump(RNN, f)