# Objectif

L'objectif de ce notebook est de créer les configurations à évaluer.

On séparera les études de configuration de lemmatisation et de tâches morpho-syntaxiques.

## Configurations Lemmatisation

Options ayant un impact:

- CEMB DIM
- CEMB TYPE
- CEMB LAYERS
- LEARNING RATES

```json
{
    // Output information
    "modelname": "latest-lasla-lat",
    "modelpath": "./models/",
    // No serialization
    "run_test": false,
    // Input data information
    "input_path": "./mood-tense-voice-pft/train.tsv",
    "test_path": "./mood-tense-voice-pft/test.tsv",
    "dev_path": "./mood-tense-voice-pft/dev.tsv",
    "header": true,
    "sep": "\\t",
    "breakline_ref": "pos", // Not used here because empty lines mark sequence changes
    "breakline_data": "NONE", // Not used here
    // Input metrics
    "char_max_size": 500,
    "word_max_size": 28000,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    // Use EOS and BOS
    "char_eos": true,
    "char_bos": true,
    // Tasks
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    
    // Language Model Information
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "batch_size": 128,
    "epochs": 100,
    "dropout": 0.25,
    "word_dropout": 0,
    // Learning rate update
    "lr": 0.001,
    "lr_factor": 0.5,
    "lr_patience": 10,
    "optimizer": "Adam",
    "clip_norm": 5,
    
    // Tâches linéaires
    "linear_layers": 1,
    // Tâches linéaires: Encodeur
    "hidden_size": 128,
    "num_layers": 1,
    "cell": "GRU",
    // Tâches linéaires: Word Embedding et Mixer
    "wemb_dim": 100,
    "merge_type": "concat",
    // Lemmatisation : Decoder et CEMB
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "decoder_layers": 3, // Would be nice ?
    "custom_cemb_cell": false,
    // Training options
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,  // Sentence in memory
    "minimize_pad": false,
    "shuffle": true,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "scorer": "general"
}
```

In [1]:
import json

data = json.loads(
    "\n".join(
        [
            line.split("//")[0]
            for line in """{
    // Output information
    "modelname": "latest-lasla-lat",
    "modelpath": "./models/",
    // Run information
    "run_test": false,
    // Input data information
    "input_path": "./mood-tense-voice-pft/train.tsv",
    "test_path": "./mood-tense-voice-pft/test.tsv",
    "dev_path": "./mood-tense-voice-pft/dev.tsv",
    "header": true,
    "sep": "\\t",
    "breakline_ref": "pos", // Not used here because empty lines mark sequence changes
    "breakline_data": "NONE", // Not used here
    // Input metrics
    "char_max_size": 500,
    "word_max_size": 28000,
    "max_sent_len": 35,
    "max_sents": 1000000,
    "char_min_freq": 1,
    "word_min_freq": 1,
    // Use EOS and BOS
    "char_eos": true,
    "char_bos": true,
    // Tasks
    "tasks": [
        {
            "name": "lemma",
            "target": true,
            "context": "sentence",
            "level": "char",
            "decoder": "attentional",
            "settings": {
                "bos": true,
                "eos": true,
                "lower": true,
                "target": "lemma"
            },
            "layer": -1
        }
    ],
    "task_defaults": {
        "level": "token",
        "layer": -1,
        "decoder": "linear",
        "context": "sentence"
    },
    
    "patience": 8,
    "factor": 0.5,
    "threshold": 0.0001,
    "min_weight": 0.2,
    
    // Language Model Information
    "include_lm": true,
    "lm_shared_softmax": true,
    "lm_schedule": {
        "patience": 2,
        "factor": 0.5,
        "weight": 0.2,
        "mode": "min"
    },
    "batch_size": 128,
    "epochs": 100,
    "dropout": 0.25,
    "word_dropout": 0,
    // Learning rate update
    "lr": 0.001,
    "lr_patience": 10,
    "optimizer": "Adam",
    "clip_norm": 5,
    
    // Tache linéaires
    "linear_layers": 1,
    // Tache linéaires: Encodeur
    "hidden_size": 128,
    "num_layers": 1,
    "cell": "GRU",
    // Taches linéaires: Word Embedding et Mixer
    "wemb_dim": 100,
    "merge_type": "concat",
    // Lemmatisation : Decoder et CEMB
    "cemb_dim": 300,
    "cemb_type": "rnn",
    "cemb_layers": 2,
    "decoder_layers": 3, // Would be nice ?
    "custom_cemb_cell": false,
    // Training options
    "checks_per_epoch": 1,
    "report_freq": 200,
    "verbose": true,
    "device": "cuda",
    "buffer_size": 10000,  // Sentence in memory
    "minimize_pad": false,
    "shuffle": true,
    "pretrain_embeddings": false,
    "load_pretrained_embeddings": "",
    "load_pretrained_encoder": "",
    "freeze_embeddings": false,
    "scorer": "general"
}""".split("\n") 
                             if not line.strip().startswith("//") and line.strip()
                            ]
                           )
                 )  # json.loads

In [2]:
# Add tasks
# Every column named in the header line of the data file becomes a task entry
# carrying only its name, except for the columns listed in `skipped`.
# NOTE(review): this path is absolute and machine-specific, while the config
# itself points at "./mood-tense-voice-pft/test.tsv" -- confirm which is intended.
with open("/home/thibault/dev/LASLA/mood-tense-voice-pft/test.tsv") as f:
    header = f.readline()

# Fail with a clear message instead of a NameError (or a stale value from a
# previous run of this cell) when the file has no header line.
if not header.strip():
    raise ValueError("test.tsv has no header line to read task names from")

# Split on the separator declared in the config rather than on any whitespace,
# so that a column name containing a space is not cut in two.
columns = [column.strip() for column in header.rstrip("\r\n").split(data["sep"])]

skipped = ("lemma", "token", "Dis", "Entity")
# Names already registered are skipped so re-running the cell does not append
# the same tasks a second time.
known_tasks = {task["name"] for task in data["tasks"]}
data["tasks"].extend([
    {"name": column}
    for column in columns
    if column and column not in skipped and column not in known_tasks
])

## Test 01: CNN vs RNN


In [3]:
import copy

# Test 01: two otherwise identical configurations that differ only in the
# character-embedding type ("cnn" vs the "rnn" already set in `data`).
patience = copy.deepcopy(data)  # not read in this cell; rebound by the later patience cells

RNN = copy.deepcopy(data)
RNN["modelname"] = "cemb_RNN_-"

CNN = copy.deepcopy(data)
CNN.update({"modelname": "cemb_CNN_-", "cemb_type": "cnn"})

# Write the CNN config first, then the RNN one.
for config, path in (
    (CNN, "./configs/1.4.4.a-CNN_vs_RNN-CNN.json"),
    (RNN, "./configs/1.4.4.a-CNN_vs_RNN-RNN.json"),
):
    with open(path, "w") as f:
        json.dump(config, f)

## Test 02: Different Patience

In [4]:

import copy

# Test 02: the same CNN/RNN pair, with `patience` set to 5 instead of the
# value 8 found in the base configuration.
patience = {**copy.deepcopy(data), "patience": 5}

RNN = copy.deepcopy(patience)
RNN["modelname"] = "cemb_RNN_-patience_5_"

CNN = copy.deepcopy(patience)
CNN.update({"modelname": "cemb_CNN_-patience_5_", "cemb_type": "cnn"})

# Write the CNN config first, then the RNN one.
for config, path in (
    (CNN, "./configs/1.4.4.a-CNN_vs_RNN-CNN-patience.json"),
    (RNN, "./configs/1.4.4.a-CNN_vs_RNN-RNN-patience.json"),
):
    with open(path, "w") as f:
        json.dump(config, f)

## Test 03: Smaller LR Patience (lr_patience=4, patience=7)

In [5]:

import copy

# Test 03: the CNN/RNN pair with `lr_patience` set to 4 and `patience` set
# to 7 (the base configuration uses 10 and 8 respectively).
patience = copy.deepcopy(data)
patience["lr_patience"] = 4
patience["patience"] = 7

CNN = copy.deepcopy(patience)
RNN = copy.deepcopy(patience)

CNN["modelname"] = "cemb_CNN_-lrpatience_4;7_"
CNN["cemb_type"] = "cnn"
# The label previously read "lrpatience_5;7" although both configs share
# lr_patience == 4; it now matches the values actually written to the file.
RNN["modelname"] = "cemb_RNN_-lrpatience_4;7_"

with open("./configs/1.4.4.a-CNN_vs_RNN-CNN-lrpatience.json", "w") as f:
    json.dump(CNN, f)

with open("./configs/1.4.4.a-CNN_vs_RNN-RNN-lrpatience.json", "w") as f:
    json.dump(RNN, f)

## Test 04: No Wemb

In [6]:
import copy

# Test 04: no word embedding (wemb_dim == 0) with a 20000-word vocabulary cap.
# The configs are built from `data` with the patience values named in the
# model labels (lr_patience 4, patience 7) instead of copying whichever
# CNN/RNN an earlier cell left in the kernel, so running cells out of order
# can no longer produce a file whose label disagrees with its content.
no_wemb = copy.deepcopy(data)
no_wemb.update({
    "lr_patience": 4,
    "patience": 7,
    "word_max_size": 20000,
    "wemb_dim": 0,
})

CNN = copy.deepcopy(no_wemb)
RNN = copy.deepcopy(no_wemb)

CNN["modelname"] = "cemb_CNN_-wemb_none_-lrpatience_4;7_"
CNN["cemb_type"] = "cnn"
# The label previously read "lrpatience_5;7" although lr_patience is 4 for
# both configs; it now matches the CNN label and the written values.
RNN["modelname"] = "cemb_RNN_-wemb_none_-lrpatience_4;7_"

with open("./configs/1.4.4.a-CNN_vs_RNN-CNN-nowemb.json", "w") as f:
    json.dump(CNN, f)

with open("./configs/1.4.4.a-CNN_vs_RNN-RNN-nowemb.json", "w") as f:
    json.dump(RNN, f)

## No Wemb vs Wemb + Plus Grand Encodeur

In [7]:
import copy

import pprint

# Shared base for this comparison: the base configuration with lr_patience 4,
# patience 7 and a 20000-word vocabulary cap. All variants below use the RNN
# character embedding inherited from `data` and a hidden size of 256.
Patience4_7 = copy.deepcopy(data)
Patience4_7["lr_patience"] = 4
Patience4_7["patience"] = 7
Patience4_7["word_max_size"] = 20000

# No word embedding.
RNN_NoWemb = copy.deepcopy(Patience4_7)
RNN_NoWemb.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256",
    hidden_size=256,
)
# Print one full configuration as a visual check of the overrides.
pprint.pprint(RNN_NoWemb)

# No word embedding, two encoder layers.
RNN_NoWemb_L2 = copy.deepcopy(Patience4_7)
RNN_NoWemb_L2.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256-layers_2",
    hidden_size=256,
    num_layers=2,
)

# No word embedding, character embedding of size 400.
# NOTE(review): the variable name and the model label say "L2"/"layers_2",
# but num_layers is not overridden here, so it keeps the value from `data`.
RNN_NoWemb_L2_CEMB_400 = copy.deepcopy(Patience4_7)
RNN_NoWemb_L2_CEMB_400.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256-layers_2-cemb_400",
    hidden_size=256,
    cemb_dim=400,
)

# Word embedding of size 100, words kept from a minimum frequency of 5.
RNN_Wemb = copy.deepcopy(Patience4_7)
RNN_Wemb.update(
    wemb_dim=100,
    word_min_freq=5,
    modelname="cemb_RNN_-wemb_100_min_5-lrpatience_4_7_-hidden-256",
    hidden_size=256,
)

# Word embedding of size 200, same minimum frequency.
RNN_Wemb_200 = copy.deepcopy(Patience4_7)
RNN_Wemb_200.update(
    wemb_dim=200,
    word_min_freq=5,
    modelname="cemb_RNN_-wemb_200_min_5_-lrpatience_4_7_-hidden-256",
    hidden_size=256,
)

# One JSON file per variant, written in the order the variants are defined.
for config, path in (
    (RNN_NoWemb, "./configs/1.4.4.a-RNN-Wemb_vs_no_wemb-nowemb-hidden_256.json"),
    (RNN_NoWemb_L2, "./configs/1.4.4.a-RNN-Wemb_vs_no_wemb-nowemb-hidden_256-layers_2.json"),
    (RNN_NoWemb_L2_CEMB_400, "./configs/1.4.4.a-RNN-Wemb_vs_no_wemb-nowemb-hidden_256-cemb_400.json"),
    (RNN_Wemb, "./configs/1.4.4.a-RNN-Wemb_vs_no_wemb-wemb-hidden_256.json"),
    (RNN_Wemb_200, "./configs/1.4.4.a-RNN-Wemb_vs_no_wemb-wemb_200-hidden_256.json"),
):
    with open(path, "w") as f:
        json.dump(config, f)

{'batch_size': 128,
 'breakline_data': 'NONE',
 'breakline_ref': 'pos',
 'buffer_size': 10000,
 'cell': 'GRU',
 'cemb_dim': 300,
 'cemb_layers': 2,
 'cemb_type': 'rnn',
 'char_bos': True,
 'char_eos': True,
 'char_max_size': 500,
 'char_min_freq': 1,
 'checks_per_epoch': 1,
 'clip_norm': 5,
 'custom_cemb_cell': False,
 'decoder_layers': 3,
 'dev_path': './mood-tense-voice-pft/dev.tsv',
 'device': 'cuda',
 'dropout': 0.25,
 'epochs': 100,
 'factor': 0.5,
 'freeze_embeddings': False,
 'header': True,
 'hidden_size': 256,
 'include_lm': True,
 'input_path': './mood-tense-voice-pft/train.tsv',
 'linear_layers': 1,
 'lm_schedule': {'factor': 0.5, 'mode': 'min', 'patience': 2, 'weight': 0.2},
 'lm_shared_softmax': True,
 'load_pretrained_embeddings': '',
 'load_pretrained_encoder': '',
 'lr': 0.001,
 'lr_patience': 4,
 'max_sent_len': 35,
 'max_sents': 1000000,
 'merge_type': 'concat',
 'min_weight': 0.2,
 'minimize_pad': False,
 'modelname': 'cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256'

In [11]:
# Follow-up variants without word embedding. Each one starts as a deep copy of
# RNN_NoWemb_L2_CEMB_400 and overrides the hidden size, the character
# embedding size and, for the "RealL2" ones, the number of encoder layers.

# Character embedding 500, hidden size 256.
RNN_NoWemb_Cemb500 = copy.deepcopy(RNN_NoWemb_L2_CEMB_400)
RNN_NoWemb_Cemb500.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256-layers_1-cemb_500",
    hidden_size=256,
    cemb_dim=500,
)

# Character embedding 400, hidden size 384.
RNN_NoWemb_Cemb400_Hid384 = copy.deepcopy(RNN_NoWemb_L2_CEMB_400)
RNN_NoWemb_Cemb400_Hid384.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-384-layers_1-cemb_400",
    hidden_size=384,
    cemb_dim=400,
)

# Character embedding 500, hidden size 384.
RNN_NoWemb_Cemb500_Hid384 = copy.deepcopy(RNN_NoWemb_L2_CEMB_400)
RNN_NoWemb_Cemb500_Hid384.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-384-layers_1-cemb_500",
    hidden_size=384,
    cemb_dim=500,
)

# Character embedding 400, hidden size 256, num_layers explicitly set to 2.
RNN_NoWemb_RealL2_CEMB_400 = copy.deepcopy(RNN_NoWemb_L2_CEMB_400)
RNN_NoWemb_RealL2_CEMB_400.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-256-real-layers_2-cemb_400",
    hidden_size=256,
    cemb_dim=400,
    num_layers=2,
)

# Character embedding 400, hidden size 384, num_layers explicitly set to 2.
RNN_NoWemb_RealL2_CEMB_400_Hid384 = copy.deepcopy(RNN_NoWemb_L2_CEMB_400)
RNN_NoWemb_RealL2_CEMB_400_Hid384.update(
    wemb_dim=0,
    modelname="cemb_RNN_-wemb_none_-lrpatience_4_7_-hidden-384-real-layers_2-cemb_400",
    hidden_size=384,
    cemb_dim=400,
    num_layers=2,
)

configs = [
    RNN_NoWemb_Cemb500,
    RNN_NoWemb_Cemb400_Hid384,
    RNN_NoWemb_Cemb500_Hid384,
    RNN_NoWemb_RealL2_CEMB_400,
    RNN_NoWemb_RealL2_CEMB_400_Hid384,
]

# Each config is written to a file named after its own model label.
for config in configs:
    out_path = f"./configs/1.4.4.a-RNN-{config['modelname']}.json"
    with open(out_path, "w") as f:
        json.dump(config, f)