In [1]:
import os

# Move from the notebook's folder to its parent: every later cell addresses
# 'joeynmt/configs/...', 'joeynmt/joeynmt' and 'data/...' relative to that directory.
# The guard makes the cell idempotent -- re-running it in a live kernel would otherwise
# climb one more level each time and silently break all of those relative paths.
if not os.path.isdir(os.path.join('joeynmt', 'configs')):
    os.chdir('../')

In [2]:
# YAML template for one transfer-learning run; rendered with str.format() by the sweep cells.
# Placeholders: lang_src, lang_tgt, train_path, dev_path, test_path, level,
#               pretrained_lang_src, pretrained_lang_tgt, pretrained_train_path,
#               pretrained_dev_path, pretrained_test_path, val_freq, model_dir,
#               emb_size, hidden_size.
# The language-code placeholders are double-quoted so that codes which YAML 1.1 treats as
# booleans (e.g. "no") still reach the trainer as strings. Path placeholders stay unquoted on
# purpose: inside double quotes a backslash in a path would be read as an escape sequence.
config = """
name: "my_experiment"

# This configuration serves the purpose of documenting and explaining the settings, *NOT* as an example for good hyperparamter settings.

data: # specify your data here
    src: "{lang_src}"                     # src language: expected suffix of train files, e.g. "train.de"
    trg: "{lang_tgt}"                     # trg language
    train: {train_path}     # training data
    dev: {dev_path}         # development data for validation
    test: {test_path}       # test data for testing final model; optional
    level: {level}                  # segmentation level: either "word", "bpe" or "char"
    lowercase: True                 # lowercase the data, also for validation
    max_sent_length: 130             # filter out longer sentences from training (src+trg)
    src_voc_min_freq: 1             # src minimum frequency for a token to become part of the vocabulary
    trg_voc_min_freq: 1             # trg minimum frequency for a token to become part of the vocabulary
    #src_vocab: "my_model/src_vocab.txt"  # if specified, load a vocabulary from this file
    #trg_vocab: "my_model/trg_vocab.txt"  # one token per line, line number is index

pretrained_data: # specify your data here
    src: "{pretrained_lang_src}"                     # src language: expected suffix of train files, e.g. "train.de"
    trg: "{pretrained_lang_tgt}"                     # trg language
    train: {pretrained_train_path}     # training data
    dev: {pretrained_dev_path}         # development data for validation
    test: {pretrained_test_path}       # test data for testing final model; optional
    level: {level}                  # segmentation level: either "word", "bpe" or "char"
    lowercase: True                 # lowercase the data, also for validation
    max_sent_length: 150             # filter out longer sentences from training (src+trg)
    src_voc_min_freq: 1             # src minimum frequency for a token to become part of the vocabulary
    trg_voc_min_freq: 1             # trg minimum frequency for a token to become part of the vocabulary
    #src_vocab: "my_model/src_vocab.txt"  # if specified, load a vocabulary from this file
    #trg_vocab: "my_model/trg_vocab.txt"  # one token per line, line number is index

testing:                            # specify which inference algorithm to use for testing (for validation it's always greedy decoding)
    beam_size: 5                    # size of the beam for beam search
    alpha: 1.0                      # length penalty for beam search

training:                           # specify training details here
    reset_best_ckpt: False          # if True, reset the tracking of the best checkpoint and scores. Use for domain adaptation or fine-tuning with new metrics or dev data.
    reset_scheduler: False          # if True, overwrite scheduler in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    reset_optimizer: False          # if True, overwrite optimizer in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    random_seed: 42                 # set this seed to make training deterministic
    optimizer: "adam"               # choices: "sgd", "adam", "adadelta", "adagrad", "rmsprop", default is SGD
    learning_rate: 0.0002          # initial learning rate, default: 3.0e-4 / 0.005
    learning_rate_min: 0.0001       # stop learning when learning rate is reduced below this threshold, default: 1.0e-8
    #learning_rate_factor: 1        # factor for Noam scheduler (used with Transformer)
    #learning_rate_warmup: 4000     # warmup steps for Noam scheduler (used with Transformer)
    clip_grad_val: 1.0              # clip the gradients to this value when they exceed it, optional
    #clip_grad_norm: 1.0            # norm clipping instead of value clipping
    weight_decay: 0.                # l2 regularization, default: 0
    batch_size: 48                  # mini-batch size as number of sentences (when batch_type is "sentence"; default) or total number of tokens (when batch_type is "token")
    batch_type: "sentence"          # create batches with sentences ("sentence", default) or tokens ("token")
    eval_batch_size: 10            # mini-batch size for evaluation (see batch_size above)
    eval_batch_type: "sentence"     # evaluation batch type ("sentence", default) or tokens ("token")
    batch_multiplier: 1             # increase the effective batch size with values >1 to batch_multiplier*batch_size without increasing memory consumption by making updates only every batch_multiplier batches
    scheduling: "plateau"           # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay"
    patience: 600                     # specific to plateau scheduler: wait for this many validations without improvement before decreasing the learning rate
    decrease_factor: 0.5            # specific to plateau & exponential scheduler: decrease the learning rate by this factor
    epochs: 20                      # train for this many epochs
    validation_freq: {val_freq}            # validate after this many updates (number of mini-batches), default: 1000
    logging_freq: 1000               # log the training progress after this many updates, default: 100
    eval_metric: "bleu"             # validation metric, default: "bleu", other options: "chrf", "token_accuracy", "sequence_accuracy"
    early_stopping_metric: "eval_metric"   # when a new high score on this metric is achieved, a checkpoint is written, when "eval_metric" (default) is maximized, when "loss" or "ppl" is minimized
    model_dir: {model_dir} # directory where models and validation results are stored, required
    overwrite: True                 # overwrite existing model directory, default: False. Do not set to True unless for debugging!
    shuffle: True                   # shuffle the training data, default: True
    use_cuda: True                  # use CUDA for acceleration on GPU, required. Set to False when working on CPU.
    max_output_length: 60           # maximum output length for decoding, default: None. If set to None, allow sentences of max 1.5*src length
    print_valid_sents: []    # print this many validation sentences during each validation run, default: [0, 1, 2]
    keep_last_ckpts: 3              # keep this many of the latest checkpoints, if -1: all of them, default: 5
    label_smoothing: 0.0            # label smoothing: reference tokens will have 1-label_smoothing probability instead of 1, rest of probability mass is uniformly distributed over the rest of the vocabulary, default: 0.0 (off)

pretraining:                           # specify training details here
    reset_best_ckpt: False          # if True, reset the tracking of the best checkpoint and scores. Use for domain adaptation or fine-tuning with new metrics or dev data.
    reset_scheduler: False          # if True, overwrite scheduler in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    reset_optimizer: False          # if True, overwrite optimizer in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    random_seed: 42                 # set this seed to make training deterministic
    optimizer: "adam"               # choices: "sgd", "adam", "adadelta", "adagrad", "rmsprop", default is SGD
    learning_rate: 0.0004           # initial learning rate, default: 3.0e-4 / 0.005
    learning_rate_min: 0.00001       # stop learning when learning rate is reduced below this threshold, default: 1.0e-8
    #learning_rate_factor: 1        # factor for Noam scheduler (used with Transformer)
    #learning_rate_warmup: 4000     # warmup steps for Noam scheduler (used with Transformer)
    clip_grad_val: 1.0              # clip the gradients to this value when they exceed it, optional
    #clip_grad_norm: 1.0            # norm clipping instead of value clipping
    weight_decay: 0.                # l2 regularization, default: 0
    batch_size: 48                  # mini-batch size as number of sentences (when batch_type is "sentence"; default) or total number of tokens (when batch_type is "token")
    batch_type: "sentence"          # create batches with sentences ("sentence", default) or tokens ("token")
    eval_batch_size: 10            # mini-batch size for evaluation (see batch_size above)
    eval_batch_type: "sentence"     # evaluation batch type ("sentence", default) or tokens ("token")
    batch_multiplier: 1             # increase the effective batch size with values >1 to batch_multiplier*batch_size without increasing memory consumption by making updates only every batch_multiplier batches
    scheduling: "plateau"           # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay"
    patience: 600                     # specific to plateau scheduler: wait for this many validations without improvement before decreasing the learning rate
    decrease_factor: 0.5            # specific to plateau & exponential scheduler: decrease the learning rate by this factor
    epochs: 25                      # train for this many epochs
    validation_freq: {val_freq}            # validate after this many updates (number of mini-batches), default: 1000
    logging_freq: 1000               # log the training progress after this many updates, default: 100
    eval_metric: "bleu"             # validation metric, default: "bleu", other options: "chrf", "token_accuracy", "sequence_accuracy"
    early_stopping_metric: "eval_metric"   # when a new high score on this metric is achieved, a checkpoint is written, when "eval_metric" (default) is maximized, when "loss" or "ppl" is minimized
    model_dir: {model_dir} # directory where models and validation results are stored, required
    overwrite: True                 # overwrite existing model directory, default: False. Do not set to True unless for debugging!
    shuffle: True                   # shuffle the training data, default: True
    use_cuda: True                  # use CUDA for acceleration on GPU, required. Set to False when working on CPU.
    max_output_length: 60           # maximum output length for decoding, default: None. If set to None, allow sentences of max 1.5*src length
    print_valid_sents: []    # print this many validation sentences during each validation run, default: [0, 1, 2]
    keep_last_ckpts: 3              # keep this many of the latest checkpoints, if -1: all of them, default: 5
    label_smoothing: 0.0            # label smoothing: reference tokens will have 1-label_smoothing probability instead of 1, rest of probability mass is uniformly distributed over the rest of the vocabulary, default: 0.0 (off)

model:                              # specify your model architecture here
    initializer: "xavier"           # initializer for all trainable weights (xavier, zeros, normal, uniform)
    init_weight: 0.01               # weight to initialize; for uniform, will use [-weight, weight]
    init_gain: 1.0                  # gain for Xavier initializer (default: 1.0)
    bias_initializer: "zeros"       # initializer for bias terms (xavier, zeros, normal, uniform)
    embed_initializer: "normal"     # initializer for embeddings (xavier, zeros, normal, uniform)
    embed_init_weight: 0.1          # weight to initialize; for uniform, will use [-weight, weight]
    embed_init_gain: 1.0            # gain for Xavier initializer for embeddings (default: 1.0)
    init_rnn_orthogonal: False      # use orthogonal initialization for recurrent weights (default: False)
    lstm_forget_gate: 1.            # initialize LSTM forget gate with this value (default: 1.)
    tied_embeddings: False           # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
    tied_softmax: False             # tie trg embeddings and softmax (for Transformer; can be used together with tied_embeddings), default: False
    encoder:
        type: "recurrent"           # encoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
        rnn_type: "gru"             # type of recurrent unit to use, either "gru" or "lstm", default: "lstm"
        embeddings:
            embedding_dim: {emb_size}      # size of embeddings
            scale: False            # scale the embeddings by sqrt of their size, default: False
            freeze: False           # if True, embeddings are not updated during training
        hidden_size: {hidden_size}            # size of RNN
        bidirectional: True         # use a bi-directional encoder, default: True
        dropout: 0.3                # apply dropout to the inputs to the RNN, default: 0.0
        num_layers: 2               # stack this many layers of equal size, default: 1
        freeze: False               # if True, encoder parameters are not updated during training (does not include embedding parameters)
    decoder:
        type: "recurrent"           # decoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
        rnn_type: "gru"
        embeddings:
            embedding_dim: {emb_size}
            scale: False
            freeze: False           # if True, embeddings are not updated during training
        hidden_size: {hidden_size}
        dropout: 0.3
        hidden_dropout: 0.2         # apply dropout to the attention vector, default: 0.0
        num_layers: 2
        input_feeding: True         # combine hidden state and attention vector before feeding to rnn, default: True
        init_hidden: "last"         # initialized the decoder hidden state: use linear projection of last encoder state ("bridge") or simply the last state ("last") or zeros ("zero"), default: "bridge"
        attention: "bahdanau"       # attention mechanism, choices: "bahdanau" (MLP attention), "luong" (bilinear attention), default: "bahdanau"
        freeze: False               # if True, decoder parameters are not updated during training (does not include embedding parameters, but attention)
"""

In [3]:
# Path of the YAML file used for the generated "transfer" configuration.
cfg_file = "joeynmt/configs/sample_{name}.yaml".format_map({"name": "transfer"})

In [None]:
from pathlib import Path
from joeynmt.training import train_transfer

emb_size = 300
hidden_size = 512
base_dir = Path('data/transfer/preprocessed/')
for base_lang in ['splits.en']:
    #tr tk bn mr lt
    base_lang_dir = base_dir / base_lang
    for lang in os.listdir(base_lang_dir):
        if 'shp' not in lang:
            if 'tr' in lang\
                or 'tk' in lang\
                or 'bn' in lang\
                or 'mr' in lang\
                or 'lt' in lang:
                lang_dir = base_lang_dir / lang
                for segment in os.listdir(lang_dir):
                    segment_dir = lang_dir / segment

                    pretrained_lang_src, pretrained_lang_tgt = lang.split('-')
                    lang_src = pretrained_lang_src
                    lang_tgt = 'shp'

                    training_dir = Path(str(segment_dir).replace(pretrained_lang_tgt, 'shp'))

                    if 'bpe_drop' in segment:
                        level = 'bpe'
                    elif 'bpe' in segment:
                        level = 'bpe'
                    elif 'char' in segment:
                        level = 'char'
                    elif 'word' in segment:
                        level = 'word'
                    elif 'syl' in segment:
                        level = 'syl'
                    else:
                        level = None         

                    f_config = config.format(lang_src=lang_src, lang_tgt=lang_tgt, \
                        train_path=os.path.join(training_dir, 'train'),\
                        test_path=os.path.join(training_dir, 'test'),\
                        dev_path=os.path.join(training_dir, 'valid'),\
                        pretrained_lang_src=pretrained_lang_src,\
                        pretrained_lang_tgt=pretrained_lang_tgt,\
                        pretrained_train_path=os.path.join(segment_dir, 'train'),\
                        pretrained_test_path=os.path.join(segment_dir, 'test'),\
                        pretrained_dev_path=os.path.join(segment_dir, 'valid'),\
                        level=level,\
                        emb_size=emb_size,\
                        hidden_size=hidden_size,\
                        val_freq=10,\
                        model_dir=os.path.join('results/rnn/transfer_top', 'splits.en',\
                                            f'{pretrained_lang_src}-{pretrained_lang_tgt}_{emb_size}_{hidden_size}', segment))

                    with open("joeynmt/configs/sample_{name}.yaml".format(name="transfer"),'w') as f:
                        f.write(f_config)

                    !python3 joeynmt/joeynmt train_transfer "joeynmt/configs/sample_transfer.yaml"

2020-03-04 00:17:20,551 Hello! This is Joey-NMT.
2020-03-04 00:17:21.314911: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-03-04 00:17:21.314978: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-03-04 00:17:21.314988: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-03-04 00:17:21,968 Total params: 12477124
2020-03-04 00:17:21,969 Trainable parameters: ['decoder.att_vector_layer.bias', 'decoder.att_vector_layer.weight', 'decoder.attention.energy_layer.weight', 'decoder.attention.key_layer.

In [None]:
from pathlib import Path
from joeynmt.training import train_transfer

emb_size = 300
hidden_size = 512
base_dir = Path('data/transfer/preprocessed/')
for base_lang in ['splits.es']:
    #tr tk bn mr lt
    base_lang_dir = base_dir / base_lang
    for lang in os.listdir(base_lang_dir):
        if 'shp' not in lang:
            if 'tr' in lang\
                or 'hu' in lang\
                or 'fi' in lang\
                or 'ru' in lang\
                or 'pl' in lang:
                lang_dir = base_lang_dir / lang
                for segment in os.listdir(lang_dir):
                    segment_dir = lang_dir / segment

                    pretrained_lang_src, pretrained_lang_tgt = lang.split('-')
                    lang_src = pretrained_lang_src
                    lang_tgt = 'shp'

                    training_dir = Path(str(segment_dir).replace(pretrained_lang_tgt, 'shp'))

                    if 'bpe_drop' in segment:
                        level = 'bpe'
                    elif 'bpe' in segment:
                        level = 'bpe'
                    elif 'char' in segment:
                        level = 'char'
                    elif 'word' in segment:
                        level = 'word'
                    elif 'syl' in segment:
                        level = 'syl'
                    else:
                        level = None         

                    f_config = config.format(lang_src=lang_src, lang_tgt=lang_tgt, \
                        train_path=os.path.join(training_dir, 'train'),\
                        test_path=os.path.join(training_dir, 'test'),\
                        dev_path=os.path.join(training_dir, 'valid'),\
                        pretrained_lang_src=pretrained_lang_src,\
                        pretrained_lang_tgt=pretrained_lang_tgt,\
                        pretrained_train_path=os.path.join(segment_dir, 'train'),\
                        pretrained_test_path=os.path.join(segment_dir, 'test'),\
                        pretrained_dev_path=os.path.join(segment_dir, 'valid'),\
                        level=level,\
                        emb_size=emb_size,\
                        hidden_size=hidden_size,\
                        val_freq=10,\
                        model_dir=os.path.join('results/rnn/transfer_top', 'splits.es',\
                                            f'{pretrained_lang_src}-{pretrained_lang_tgt}_{emb_size}_{hidden_size}', segment))

                    with open("joeynmt/configs/sample_{name}_es.yaml".format(name="transfer"),'w') as f:
                        f.write(f_config)

                    !python3 joeynmt/joeynmt train_transfer "joeynmt/configs/sample_transfer_es.yaml"

In [None]:
# Scratch cell: writes the number 8 to the cell output.
# NOTE(review): looks like a leftover kernel check -- confirm whether it can be removed.
print(8, end="\n")