In [1]:
import os
os.chdir('../')

In [2]:
!find . -name .ipynb* -exec rm -rf {} \;

find: ‘./data/translate/raw/Religioso/.ipynb_checkpoints’: No such file or directory
find: ‘./data/translate/preprocessed/Religioso/char/.ipynb_checkpoints’: No such file or directory
find: ‘./nb/.ipynb_checkpoints’: No such file or directory
find: ‘./src/.ipynb_checkpoints’: No such file or directory
find: ‘./results/.ipynb_checkpoints’: No such file or directory
find: ‘./joeynmt/joeynmt/.ipynb_checkpoints’: No such file or directory


In [3]:
config = """
name: "my_experiment"

# This configuration serves the purpose of documenting and explaining the settings, *NOT* as an example for good hyperparamter settings.

data: # specify your data here
    src: {lang_src}                       # src language: expected suffix of train files, e.g. "train.de"
    trg: {lang_tgt}                       # trg language
    train: {train_path}     # training data
    dev: {dev_path}         # development data for validation
    test: {test_path}       # test data for testing final model; optional
    level: {level}                  # segmentation level: either "word", "bpe" or "char"
    lowercase: True                 # lowercase the data, also for validation
    max_sent_length: 150             # filter out longer sentences from training (src+trg)
    src_voc_min_freq: 1             # src minimum frequency for a token to become part of the vocabulary
    trg_voc_min_freq: 1             # trg minimum frequency for a token to become part of the vocabulary
    #src_vocab: "my_model/src_vocab.txt"  # if specified, load a vocabulary from this file
    #trg_vocab: "my_model/trg_vocab.txt"  # one token per line, line number is index

testing:                            # specify which inference algorithm to use for testing (for validation it's always greedy decoding)
    beam_size: 5                    # size of the beam for beam search
    alpha: 1.0                      # length penalty for beam search

training:                           # specify training details here
    reset_best_ckpt: False          # if True, reset the tracking of the best checkpoint and scores. Use for domain adaptation or fine-tuning with new metrics or dev data.
    reset_scheduler: False          # if True, overwrite scheduler in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    reset_optimizer: False          # if True, overwrite optimizer in loaded checkpoint with parameters specified in this config. Use for domain adaptation or fine-tuning.
    random_seed: 42                 # set this seed to make training deterministic
    optimizer: "adam"               # choices: "sgd", "adam", "adadelta", "adagrad", "rmsprop", default is SGD
    learning_rate: 0.0005           # initial learning rate, default: 3.0e-4 / 0.005
    learning_rate_min: 0.0001       # stop learning when learning rate is reduced below this threshold, default: 1.0e-8
    #learning_rate_factor: 1        # factor for Noam scheduler (used with Transformer)
    #learning_rate_warmup: 4000     # warmup steps for Noam scheduler (used with Transformer)
    clip_grad_val: 1.0              # clip the gradients to this value when they exceed it, optional
    #clip_grad_norm: 1.0            # norm clipping instead of value clipping
    weight_decay: 0.                # l2 regularization, default: 0
    batch_size: 48                  # mini-batch size as number of sentences (when batch_type is "sentence"; default) or total number of tokens (when batch_type is "token")
    batch_type: "sentence"          # create batches with sentences ("sentence", default) or tokens ("token")
    eval_batch_size: 10            # mini-batch size for evaluation (see batch_size above)
    eval_batch_type: "sentence"     # evaluation batch type ("sentence", default) or tokens ("token")
    batch_multiplier: 1             # increase the effective batch size with values >1 to batch_multiplier*batch_size without increasing memory consumption by making updates only every batch_multiplier batches
    scheduling: "plateau"           # learning rate scheduling, optional, if not specified stays constant, options: "plateau", "exponential", "decaying", "noam" (for Transformer), "warmupexponentialdecay"
    patience: 500                     # specific to plateau scheduler: wait for this many validations without improvement before decreasing the learning rate
    decrease_factor: 0.5            # specific to plateau & exponential scheduler: decrease the learning rate by this factor
    epochs: 30                      # train for this many epochs
    validation_freq: {val_freq}            # validate after this many updates (number of mini-batches), default: 1000
    logging_freq: 1000               # log the training progress after this many updates, default: 100
    eval_metric: "bleu"             # validation metric, default: "bleu", other options: "chrf", "token_accuracy", "sequence_accuracy"
    early_stopping_metric: "eval_metric"   # when a new high score on this metric is achieved, a checkpoint is written, when "eval_metric" (default) is maximized, when "loss" or "ppl" is minimized
    model_dir: {model_dir} # directory where models and validation results are stored, required
    overwrite: True                 # overwrite existing model directory, default: False. Do not set to True unless for debugging!
    shuffle: True                   # shuffle the training data, default: True
    use_cuda: True                  # use CUDA for acceleration on GPU, required. Set to False when working on CPU.
    max_output_length: 60           # maximum output length for decoding, default: None. If set to None, allow sentences of max 1.5*src length
    print_valid_sents: []    # print this many validation sentences during each validation run, default: [0, 1, 2]
    keep_last_ckpts: 3              # keep this many of the latest checkpoints, if -1: all of them, default: 5
    label_smoothing: 0.0            # label smoothing: reference tokens will have 1-label_smoothing probability instead of 1, rest of probability mass is uniformly distributed over the rest of the vocabulary, default: 0.0 (off)

model:                              # specify your model architecture here
    initializer: "xavier"           # initializer for all trainable weights (xavier, zeros, normal, uniform)
    init_weight: 0.01               # weight to initialize; for uniform, will use [-weight, weight]
    init_gain: 1.0                  # gain for Xavier initializer (default: 1.0)
    bias_initializer: "zeros"       # initializer for bias terms (xavier, zeros, normal, uniform)
    embed_initializer: "normal"     # initializer for embeddings (xavier, zeros, normal, uniform)
    embed_init_weight: 0.1          # weight to initialize; for uniform, will use [-weight, weight]
    embed_init_gain: 1.0            # gain for Xavier initializer for embeddings (default: 1.0)
    init_rnn_orthogonal: False      # use orthogonal initialization for recurrent weights (default: False)
    lstm_forget_gate: 1.            # initialize LSTM forget gate with this value (default: 1.)
    tied_embeddings: False           # tie src and trg embeddings, only applicable if vocabularies are the same, default: False
    tied_softmax: False             # tie trg embeddings and softmax (for Transformer; can be used together with tied_embeddings), default: False
    encoder:
        type: "recurrent"           # encoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
        rnn_type: "gru"             # type of recurrent unit to use, either "gru" or "lstm", default: "lstm"
        embeddings:
            embedding_dim: {emb_size}      # size of embeddings
            scale: False            # scale the embeddings by sqrt of their size, default: False
            freeze: False           # if True, embeddings are not updated during training
        hidden_size: {hidden_size}            # size of RNN
        bidirectional: True         # use a bi-directional encoder, default: True
        dropout: 0.3                # apply dropout to the inputs to the RNN, default: 0.0
        num_layers: 2               # stack this many layers of equal size, default: 1
        freeze: False               # if True, encoder parameters are not updated during training (does not include embedding parameters)
    decoder:
        type: "recurrent"           # decoder type: "recurrent" for LSTM or GRU, or "transformer" for a Transformer
        rnn_type: "gru"
        embeddings:
            embedding_dim: {emb_size}
            scale: False
            freeze: False           # if True, embeddings are not updated during training
        hidden_size: {hidden_size}
        dropout: 0.3
        hidden_dropout: 0.2         # apply dropout to the attention vector, default: 0.0
        num_layers: 2
        input_feeding: True         # combine hidden state and attention vector before feeding to rnn, default: True
        init_hidden: "last"         # initialized the decoder hidden state: use linear projection of last encoder state ("bridge") or simply the last state ("last") or zeros ("zero"), default: "bridge"
        attention: "bahdanau"       # attention mechanism, choices: "bahdanau" (MLP attention), "luong" (bilinear attention), default: "bahdanau"
        freeze: False               # if True, decoder parameters are not updated during training (does not include embedding parameters, but attention)
"""

In [None]:
base_path = 'data/translate/preprocessed'
datas = os.listdir(base_path)
emb_size = 300
hidden_size = 512
for data in datas:
    data_path = os.path.join(base_path, data)
    for lang_in, lang_out in [['es', 'shp'], ['shp', 'es']]:
        segmentations = os.listdir(data_path)
        for segment in segmentations:
            segment_path = os.path.join(data_path, segment)
            print(os.path.join('results/translate', data, segment))
            if 'bpe_drop' in segment:
                level = 'bpe'
            elif 'bpe' in segment:
                level = 'bpe'
            elif 'char' in segment:
                level = 'char'
            elif 'word' in segment:
                level = 'word'
            elif 'syl' in segment:
                level = 'syl'
            else:
                level = None
                
            val_freq = 10
            f_config = config.format(lang_src=lang_in, lang_tgt=lang_out, 
                                     train_path=os.path.join(segment_path, 'train'),
                                     test_path=os.path.join(segment_path, 'test'),
                                     dev_path=os.path.join(segment_path, 'valid'),
                                     level=level,
                                     emb_size=emb_size,
                                     hidden_size=hidden_size,
                                     val_freq=val_freq,
                                     model_dir=os.path.join('results/translate',\
                                                            f'{lang_in}-{lang_out}_{emb_size}_{hidden_size}', segment))

            with open("joeynmt/configs/transformer_{name}.yaml".format(name="test"),'w') as f:
                f.write(f_config)

            !python3 joeynmt/joeynmt train "joeynmt/configs/transformer_test.yaml"

results/translate/Religioso/char
2020-02-25 10:08:37,506 Hello! This is Joey-NMT.
2020-02-25 10:08:41.511416: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
2020-02-25 10:08:41.511558: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
2020-02-25 10:08:41.511580: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly.
2020-02-25 10:08:46,385 Total params: 12451936
2020-02-25 10:08:46,385 Trainable parameters: ['decoder.att_vector_layer.bias', 'decoder.att_vector_layer.weight', 'decoder.attention.energy_layer.weigh

In [None]:
segment_path

In [10]:
corpus

['char',
 'bpe_drop_10000',
 'bpe_drop_2000',
 'bpe_drop_3000',
 'bpe_1000',
 'bpe_3000',
 'bpe_7000',
 'bpe_drop_8000',
 'bpe_10000',
 'bpe_2000',
 'bpe_drop_6000',
 'word',
 'bpe_5000',
 'bpe_drop_9000',
 'bpe_drop_7000',
 'bpe_9000',
 'bpe_drop_5000',
 'bpe_6000',
 'bpe_4000',
 'bpe_drop_4000',
 'bpe_drop_1000',
 'bpe_8000']

In [10]:
base_path = 'data/translate/preprocessed'
langs = os.listdir(base_path)
emb_size = 300
hidden_size = 256
for lang in langs:
    lang_path = os.path.join(base_path, lang)
    corpus = os.listdir(lang_path)
    corpus_path = os.path.join(lang_path, corpus[0])
    segmentations = os.listdir(corpus_path)
    lang_src = lang.split("-")[0]
    lang_tgt = lang.split("-")[1]
    for lang_in, lang_out in [[lang_src, lang_tgt], [lang_tgt, lang_src]]:
        for segmentation in segmentations:
            segmentation_path = os.path.join(corpus_path, segmentation)
            print(os.path.join('results/nmt', lang, segmentation))
            partitions = os.listdir(segmentation_path)
            if 'bpe_drop' in segmentation:
                level = 'bpe'
            elif 'bpe' in segmentation:
                level = 'bpe'
            elif 'char' in segmentation:
                level = 'char'
            elif 'word' in segmentation:
                level = 'word'
            elif 'syl' in segmentation:
                level = 'syl'
            else:
                level = None
                
            if lang in ['slk-eng', 'shp-es', 'shp-en', 'cat-eng', 'ces-eng', 'bul-eng',\
                        'ron-eng', 'lvs-eng', 'hrv-eng']:
                val_freq = 10
            else:
                val_freq = 40
            f_config = config.format(lang_src=lang_in, lang_tgt=lang_out, 
                                     train_path=os.path.join(segmentation_path, 'train'),
                                     test_path=os.path.join(segmentation_path, 'test'),
                                     dev_path=os.path.join(segmentation_path, 'valid'),
                                     level=level,
                                     emb_size=emb_size,
                                     hidden_size=hidden_size,
                                     val_freq=val_freq,
                                     model_dir=os.path.join('results/nmt',\
                                                            f'{lang_in}-{lang_out}_{emb_size}_{hidden_size}', segmentation))

            with open("joeynmt/configs/transformer_{name}.yaml".format(name="test"),'w') as f:
                f.write(f_config)

            !python3 joeynmt/joeynmt train "joeynmt/configs/transformer_test.yaml"

100%|██████████| 6745/6745 [00:00<00:00, 30522.11it/s]


In [11]:
random_seed(seed)
rnn = torch.nn.LSTM(emb_sz, n_hid, dropout=0.25, num_layers=2, bidirectional=True, batch_first=True)

random_seed(seed)
encoder = PytorchSeq2SeqWrapper(rnn)

random_seed(seed)
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

attention = DotProductAttention()
#attention = LinearAttention(n_hid, n_hid, activation=Activation.by_name('tanh')())

In [12]:
max_decoding_steps = 100   # TODO: make this variable
random_seed(seed)
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                      target_embedding_dim=emb_sz,
                      target_namespace='target_tokens',
                      attention=attention,
                      seed=seed,
                      scheduled_sampling_ratio=0,
                      mode='word',
                      beam_size=1,
                      use_bleu=True).cuda(cuda_id)

In [13]:
optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=48, sorting_keys=[("source_tokens", "num_tokens")])

iterator.index_with(vocab)

In [8]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  validation_metric='+BLEU',
                  num_epochs=5,
                  #serialization_dir='temp1',
                  cuda_device=cuda_id)

trainer.train()   

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
loss: 6.6997 ||: 100%|██████████| 125/125 [00:12<00:00,  9.84it/s]
BLEU: 0.0000, loss: 6.4466 ||: 100%|██████████| 16/16 [00:01<00:00, 10.13it/s]
loss: 6.0505 ||: 100%|██████████| 125/125 [00:12<00:00, 10.34it/s]
BLEU: 0.0000, loss: 6.2729 ||: 100%|██████████| 16/16 [00:03<00:00,  4.33it/s]
loss: 5.7174 ||: 100%|██████████| 125/125 [00:12<00:00, 10.30it/s]
BLEU: 0.0000, loss: 6.1824 ||: 100%|██████████| 16/16 [00:03<00:00,  4.24it/s]
loss: 5.3607 ||: 100%|██████████| 125/125 [00:12<00:00, 10.14it/s]
BLEU: 0.0025, loss: 6.1199 ||: 100%|██████████| 16/16 [00:04<00:00,  3.84it/s]
loss: 4.9794 ||: 100%|██████████| 125/125 [00:12<00:00, 10.29it/s]
BLEU: 0.0000, loss: 6.0624 ||: 100%|██████████| 16/16 [00:02<00:00,  5.55it/s]


{'best_epoch': 3,
 'peak_cpu_memory_MB': 2419.492,
 'peak_gpu_0_memory_MB': 2851,
 'training_duration': '00:01:17',
 'training_start_epoch': 0,
 'training_epochs': 4,
 'epoch': 4,
 'training_loss': 4.979407690048218,
 'training_cpu_memory_MB': 2419.492,
 'training_gpu_0_memory_MB': 2851,
 'validation_BLEU': 1.8683536506125043e-06,
 'validation_loss': 6.062360018491745,
 'best_validation_BLEU': 0.002512392654034492,
 'best_validation_loss': 6.1198570728302}

In [14]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  validation_metric='+BLEU',
                  num_epochs=5,
                  #serialization_dir='temp1',
                  cuda_device=cuda_id)

trainer.train()   

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
loss: 6.7027 ||: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s]
BLEU: 0.0000, loss: 6.4570 ||: 100%|██████████| 16/16 [00:01<00:00,  9.67it/s]
loss: 6.0616 ||: 100%|██████████| 125/125 [00:13<00:00,  9.30it/s]
BLEU: 0.0000, loss: 6.2846 ||: 100%|██████████| 16/16 [00:04<00:00,  3.71it/s]
loss: 5.7169 ||: 100%|██████████| 125/125 [00:13<00:00,  9.28it/s]
BLEU: 0.0000, loss: 6.1733 ||: 100%|██████████| 16/16 [00:04<00:00,  3.48it/s]
loss: 5.3694 ||: 100%|██████████| 125/125 [00:13<00:00,  9.25it/s]
BLEU: 0.0035, loss: 6.1236 ||: 100%|██████████| 16/16 [00:02<00:00,  5.98it/s]
loss: 4.9955 ||: 100%|██████████| 125/125 [00:13<00:00,  9.42it/s]
BLEU: 0.0000, loss: 6.0683 ||: 100%|██████████| 16/16 [00:02<00:00,  5.57it/s]


{'best_epoch': 3,
 'peak_cpu_memory_MB': 2502.136,
 'peak_gpu_0_memory_MB': 2851,
 'training_duration': '00:01:23',
 'training_start_epoch': 0,
 'training_epochs': 4,
 'epoch': 4,
 'training_loss': 4.995536445617676,
 'training_cpu_memory_MB': 2502.136,
 'training_gpu_0_memory_MB': 2851,
 'validation_BLEU': 1.6411359862230068e-06,
 'validation_loss': 6.068288892507553,
 'best_validation_BLEU': 0.003538747884663674,
 'best_validation_loss': 6.123574078083038}

In [None]:
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  validation_metric='+BLEU',
                  num_epochs=5,
                  #serialization_dir='temp1',
                  cuda_device=cuda_id)

trainer.train()   

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
loss: 6.7027 ||: 100%|██████████| 125/125 [00:13<00:00,  9.19it/s]
BLEU: 0.0000, loss: 6.4570 ||: 100%|██████████| 16/16 [00:01<00:00,  9.67it/s]
loss: 6.0616 ||: 100%|██████████| 125/125 [00:13<00:00,  9.30it/s]
BLEU: 0.0000, loss: 6.2846 ||: 100%|██████████| 16/16 [00:04<00:00,  3.71it/s]
loss: 5.7169 ||: 100%|██████████| 125/125 [00:13<00:00,  9.28it/s]
BLEU: 0.0000, loss: 6.1733 ||: 100%|██████████| 16/16 [00:04<00:00,  3.48it/s]
loss: 5.3874 ||:  35%|███▌      | 44/125 [00:05<00:09,  8.32it/s]

In [11]:
trainer.train()

{'best_epoch': 8,
 'best_validation_BLEU': 0.011378326800353748,
 'best_validation_loss': 6.618885815143585}

In [12]:
with open("temp/best.th", 'rb') as f_model:
    model.load_state_dict(torch.load(f_model))
model.eval()
e = evaluate(model, test_dataset, iterator, cuda_id, None)    

predictor = SimpleSeq2SeqPredictor(model, reader)

BLEU: 0.00, loss: 6.68 ||: 100%|██████████| 16/16 [00:02<00:00,  7.01it/s]


In [10]:
import itertools
for instance in itertools.islice(validation_dataset, 5):
    print('SOURCE:', instance.fields['source_tokens'].tokens)
    print('GOLD:', instance.fields['target_tokens'].tokens)
    print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

SOURCE: [@start@, no, junten, tesoros, y, reservas, aquí, en, la, tierra, donde, la, polilla, y, el, óxido, hacen, estragos, y, donde, los, ladrones, rompen, el, muro, y, roban, @end@]
GOLD: [@start@, nato, mainxon, icha, jawéki, kaiakasi, iamakanwe, ja, jawékibora, arai, keyotai, jatíribibora, jawen, ibon, pia, payotai, jainxonra, yometsobaon, mato, yometsoati, atipanke, @end@]
PRED: ['jaskara', 'iken', 'ja', 'non', 'mato', 'ja', 'non', 'ja', 'ja', 'ja', 'ja', 'ja', 'ja', 'non', 'mato']
SOURCE: [@start@, ahí, tienen, pues, a, tito, mi, compañero, y, ayudante, cerca, de, ustedes, y, con, él, tienen, a, hermanos, nuestros, delegados, de, las, iglesias, personas, que, son, la, gloria, de, cristo, @end@]
GOLD: [@start@, ja, titora, ebetanbi, niai, iki, itan, mato, akinni, ebetanbi, teetai, iki, jatian, ja, non, wetsa, rabéra, non, wetsabaon, raana, iki, jaton, ikábo, oinxonra, jonibaon, non, ibo, rabikanai, @end@]
PRED: ['ja', 'mato', 'mato', 'yoiai', 'ja', 'non', 'mato', 'ja', 'non', 'ma

In [10]:

def main(data_dir, results_dir, cuda_id=1, emb_sz=300, n_hid=512, epochs=50, seed=0):
    #name_file='../data/nmt/preprocessed/shp-es/Flashcards/syl'
    #dir_results = 'results_3'
    os.makedirs(results_dir, exist_ok=True)
    
    data_dir = Path.cwd() / data_dir / 'preprocessed'
    results_dir = Path.cwd() / results_dir
    print(data_dir)
    langs = os.listdir(data_dir)
    print(langs)
    for lang in langs:
        folders = os.listdir(data_dir / lang)
        with open(results_dir / f'{lang}_{emb_sz}_{seed}.txt', 'w') as f_results:
            for folder in folders:
                segmentations = os.listdir(data_dir / lang / folder)
                print(segmentations)
                for segmentation in segmentations:
                    print('start with:', segmentation)
                    path = data_dir / lang / folder / segmentation
                    shutil.rmtree('temp', ignore_errors=True)
                    os.makedirs('temp', exist_ok=True)
                    
                    random_seed(seed)
                    reader = Seq2SeqDatasetReader(
                        source_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
                        target_tokenizer=WordTokenizer(word_splitter=JustSpacesWordSplitter()),
                        delimiter='\t',
                        source_token_indexers={'tokens': SingleIdTokenIndexer()},
                        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})

                    train_dataset = reader.read(os.path.join(path, 'train.tsv'))
                    validation_dataset = reader.read(os.path.join(path, 'valid.tsv'))
                    test_dataset = reader.read(os.path.join(path, 'test.tsv'))


                    vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                                      min_count={'tokens': 1, 'target_tokens': 1})

                    random_seed(seed)
                    en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                             embedding_dim=emb_sz)

                    random_seed(seed)
                    encoder = PytorchSeq2SeqWrapper(torch.nn.GRU(emb_sz, n_hid, dropout=0.25, num_layers=2, bidirectional=True, batch_first=True))

                    source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})
                    attention = DotProductAttention()

                    max_decoding_steps = 100   # TODO: make this variable
                    random_seed(seed)
                    model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                                          target_embedding_dim=emb_sz,
                                          target_namespace='target_tokens',
                                          attention=attention,
                                          seed=seed,
                                          mode=segmentation,
                                          beam_size=1,
                                          use_bleu=True).cuda(cuda_id)

                    optimizer = optim.Adam(model.parameters())
                    iterator = BucketIterator(batch_size=100, sorting_keys=[("source_tokens", "num_tokens")])

                    iterator.index_with(vocab)

                    if not os.path.exists('temp/best.th'):
                        trainer = Trainer(model=model,
                                              optimizer=optimizer,
                                              iterator=iterator,
                                              train_dataset=train_dataset,
                                              validation_dataset=validation_dataset,
                                              validation_metric='+BLEU',
                                              num_epochs=epochs,
                                              serialization_dir='temp',
                                              cuda_device=cuda_id)

                        trainer.train()     

                    with open("temp/best.th", 'rb') as f_model:
                        model.load_state_dict(torch.load(f_model))
                    model.eval()
                    e = evaluate(model, test_dataset, iterator, cuda_id, None)    

                    predictor = SimpleSeq2SeqPredictor(model, reader)

                    scores = []
                    if segmentation == 'syl' or segmentation == 'char':
                        for instance in test_dataset:
                            ref = (''.join([token.text for token in instance.fields['target_tokens'].tokens][1:-1]).replace('@@', ' ').replace('+', '').strip()).split()
                            hyp = (''.join(predictor.predict_instance(instance)['predicted_tokens']).replace('@@', ' ').replace('+', '').strip()).split()
                            score = cer(hyp, ref)
                            scores.append(score)
                    else:
                        for instance in test_dataset:
                            ref = (' '.join([token.text for token in instance.fields['target_tokens'].tokens][1:-1]).replace('@@ ', '').replace('+', '').strip()).split()
                            hyp = (' '.join(predictor.predict_instance(instance)['predicted_tokens']).replace('@@ ', '').replace('+', '').strip()).split()
                            score = cer(hyp, ref)
                            scores.append(score)

                    average = sum(scores) / len(scores)
                    variance = sum((s - average) ** 2 for s in scores) / len(scores)
                    standard_deviation = math.sqrt(variance)

                    print(f'{str(segmentation)} CharacTER: {average} BLEU: {e["BLEU"]}', file=f_results)
    
if __name__ == '__main__':
    fire.Fire(main)
                                                                        

FileExistsError: [Errno 17] File exists: '/home/krivas/.local/share/jupyter/runtime/kernel-291d3c59-6d17-4313-8051-f97e189c433c.json'

In [None]:
reader = Seq2SeqDatasetReader(
        source_tokenizer=WordTokenizer(),
        target_tokenizer=WordTokenizer(),
        source_token_indexers={'tokens': SingleIdTokenIndexer()},
        target_token_indexers={'tokens': SingleIdTokenIndexer(namespace='target_tokens')})
train_dataset = reader.read('../data/translate/preprocessed/Educativo/word/train.tsv')
validation_dataset = reader.read('../data/translate/preprocessed/Educativo/word/valid.tsv')

In [None]:
vocab = Vocabulary.from_instances(train_dataset + validation_dataset,
                                      min_count={'tokens': 1, 'target_tokens': 1})


In [None]:
en_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                             embedding_dim=EN_EMBEDDING_DIM)
encoder = PytorchSeq2SeqWrapper(
torch.nn.LSTM(EN_EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

In [None]:
source_embedder = BasicTextFieldEmbedder({"tokens": en_embedding})

attention = DotProductAttention()

max_decoding_steps = 20   # TODO: make this variable
model = SimpleSeq2Seq(vocab, source_embedder, encoder, max_decoding_steps,
                          target_embedding_dim=ZH_EMBEDDING_DIM,
                          target_namespace='target_tokens',
                          attention=attention,
                          beam_size=8,
                          use_bleu=True).cuda()
optimizer = optim.Adam(model.parameters())
iterator = BucketIterator(batch_size=32, sorting_keys=[("source_tokens", "num_tokens")])

iterator.index_with(vocab)

In [None]:
trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=1,
                      cuda_device=CUDA_DEVICE)

In [15]:
for i in range(10):
    print('Epoch: {}'.format(i))
    trainer.train()

    predictor = SimpleSeq2SeqPredictor(model, reader)

    for instance in itertools.islice(validation_dataset, 5):
        print('SOURCE:', instance.fields['source_tokens'].tokens)
        print('GOLD:', instance.fields['target_tokens'].tokens)
        print('PRED:', predictor.predict_instance(instance)['predicted_tokens'])

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 0


loss: 6.1913 ||: 100%|██████████| 125/125 [00:03<00:00, 41.52it/s]
BLEU: 0.0002, loss: 5.7586 ||: 100%|██████████| 16/16 [00:00<00:00, 25.53it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['¿']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['¿']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['¿']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: []
Epoch: 1


loss: 5.5190 ||: 100%|██████████| 125/125 [00:05<00:00, 22.95it/s]
BLEU: 0.0000, loss: 5.5032 ||: 100%|██████████| 16/16 [00:01<00:00, 15.44it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['¿', 'bake']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['¿', 'ati']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['¿', 'bake']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'yoyo', 'ati']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['yoyo']
Epoch: 2


loss: 5.2188 ||: 100%|██████████| 125/125 [00:05<00:00, 24.04it/s]
BLEU: 0.0279, loss: 5.3261 ||: 100%|██████████| 16/16 [00:01<00:00, 12.91it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['benbo', 'bake']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['benbo', 'bake']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['yoyo', 'ati']
Epoch: 3


loss: 4.9188 ||: 100%|██████████| 125/125 [00:06<00:00, 20.44it/s]
BLEU: 0.0369, loss: 5.1097 ||: 100%|██████████| 16/16 [00:01<00:00, 13.15it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawe', 'ati', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['yoyo', 'ati']
Epoch: 4


loss: 4.6388 ||: 100%|██████████| 125/125 [00:05<00:00, 20.94it/s]
BLEU: 0.0425, loss: 4.9994 ||: 100%|██████████| 16/16 [00:01<00:00, 13.33it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['yoyo', 'ati', 'kopi']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'ati']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['wishawe', 'itan', 'ainbo', 'bakebo']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawe', 'akai', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['tsinkiti']
Epoch: 5


loss: 4.4164 ||: 100%|██████████| 125/125 [00:05<00:00, 22.22it/s]
BLEU: 0.0462, loss: 4.9462 ||: 100%|██████████| 16/16 [00:01<00:00, 11.31it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['jaskaaxon', 'yoyo', 'ati']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'ati', 'atipanke']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['jaskaaxon', 'yoyo', 'ati', 'kopi']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawekeskaaxonmein', 'non', 'akai', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['ja', 'pekao']
Epoch: 6


loss: 4.2323 ||: 100%|██████████| 125/125 [00:03<00:00, 41.38it/s]
BLEU: 0.0454, loss: 4.9051 ||: 100%|██████████| 16/16 [00:01<00:00, 15.02it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['jan', 'teeti', 'jawekibo']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'awe']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['wishawe', 'itan', 'yokawe']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawekeskaaxonmein', 'non', 'akai', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['ja', 'pekao']
Epoch: 7


loss: 4.0623 ||: 100%|██████████| 125/125 [00:03<00:00, 37.38it/s]
BLEU: 0.0476, loss: 4.9044 ||: 100%|██████████| 16/16 [00:01<00:00, 15.59it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['jan', 'teeti', 'jawekibo']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['jato', 'yokawe']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['wishawe', 'itan', 'jan', 'teeti', 'jawekibo']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawekeskaaxonmein', 'non', 'akai', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['ja', 'pekao']
Epoch: 8


loss: 3.9055 ||: 100%|██████████| 125/125 [00:02<00:00, 43.41it/s]
BLEU: 0.0519, loss: 4.8805 ||: 100%|██████████| 16/16 [00:01<00:00, 13.98it/s]
  0%|          | 0/125 [00:00<?, ?it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['wishawe', 'mesko', 'shinanbo']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'ati', 'kirika']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['wishawe', 'mesko', 'wishabo']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawekeskaaxonmein', 'non', 'ati', 'iki', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['ja', 'pekao']
Epoch: 9


loss: 3.7493 ||: 100%|██████████| 125/125 [00:06<00:00, 18.99it/s]
BLEU: 0.0487, loss: 4.8917 ||: 100%|██████████| 16/16 [00:01<00:00, 13.19it/s]

SOURCE: [@start@, menos, que, y, tantos, como, @end@]
GOLD: [@start@, ichatamainoax, iki, ika, @end@]
PRED: ['wishawe', 'ja', 'pekao']
SOURCE: [@start@, sector, de, biblioteca, @end@]
GOLD: [@start@, yoyo, ati, kirikabo, benxoatinko, @end@]
PRED: ['yoyo', 'akanai']
SOURCE: [@start@, anota, sus, ideas, en, la, pizarra, @end@]
GOLD: [@start@, jaton, shinanbo, wishawe, pisarain, @end@]
PRED: ['wishawe', 'mesko', 'shinanbo', 'ikainko']
SOURCE: [@start@, ¿, qué, dificultades, tuvieron, para, escribir, ?, @end@]
GOLD: [@start@, ¿, jawe, atikoma, jawekibomein, akanke, merakin, wishatiain, ?, @end@]
PRED: ['¿', 'jawekeskaaxonmein', 'non', 'ati', 'iki', '?']
SOURCE: [@start@, recursos, disponibles, @end@]
GOLD: [@start@, jain, jayata, jawekibo, biboantibores, @end@]
PRED: ['ja', 'pekao']



