In [3]:
import opennmt
import os
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import numpy as np
import sacrebleu
import pyonmttok
from opennmt.utils import checkpoint as checkpoint_util
from pyonmttok import SentencePieceTokenizer

In [2]:
def count_weights(model):
  """Print the total, trainable and non-trainable parameter counts of a model.

  Args:
    model: Object exposing ``trainable_weights`` and ``non_trainable_weights``
      iterables; each weight is passed to ``K.count_params``.

  Returns:
    None. The three counts are printed with thousands separators.
  """
  # Sum with Python integers rather than np.sum: np.sum([]) evaluates to the
  # float 0.0, which made an empty weight list print as "0.0" and turned the
  # total into a float (e.g. "1,234.0").
  trainable_count = int(sum(K.count_params(w) for w in model.trainable_weights))
  non_trainable_count = int(sum(K.count_params(w) for w in model.non_trainable_weights))

  print('Total params: {:,}'.format(trainable_count + non_trainable_count))
  print('Trainable params: {:,}'.format(trainable_count))
  print('Non-trainable params: {:,}'.format(non_trainable_count))

def compute_scores(runner, features_filename, labels_filename, pred_filename, include_ppl=False, include_ter=False):
  """Run inference and score the detokenized predictions against the labels.

  Steps performed, in order:
    1. ``runner.infer(features_filename, pred_filename)`` is called
       (presumably writing the predictions to ``pred_filename`` -- confirm
       against the runner's API).
    2. Everything before the FIRST ``.`` of ``pred_filename`` and of
       ``labels_filename`` is taken as a base name. A name without a dot
       raises ``ValueError``.
    3. Both base names are passed to ``detokenize_data``; the labels base name
       is also used as its second argument in both calls.
       NOTE(review): ``detokenize_data`` is not defined in the visible part of
       this notebook -- confirm it exists and what argument order it expects.
    4. The two files returned by ``detokenize_data`` are read line by line and
       scored with sacrebleu.

  Args:
    runner: Object providing ``infer`` and ``evaluate``.
    features_filename: Features file passed to ``infer`` / ``evaluate``.
    labels_filename: Labels file; passed to ``evaluate`` and used to derive
      the reference base name.
    pred_filename: Prediction file name passed to ``infer``.
    include_ppl: When true, start from the result of ``runner.evaluate``
      instead of an empty dict.
    include_ter: When true, also add a ``'ter'`` entry.

  Returns:
    The scores mapping, always containing ``'bleu'``.
  """
  runner.infer(features_filename, pred_filename)

  # Base names: the part of each file name that precedes its first dot.
  pred_stem = pred_filename[:pred_filename.index('.')]
  labels_stem = labels_filename[:labels_filename.index('.')]

  detok_pred_path = detokenize_data(pred_stem, labels_stem)
  detok_labels_path = detokenize_data(labels_stem, labels_stem)

  with open(detok_pred_path) as pred_file:
    hypotheses = pred_file.readlines()
  with open(detok_labels_path) as labels_file:
    references = labels_file.readlines()

  if include_ppl:
    scores = runner.evaluate(
        features_file=features_filename,
        labels_file=labels_filename)
  else:
    scores = dict()

  scores.update(bleu=sacrebleu.corpus_bleu(hypotheses, [references]).score)
  if include_ter:
    scores.update(ter=sacrebleu.corpus_ter(hypotheses, [references]).score)

  return scores

def detokenize(model_basename, tokenized_basename):
  """Detokenize ``<tokenized_basename>.tok`` into ``<tokenized_basename>.txt``.

  A ``SentencePieceTokenizer`` is built from
  ``sentencepiece_models/<model_basename>.model`` and
  ``sentencepiece_models/<model_basename>.vocab``. Each input line is
  stripped, split on single spaces, detokenized and written out followed by
  a newline.

  Args:
    model_basename: Base name of the SentencePiece model/vocabulary files
      inside ``sentencepiece_models``.
    tokenized_basename: Path (without extension) of the ``.tok`` file to
      read; the ``.txt`` output is written next to it.

  Returns:
    The path of the detokenized file, ``f"{tokenized_basename}.txt"``.
  """
  model_path = os.path.join("sentencepiece_models", f"{model_basename}.model")
  vocabulary_path = os.path.join("sentencepiece_models", f"{model_basename}.vocab")
  detokenizer = SentencePieceTokenizer(model_path=model_path,
                                       vocabulary_path=vocabulary_path)

  output_path = f"{tokenized_basename}.txt"
  # Explicit UTF-8: the token files contain non-ASCII characters (e.g. the
  # "▁" piece marker), so the locale-default encoding must not be relied on.
  with open(f"{tokenized_basename}.tok", encoding="utf-8") as fin, \
       open(output_path, mode="w", encoding="utf-8") as fout:
    # Iterate the file lazily instead of readlines() so a large corpus is not
    # loaded into memory at once.
    for line in fin:
      fout.write(detokenizer.detokenize(line.strip().split(" ")) + "\n")

  return output_path

def tokenize(input_file, basename):
  """Tokenize ``input_file`` line by line into ``<basename>.tok``.

  A ``SentencePieceTokenizer`` is built from
  ``sentencepiece_models/<basename>.model`` and
  ``sentencepiece_models/<basename>.vocab``. Lines that are empty after
  stripping whitespace are skipped; every other line is tokenized and its
  tokens are written space-separated, one sentence per output line.

  Args:
    input_file: Path of the raw text file to tokenize.
    basename: Base name of the SentencePiece model/vocabulary files; also the
      base name (without extension) of the ``.tok`` output file.

  Returns:
    None. The result is written to ``f"{basename}.tok"``.
  """
  model_path = os.path.join("sentencepiece_models", f"{basename}.model")
  vocabulary_path = os.path.join("sentencepiece_models", f"{basename}.vocab")
  tokenizer = SentencePieceTokenizer(model_path=model_path,
                                     vocabulary_path=vocabulary_path)

  # Explicit UTF-8: the corpora contain accented characters and the output
  # contains the "▁" piece marker, so the locale-default encoding must not be
  # relied on.
  with open(input_file, encoding="utf-8") as fin, \
       open(f"{basename}.tok", mode="w", encoding="utf-8") as fout:
    # Iterate the file lazily instead of readlines() so a large corpus is not
    # loaded into memory at once.
    for line in fin:
      if line.strip():
        # Only the first element of the tokenizer's return value is used
        # (presumably a (tokens, features) pair -- confirm for the installed
        # pyonmttok version).
        tokens = tokenizer.tokenize(line)[0]
        fout.write(" ".join(tokens) + "\n")

In [7]:
# Build the SentencePiece vocabularies.
#   source = Catalan (ca)
#   pivot  = Spanish (es)
#   target = Italian (it)
#
# Each command trains a BPE SentencePiece model with a vocabulary size of
# 32000 on the given training file and saves it under the --save_vocab
# prefix; tokenize() / detokenize() later load "<prefix>.model" and
# "<prefix>.vocab" from sentencepiece_models/.
# NOTE: these commands retrain the models on every run and emit a very long
# training log.

# Training files under src_pvt_data/ (presumably the source-pivot corpus).
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/src src_pvt_data/src_train.txt
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/pvt_src src_pvt_data/pvt_src_train.txt

# Training files under pvt_tgt_data/ (presumably the pivot-target corpus).
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/tgt pvt_tgt_data/tgt_train.txt
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/pvt_tgt pvt_tgt_data/pvt_tgt_train.txt

# Training files under src_tgt_data/ (presumably the direct source-target corpus).
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/src_tgt src_tgt_data/src_tgt_train.txt
!onmt-build-vocab --sentencepiece model_type=bpe --size 32000 --save_vocab sentencepiece_models/tgt_src src_tgt_data/tgt_src_train.txt

2021-12-11 13:12:12.918785: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-11 13:12:12.918837: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
sentencepiece_trainer.cc(177) LOG(INFO) Running command:  --model_type=bpe --vocab_size=32000 --input=/tmp/tmpd1w03nxn --model_prefix=sentencepiece_models/tgt
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /tmp/tmpd1w03nxn
  input_format: 
  model_prefix: sentencepiece_models/tgt
  model_type: BPE
  vocab_size: 32000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentence

trainer_interface.cc(466) LOG(INFO) all chars count=2020598378
trainer_interface.cc(477) LOG(INFO) Done: 99.9503% characters are covered.
trainer_interface.cc(487) LOG(INFO) Alphabet size=180
trainer_interface.cc(488) LOG(INFO) Final character coverage=0.999503
trainer_interface.cc(520) LOG(INFO) Done! preprocessed 22705199 sentences.
trainer_interface.cc(526) LOG(INFO) Tokenizing input sentences with whitespace: 22705199
trainer_interface.cc(537) LOG(INFO) Done! 3966034
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=38679200 min_freq=56055
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=11852128 size=20 all=9252 active=3301 piece=in
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6201669 size=40 all=11927 active=5976 piece=is
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4232549 size=60 all=14651 active=8700 piece=am
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3295029 size=80 all=17302 active=11351 piece=il
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=112984 size=1340 all=238947 active=16211 piece=demos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=111488 size=1360 all=241367 active=18631 piece=▁esti
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=109719 size=1380 all=244053 active=21317 piece=▁diferentes
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=107665 size=1400 all=246765 active=24029 piece=echa
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=107648 min_freq=6151
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=106167 size=1420 all=248479 active=13923 piece=OM
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=103887 size=1440 all=251238 active=16681 piece=ulares
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=102183 size=1460 all=253416 active=18859 piece=▁Med
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=100239 size=1480 all=255666 active=21109 piece=▁ev
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=98384 size=1500 all=257919 active=23362 piece=▁Cons
b

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=47747 size=2740 all=400050 active=24802 piece=int
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=47293 size=2760 all=402336 active=27088 piece=▁Pos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=46893 size=2780 all=404989 active=29741 piece=portador
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=46335 size=2800 all=406887 active=31639 piece=ly
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=46327 min_freq=2687
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=45911 size=2820 all=410211 active=23076 piece=tario
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=45325 size=2840 all=412728 active=25593 piece=▁ru
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44778 size=2860 all=414595 active=27460 piece=▁san
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44472 size=2880 all=417158 active=30023 piece=▁limpieza
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44099 size=2900 all=418770 active=31635 piece=álvula
bpe_model_t

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28491 size=4120 all=539722 active=29736 piece=ético
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28277 size=4140 all=540969 active=30983 piece=▁algodón
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28030 size=4160 all=543019 active=33033 piece=IB
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=27815 size=4180 all=544486 active=34500 piece=▁principalmente
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=27681 size=4200 all=547609 active=37623 piece=▁encuentran
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=27663 min_freq=1655
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=27457 size=4220 all=549894 active=29660 piece=▁Tab
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=27259 size=4240 all=551589 active=31355 piece=▁montaje
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=27155 size=4260 all=553227 active=32993 piece=▁aper
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26984 size=4280 all=555117 active=34883 piec

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=19118 min_freq=1185
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18991 size=5520 all=665395 active=34916 piece=93
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18913 size=5540 all=667054 active=36575 piece=▁dro
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18841 size=5560 all=668902 active=38423 piece=▁maravillos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18738 size=5580 all=669866 active=39387 piece=ERA
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18645 size=5600 all=671436 active=40957 piece=loud
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=18643 min_freq=1162
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18569 size=5620 all=673302 active=35307 piece=▁Pay
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18434 size=5640 all=675314 active=37319 piece=▁Seguridad
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18326 size=5660 all=676344 active=38349 piece=ye
bpe_model_trainer.cc

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=14092 size=6900 all=778048 active=44258 piece=▁ejercicio
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=14089 min_freq=889
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=14025 size=6920 all=779895 active=40746 piece=ild
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13971 size=6940 all=781270 active=42121 piece=▁consecu
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13895 size=6960 all=782075 active=42926 piece=▁Sir
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13840 size=6980 all=783347 active=44198 piece=▁Om
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13792 size=7000 all=784334 active=45185 piece=▁3,
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=13790 min_freq=876
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13749 size=7020 all=785623 active=40417 piece=CC
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=13690 size=7040 all=787386 active=42180 piece=ezas
bpe_model_trainer.cc(258)

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10885 size=8280 all=871935 active=47529 piece=▁tejidos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10847 size=8300 all=872705 active=48299 piece=▁agentes
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=10844 min_freq=721
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10804 size=8320 all=874100 active=45025 piece=▁smart
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10774 size=8340 all=875341 active=46266 piece=▁sellado
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10733 size=8360 all=876545 active=47470 piece=ificados
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10699 size=8380 all=877376 active=48301 piece=▁Holiday
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10653 size=8400 all=879062 active=49987 piece=▁atmósfera
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=10651 min_freq=711
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=10608 size=8420 all=880471 active=45360 piece=▁dude
b

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8697 size=9660 all=962442 active=53174 piece=▁bordo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8675 size=9680 all=963323 active=54055 piece=zón
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8649 size=9700 all=964904 active=55636 piece=▁testi
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=8648 min_freq=598
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8633 size=9720 all=965793 active=49124 piece=álogo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8608 size=9740 all=966652 active=49983 piece=▁ladr
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8581 size=9760 all=967762 active=51093 piece=▁Digitaces
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8560 size=9780 all=968542 active=51873 piece=▁grabado
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=8534 size=9800 all=969435 active=52766 piece=▁intensidad
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=8534 min_freq=592
bpe_model_trainer.

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7158 size=11060 all=1050263 active=56810 piece=▁presentan
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7139 size=11080 all=1051627 active=58174 piece=▁proporcionado
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7117 size=11100 all=1052157 active=58704 piece=▁Full
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=7116 min_freq=507
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7104 size=11120 all=1052884 active=53265 piece=▁transformación
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7084 size=11140 all=1053989 active=54370 piece=▁Vino
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7064 size=11160 all=1054758 active=55139 piece=cudia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7047 size=11180 all=1056378 active=56759 piece=▁recetas
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=7036 size=11200 all=1058606 active=58987 piece=▁Spr
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=7034 min

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6054 size=12420 all=1137596 active=57505 piece=-2019
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6042 size=12440 all=1138580 active=58489 piece=▁financiero
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6027 size=12460 all=1139903 active=59812 piece=▁Proto
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=6011 size=12480 all=1141480 active=61389 piece=▁eco
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5997 size=12500 all=1142366 active=62275 piece=▁derr
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=5997 min_freq=435
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5980 size=12520 all=1143316 active=58042 piece=▁velocidades
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5962 size=12540 all=1144673 active=59399 piece=▁virtud
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5948 size=12560 all=1145596 active=60322 piece=▁mam
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5938 size=12580 all=1146639 active=61365 p

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5146 size=13800 all=1215018 active=65195 piece=▁Americas
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=5145 min_freq=385
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5136 size=13820 all=1216233 active=61963 piece=▁cálido
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5125 size=13840 all=1217920 active=63650 piece=mann
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5115 size=13860 all=1218627 active=64357 piece=▁Nh
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5099 size=13880 all=1220147 active=65877 piece=▁nutrición
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5084 size=13900 all=1221000 active=66730 piece=▁comisiones
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=5083 min_freq=382
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5074 size=13920 all=1221781 active=61828 piece=lig
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=5062 size=13940 all=1224179 active=64225 piece=▁Oh
bpe_mo

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4427 size=15180 all=1292064 active=68653 piece=hon
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4416 size=15200 all=1292849 active=69438 piece=▁millón
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=4416 min_freq=342
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4408 size=15220 all=1293654 active=65447 piece=▁coser
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4400 size=15240 all=1294491 active=66284 piece=▁clasificada
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4395 size=15260 all=1295270 active=67063 piece=▁Jardines
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4389 size=15280 all=1296629 active=68422 piece=▁Wood
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4381 size=15300 all=1297934 active=69726 piece=arra
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=4379 min_freq=339
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4373 size=15320 all=1298670 active=65468 piece=▁luminoso
b

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3885 size=16540 all=1359742 active=69635 piece=▁formal
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3877 size=16560 all=1360254 active=70147 piece=▁ax
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3872 size=16580 all=1361316 active=71209 piece=▁señoras
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3860 size=16600 all=1362013 active=71906 piece=▁Generado
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3860 min_freq=307
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3853 size=16620 all=1363087 active=69175 piece=▁Recon
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3845 size=16640 all=1363645 active=69733 piece=▁exportar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3839 size=16660 all=1364720 active=70808 piece=▁fantástico
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3830 size=16680 all=1365649 active=71737 piece=▁Berry
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3823 size=16700 all=1366656 active=72

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3452 min_freq=280
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3446 size=17920 all=1429853 active=72780 piece=umbo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3441 size=17940 all=1430746 active=73673 piece=▁asociada
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3433 size=17960 all=1431665 active=74592 piece=▁NT
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3428 size=17980 all=1432720 active=75647 piece=▁complicado
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3423 size=18000 all=1433474 active=76401 piece=▁lucro
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3423 min_freq=278
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3416 size=18020 all=1434458 active=72653 piece=foro
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3411 size=18040 all=1436383 active=74578 piece=▁Paraiso
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3406 size=18060 all=1437985 active=76180 piece=torante
bpe_

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3084 size=19280 all=1494478 active=78377 piece=▁calabaza
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3079 size=19300 all=1495866 active=79765 piece=inary
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3079 min_freq=255
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3075 size=19320 all=1496145 active=74999 piece=▁deterior
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3070 size=19340 all=1497264 active=76118 piece=▁evaluaciones
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3066 size=19360 all=1498231 active=77085 piece=▁elecciones
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3060 size=19380 all=1499409 active=78263 piece=▁Station
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3057 size=19400 all=1499926 active=78780 piece=▁support
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3057 min_freq=254
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3052 size=19420 all=1501066 active=76119 

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2774 size=20640 all=1556506 active=79201 piece=▁DÍA
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2769 size=20660 all=1557411 active=80106 piece=rap
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2765 size=20680 all=1558468 active=81163 piece=▁exci
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2759 size=20700 all=1559243 active=81938 piece=bana
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2759 min_freq=235
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2756 size=20720 all=1559848 active=78509 piece=ecraft
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2751 size=20740 all=1560913 active=79574 piece=idia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2747 size=20760 all=1562106 active=80767 piece=▁4,5
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2744 size=20780 all=1563807 active=82468 piece=▁oculto
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2740 size=20800 all=1564291 active=82952 piece=▁lienzo
bpe_

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2517 min_freq=219
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2514 size=22020 all=1617588 active=81487 piece=▁Castilla
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2510 size=22040 all=1618762 active=82661 piece=▁219
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2508 size=22060 all=1620400 active=84299 piece=▁incapa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2504 size=22080 all=1621257 active=85156 piece=▁pines
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2501 size=22100 all=1622200 active=86099 piece=▁Alquilar
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2501 min_freq=217
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2498 size=22120 all=1622962 active=81871 piece=▁congresos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2494 size=22140 all=1623929 active=82838 piece=▁With
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2492 size=22160 all=1624860 active=83769 piece=eral
bp

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2288 size=23400 all=1674160 active=87647 piece=▁Dil
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2288 min_freq=203
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2285 size=23420 all=1674675 active=84126 piece=-1000
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2283 size=23440 all=1675309 active=84760 piece=▁Impresion
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2280 size=23460 all=1675966 active=85417 piece=▁Bever
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2278 size=23480 all=1676768 active=86219 piece=▁contaminantes
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2275 size=23500 all=1677895 active=87346 piece=▁Spir
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2275 min_freq=202
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2271 size=23520 all=1678602 active=84559 piece=▁Arts
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2268 size=23540 all=1679235 active=85192 piece=▁tomarse

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2094 size=24760 all=1729484 active=88870 piece=▁cojines
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2091 size=24780 all=1730009 active=89395 piece=fal
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2089 size=24800 all=1730829 active=90215 piece=▁Eliminación
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2088 min_freq=190
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2086 size=24820 all=1731330 active=87043 piece=▁coleccion
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2082 size=24840 all=1732026 active=87739 piece=imer
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2080 size=24860 all=1733061 active=88774 piece=▁aquello
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2077 size=24880 all=1734088 active=89801 piece=XP
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=2076 size=24900 all=1734911 active=90624 piece=alizamos
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=2076 min_freq=189
b

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1931 size=26120 all=1784047 active=89605 piece=▁hidratante
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1929 size=26140 all=1785115 active=90673 piece=▁intestino
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1926 size=26160 all=1785989 active=91547 piece=▁Chevrolet
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1922 size=26180 all=1786300 active=91858 piece=pati
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1920 size=26200 all=1787007 active=92565 piece=▁Resid
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1920 min_freq=178
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1918 size=26220 all=1788043 active=90361 piece=▁510
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1916 size=26240 all=1789103 active=91421 piece=▁RAD
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1913 size=26260 all=1790185 active=92503 piece=855
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1911 size=26280 all=1791619 active=93937 pie

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1773 size=27500 all=1841388 active=95954 piece=▁perpe
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1773 min_freq=168
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1770 size=27520 all=1841882 active=92541 piece=329
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1769 size=27540 all=1843325 active=93984 piece=▁Agar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1767 size=27560 all=1843995 active=94654 piece=poker
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1765 size=27580 all=1844529 active=95188 piece=iser
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1763 size=27600 all=1845577 active=96236 piece=proof
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1763 min_freq=167
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1761 size=27620 all=1846436 active=93005 piece=ruéb
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1760 size=27640 all=1847090 active=93659 piece=Necesitas
bpe_model_train

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1647 size=28860 all=1893613 active=96874 piece=tár
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1645 size=28880 all=1894273 active=97534 piece=xito
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1642 size=28900 all=1895442 active=98703 piece=amba
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1642 min_freq=158
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1641 size=28920 all=1896274 active=95489 piece=▁PREMI
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1639 size=28940 all=1897542 active=96757 piece=▁RAL
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1637 size=28960 all=1898814 active=98028 piece=omnio
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1636 size=28980 all=1899212 active=98426 piece=▁cobran
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1634 size=29000 all=1900560 active=99774 piece=doso
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1634 min_freq=157
bpe_model_trainer.c

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1530 size=30220 all=1945254 active=97660 piece=▁Buscamos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1528 size=30240 all=1945855 active=98261 piece=▁Links
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1526 size=30260 all=1947437 active=99843 piece=tijo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1525 size=30280 all=1947949 active=100355 piece=▁Dreams
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1523 size=30300 all=1948574 active=100980 piece=▁VELO
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1523 min_freq=149
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1522 size=30320 all=1949793 active=98635 piece=▁Habitual
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1520 size=30340 all=1950596 active=99438 piece=aventura
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1518 size=30360 all=1951289 active=100131 piece=▁Movs
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1517 size=30380 all=1951678 active=10052

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1423 size=31580 all=1993505 active=101685 piece=▁Paphos
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1422 size=31600 all=1993863 active=102043 piece=▁redund
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1422 min_freq=142
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1420 size=31620 all=1995317 active=101136 piece=tici
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1418 size=31640 all=1996459 active=102278 piece=TRAL
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1417 size=31660 all=1996932 active=102751 piece=▁Ilustración
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1416 size=31680 all=1997732 active=103551 piece=▁Expedidores
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1414 size=31700 all=1998948 active=104767 piece=sun
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1414 min_freq=142
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1413 size=31720 all=1999384 active=100244 piece=

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=4045 size=660 all=62522 active=6875 piece=▁té
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3903 size=680 all=63806 active=8159 piece=▁ent
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3821 size=700 all=64788 active=9141 piece=▁utilitz
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3814 min_freq=640
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3692 size=720 all=65574 active=4004 piece=▁z
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3564 size=740 all=67187 active=5617 piece=sa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3458 size=760 all=68376 active=6806 piece=aquest
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3364 size=780 all=70022 active=8452 piece=▁convert
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3237 size=800 all=70633 active=9063 piece=▁prof
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=3229 min_freq=558
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=3121 size=

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1018 size=2120 all=121538 active=6562 piece=▁cavall
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1008 size=2140 all=122117 active=7141 piece=▁igu
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=996 size=2160 all=122968 active=7992 piece=▁ells
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=986 size=2180 all=123832 active=8856 piece=▁oblig
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=976 size=2200 all=124249 active=9273 piece=andre
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=976 min_freq=215
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=967 size=2220 all=124744 active=6683 piece=▁Jac
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=955 size=2240 all=125294 active=7233 piece=rés
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=944 size=2260 all=125919 active=7858 piece=▁abandon
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=931 size=2280 all=126624 active=8563 piece=ór
bpe_model_trainer.cc(258) LOG(INFO) Add

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=524 size=3600 all=159162 active=9941 piece=▁llarga
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=524 min_freq=133
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=520 size=3620 all=159658 active=8455 piece=irada
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=516 size=3640 all=159876 active=8673 piece=15
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=514 size=3660 all=160326 active=9123 piece=iren
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=510 size=3680 all=160887 active=9684 piece=ogia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=507 size=3700 all=161305 active=10102 piece=ílica
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=507 min_freq=130
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=504 size=3720 all=161663 active=8418 piece=▁Abans
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=501 size=3740 all=162106 active=8861 piece=gica
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=498 

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=349 min_freq=96
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=347 size=5020 all=185851 active=9721 piece=▁privi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=345 size=5040 all=186323 active=10193 piece=▁béns
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=344 size=5060 all=186676 active=10546 piece=▁límits
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=342 size=5080 all=187039 active=10909 piece=▁Han
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=341 size=5100 all=187271 active=11141 piece=▁Napoleó
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=341 min_freq=94
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=339 size=5120 all=187563 active=9656 piece=experi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=337 size=5140 all=187848 active=9941 piece=▁líders
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=335 size=5160 all=188380 active=10473 piece=arquitectura
bpe_model_trainer.cc(258) LOG(IN

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=245 size=6480 all=208138 active=11530 piece=ibilitat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=244 size=6500 all=208656 active=12048 piece=▁curta
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=244 min_freq=73
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=243 size=6520 all=209090 active=10867 piece=▁angl
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=242 size=6540 all=209305 active=11082 piece=cles
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=241 size=6560 all=209442 active=11219 piece=▁tx
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=240 size=6580 all=209650 active=11427 piece=enç
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=239 size=6600 all=210061 active=11838 piece=frac
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=239 min_freq=72
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=238 size=6620 all=210251 active=10693 piece=gust
bpe_model_trainer.cc(258) LOG(INFO) Added: freq

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=187 min_freq=59
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=186 size=7920 all=225795 active=11527 piece=▁Fill
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=186 size=7940 all=225928 active=11660 piece=▁mecànica
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=185 size=7960 all=226161 active=11893 piece=▁Congo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=184 size=7980 all=226224 active=11956 piece=riz
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=184 size=8000 all=226447 active=12179 piece=▁crítiques
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=184 min_freq=58
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=183 size=8020 all=226963 active=11839 piece=enberg
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=182 size=8040 all=227137 active=12013 piece=ube
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=182 size=8060 all=227372 active=12248 piece=▁vols
bpe_model_trainer.cc(258) LOG(INFO) A

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=147 min_freq=49
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=147 size=9420 all=240298 active=12082 piece=hospital
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=146 size=9440 all=240436 active=12220 piece=ocal
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=146 size=9460 all=240536 active=12320 piece=▁jesuïtes
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=145 size=9480 all=240891 active=12675 piece=▁Enter
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=145 size=9500 all=240940 active=12724 piece=▁enfrontament
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=145 min_freq=49
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=144 size=9520 all=241398 active=12505 piece=▁dorm
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=144 size=9540 all=241483 active=12590 piece=▁destacats
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=143 size=9560 all=241835 active=12942 piece=iguel
bpe_model_trainer.cc(258

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=119 size=10880 all=252388 active=13621 piece=ology
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=119 size=10900 all=252466 active=13699 piece=ximació
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=119 min_freq=42
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=119 size=10920 all=252495 active=12649 piece=▁societats
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=118 size=10940 all=252848 active=13002 piece=▁Què
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=118 size=10960 all=252980 active=13134 piece=▁alumne
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=118 size=10980 all=252983 active=13137 piece=▁emperadors
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=117 size=11000 all=253229 active=13383 piece=ersey
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=117 min_freq=42
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=117 size=11020 all=253258 active=12688 piece=▁tribut
bpe_model_trainer.cc

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=100 size=12300 all=262969 active=13791 piece=▁Concert
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=100 min_freq=37
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=100 size=12320 all=262986 active=13159 piece=▁religions
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=99 size=12340 all=263231 active=13404 piece=▁Fon
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=99 size=12360 all=263414 active=13587 piece=osofia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=99 size=12380 all=263463 active=13636 piece=▁esperit
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=99 size=12400 all=263457 active=13630 piece=▁classificat
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=99 min_freq=37
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=98 size=12420 all=263833 active=13547 piece=ellat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=98 size=12440 all=264003 active=13717 piece=▁armat
bpe_model_trainer.cc(258

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13720 all=272181 active=13695 piece=SC
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13740 all=272594 active=14108 piece=gram
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13760 all=272925 active=14439 piece=ugawa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13780 all=273034 active=14548 piece=▁Coron
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13800 all=273114 active=14628 piece=▁inform
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=85 min_freq=33
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13820 all=273138 active=13670 piece=▁assignar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=85 size=13840 all=273127 active=13659 piece=▁consolidar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=84 size=13860 all=273557 active=14089 piece=emps
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=84 size=13880 all=273823 active=14355 piece=▁Pequ
bpe_model_trainer.cc(258)

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=74 min_freq=29
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15220 all=281210 active=14044 piece=BI
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15240 all=281568 active=14402 piece=▁Ple
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15260 all=281752 active=14586 piece=▁disf
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15280 all=281812 active=14646 piece=▁rostre
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15300 all=281813 active=14647 piece=odinàmica
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=73 min_freq=29
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15320 all=281832 active=14103 piece=▁infantesa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=73 size=15340 all=281814 active=14085 piece=▁sensibilitat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=72 size=15360 all=282134 active=14405 piece=epid
bpe_model_trainer.cc(258) LOG(IN

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=64 size=16660 all=289342 active=14731 piece=▁fluor
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=64 size=16680 all=289366 active=14755 piece=▁abolir
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=64 size=16700 all=289406 active=14795 piece=▁Cristià
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=64 min_freq=26
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=64 size=16720 all=289411 active=14476 piece=▁abreujat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=64 size=16740 all=289394 active=14459 piece=▁assassinar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=63 size=16760 all=289565 active=14630 piece=▁xi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=63 size=16780 all=289856 active=14921 piece=▁Hid
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=63 size=16800 all=290042 active=15107 piece=▁Léon
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=63 min_freq=26
bpe_model_trainer.cc(258) LOG(INF

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=57 size=18100 all=295409 active=14784 piece=▁disminueix
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=57 min_freq=24
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18120 all=295533 active=14895 piece=epp
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18140 all=295818 active=15180 piece=▁Alf
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18160 all=296001 active=15363 piece=ignon
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18180 all=296157 active=15519 piece=▁cans
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18200 all=296281 active=15643 piece=▁deien
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=56 min_freq=24
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18220 all=296314 active=14848 piece=▁Italia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=56 size=18240 all=296332 active=14866 piece=▁Cretaci
bpe_model_trainer.cc(258) LOG(INFO) A

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19560 all=301998 active=15370 piece=▁robar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19580 all=302029 active=15401 piece=▁abolit
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19600 all=302060 active=15432 piece=▁tindre
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=50 min_freq=22
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19620 all=302076 active=15119 piece=▁hàbitat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19640 all=302085 active=15128 piece=▁apreciar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19660 all=302077 active=15120 piece=▁referent
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19680 all=302065 active=15108 piece=▁inestable
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=50 size=19700 all=302046 active=15089 piece=▁sorprendre
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=50 min_freq=22
bpe_model_trainer.c

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=45 min_freq=20
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=45 size=21020 all=306798 active=15324 piece=▁mencionat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=45 size=21040 all=306800 active=15326 piece=▁migracions
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21060 all=306853 active=15379 piece=LD
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21080 all=307190 active=15716 piece=▁GL
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21100 all=307436 active=15962 piece=iche
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=44 min_freq=20
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21120 all=307655 active=15561 piece=▁Kab
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21140 all=307821 active=15727 piece=cules
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21160 all=307931 active=15837 piece=▁Livi
bpe_model_trainer.cc(258) LOG(INFO) Added:

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22420 all=312262 active=15728 piece=▁1694
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22440 all=312369 active=15835 piece=▁Stro
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22460 all=312495 active=15961 piece=▁Afers
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22480 all=312518 active=15984 piece=▁andal
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22500 all=312560 active=16026 piece=efectes
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=40 min_freq=19
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22520 all=312601 active=15669 piece=▁Vespas
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22540 all=312642 active=15710 piece=▁inflig
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22560 all=312670 active=15738 piece=observar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=22580 all=312676 active=15744 piece=▁elegida
bpe_model_trainer.c

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=37 size=23840 all=316365 active=15793 piece=▁subdivisió
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=37 size=23860 all=316347 active=15775 piece=espectroscòpia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23880 all=316509 active=15937 piece=abo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23900 all=316839 active=16267 piece=▁RT
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=36 min_freq=18
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23920 all=317095 active=16087 piece=xaca
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23940 all=317215 active=16207 piece=▁Tin
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23960 all=317348 active=16340 piece=qüent
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=23980 all=317436 active=16428 piece=▁Chev
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24000 all=317536 active=16528 piece=▁còmp
bpe_model_trainer.cc(167

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=33 min_freq=16
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25320 all=321488 active=16278 piece=ushi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25340 all=321621 active=16410 piece=▁WWF
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25360 all=321704 active=16493 piece=austr
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25380 all=321874 active=16663 piece=▁1490
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25400 all=321919 active=16708 piece=▁Clem
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=33 min_freq=16
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25420 all=322005 active=16178 piece=▁Nuri
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25440 all=322102 active=16275 piece=▁epic
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=25460 all=322182 active=16355 piece=entrev
bpe_model_trainer.cc(258) LOG(INFO) Added: freq

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26740 all=325137 active=16235 piece=▁suburbis
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26760 all=325128 active=16226 piece=▁espècimen
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26780 all=325112 active=16210 piece=▁atmosferes
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26800 all=325098 active=16196 piece=▁vestimenta
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=31 min_freq=15
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26820 all=325078 active=16235 piece=▁subscripció
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=26840 all=325058 active=16215 piece=▁localitzacions
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=26860 all=325255 active=16412 piece=cas
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=26880 all=325563 active=16720 piece=▁SR
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=26900 all=325751 active=16908 piece=hner


bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28160 all=329397 active=16870 piece=▁Paz
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28180 all=329502 active=16975 piece=▁pog
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28200 all=329593 active=17066 piece=hardt
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=28 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28220 all=329687 active=16564 piece=otten
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28240 all=329768 active=16645 piece=▁1608
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28260 all=329847 active=16724 piece=▁Hild
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28280 all=329892 active=16769 piece=▁Tren
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=28300 all=329976 active=16853 piece=▁vici
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=28 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29580 all=332870 active=17288 piece=▁Kri
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29600 all=333005 active=17423 piece=▁fún
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=26 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29620 all=333108 active=16751 piece=cauld
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29640 all=333202 active=16845 piece=ologi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29660 all=333296 active=16939 piece=▁1440
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29680 all=333376 active=17019 piece=▁Homo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29700 all=333457 active=17100 piece=▁Syst
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=26 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=29720 all=333517 active=16729 piece=▁musc
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31020 all=335653 active=17018 piece=erl
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31040 all=335894 active=17259 piece=éré
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31060 all=336037 active=17402 piece=avan
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31080 all=336261 active=17626 piece=ochi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31100 all=336413 active=17778 piece=▁3-1
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=24 min_freq=13
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31120 all=336502 active=16910 piece=▁XXV
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31140 all=336609 active=17017 piece=Udine
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31160 all=336733 active=17141 piece=jarat
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=24 size=31180 all=336809 active=17217 piece=▁1277
bpe_model_trainer.cc(258) LOG(INFO) Added

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=78372 size=60 all=10571 active=6288 piece=qu
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=59706 size=80 all=12608 active=8325 piece=▁A
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=47615 size=100 all=14196 active=9913 piece=ch
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=46472 min_freq=3740
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40370 size=120 all=16162 active=2755 piece=ano
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=34613 size=140 all=18280 active=4873 piece=▁sta
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28009 size=160 all=20709 active=7302 piece=▁pre
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=23740 size=180 all=23014 active=9607 piece=▁vi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=20459 size=200 all=25178 active=11771 piece=zza
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=20380 min_freq=2571
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=18602 size=220

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1479 min_freq=302
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1457 size=1620 all=104723 active=5893 piece=▁=
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1426 size=1640 all=105433 active=6603 piece=bito
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1405 size=1660 all=105857 active=7027 piece=dati
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1388 size=1680 all=106601 active=7771 piece=▁principali
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1376 size=1700 all=107343 active=8513 piece=▁temp
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=1374 min_freq=285
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1359 size=1720 all=107918 active=5920 piece=▁sconfi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1342 size=1740 all=108667 active=6669 piece=▁noto
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=1321 size=1760 all=109086 active=7088 piece=▁econ
bpe_model_trainer.cc(258) LOG(INFO) A

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=643 size=3100 all=146218 active=9679 piece=▁Cre
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=643 min_freq=160
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=638 size=3120 all=146829 active=7863 piece=▁diffici
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=630 size=3140 all=147432 active=8466 piece=ys
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=626 size=3160 all=147913 active=8947 piece=▁immedia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=620 size=3180 all=148172 active=9206 piece=té
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=615 size=3200 all=148548 active=9582 piece=▁America
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=615 min_freq=154
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=610 size=3220 all=148876 active=7755 piece=▁on
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=606 size=3240 all=149415 active=8294 piece=▁Charles
bpe_model_trainer.cc(258) LOG(INFO) Added: freq

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=392 min_freq=108
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=391 size=4620 all=175274 active=8997 piece=▁campionato
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=389 size=4640 all=175829 active=9552 piece=▁consiglio
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=386 size=4660 all=176492 active=10215 piece=mini
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=384 size=4680 all=176872 active=10595 piece=▁lancio
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=383 size=4700 all=177191 active=10914 piece=▁raggiunto
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=382 min_freq=105
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=381 size=4720 all=177587 active=9256 piece=università
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=379 size=4740 all=177915 active=9584 piece=▁conce
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=377 size=4760 all=178237 active=9906 piece=marono
bpe_model_trainer.cc(25

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=272 min_freq=81
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=271 size=6120 all=199107 active=10175 piece=▁matematico
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=269 size=6140 all=199370 active=10438 piece=▁accol
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=268 size=6160 all=199750 active=10818 piece=Irlanda
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=267 size=6180 all=200159 active=11227 piece=▁aggre
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=266 size=6200 all=200307 active=11375 piece=▁premi
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=266 min_freq=80
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=264 size=6220 all=200514 active=10212 piece=cas
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=263 size=6240 all=200756 active=10454 piece=▁Fred
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=263 size=6260 all=200859 active=10557 piece=▁satellite
bpe_model_trainer.cc(258) LOG(

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=202 size=7560 all=216648 active=11541 piece=▁Den
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=201 size=7580 all=216850 active=11743 piece=iano
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=201 size=7600 all=217039 active=11932 piece=▁ribellione
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=201 min_freq=65
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=200 size=7620 all=217479 active=11292 piece=▁legata
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=199 size=7640 all=217781 active=11594 piece=▁even
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=199 size=7660 all=217823 active=11636 piece=▁coalizione
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=198 size=7680 all=218095 active=11908 piece=▁limiti
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=197 size=7700 all=218348 active=12161 piece=acque
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=197 min_freq=64
bpe_model_trainer.cc(258) LOG(

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=160 min_freq=54
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=160 size=9020 all=231710 active=11633 piece=▁vincere
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=159 size=9040 all=231943 active=11866 piece=▁1888
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=159 size=9060 all=232007 active=11930 piece=▁chiusura
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=158 size=9080 all=232248 active=12171 piece=▁voli
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=158 size=9100 all=232278 active=12201 piece=▁esecutivo
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=158 min_freq=54
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=157 size=9120 all=232625 active=11961 piece=▁ampie
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=157 size=9140 all=232701 active=12037 piece=▁organizzò
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=156 size=9160 all=232908 active=12244 piece=▁rivelò
bpe_model_trainer.cc(258

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=130 min_freq=46
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=130 size=10420 all=244072 active=12200 piece=▁intellettuali
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=129 size=10440 all=244557 active=12685 piece=▁1886
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=129 size=10460 all=244671 active=12799 piece=▁pittori
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=128 size=10480 all=244722 active=12850 piece=▁44
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=128 size=10500 all=244994 active=13120 piece=▁priori
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=128 min_freq=46
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=128 size=10520 all=245006 active=12256 piece=▁Conservatorio
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=127 size=10540 all=245538 active=12788 piece=elico
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=127 size=10560 all=245682 active=12932 piece=zionati
bpe_model_tra

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=108 size=11820 all=255220 active=12842 piece=▁ES
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=108 size=11840 all=255620 active=13242 piece=▁1835
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=108 size=11860 all=255748 active=13370 piece=▁preis
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=108 size=11880 all=255797 active=13419 piece=▁divenire
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=107 size=11900 all=255893 active=13515 piece=xel
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=107 min_freq=40
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=107 size=11920 all=256158 active=13048 piece=▁radar
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=107 size=11940 all=256203 active=13093 piece=▁africana
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=107 size=11960 all=256192 active=13082 piece=▁sopravvivere
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=106 size=11980 all=256542 active=13432 piece=chesi
bpe_model

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=92 size=13240 all=264193 active=13390 piece=▁enormi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=92 size=13260 all=264191 active=13388 piece=▁raccolse
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=91 size=13280 all=264201 active=13398 piece=osi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=91 size=13300 all=264552 active=13749 piece=cario
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=91 min_freq=36
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=91 size=13320 all=264761 active=13416 piece=▁mandò
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=91 size=13340 all=264791 active=13446 piece=▁preciso
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=91 size=13360 all=264793 active=13448 piece=▁geografia
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=90 size=13380 all=265058 active=13713 piece=Alta
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=90 size=13400 all=265299 active=13954 piece=▁1799
bpe_model_trainer.cc(

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=79 min_freq=32
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=79 size=14720 all=272710 active=13753 piece=ellino
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=79 size=14740 all=272811 active=13854 piece=▁ottimi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=79 size=14760 all=272828 active=13871 piece=▁favorito
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=79 size=14780 all=272822 active=13865 piece=▁elementare
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=78 size=14800 all=272883 active=13926 piece=ice
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=78 min_freq=32
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=78 size=14820 all=273209 active=13957 piece=▁pig
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=78 size=14840 all=273351 active=14099 piece=ccordo
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=78 size=14860 all=273424 active=14172 piece=▁sorti
bpe_model_trainer.cc(258) LOG(INFO

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=69 min_freq=29
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16120 all=279965 active=14350 piece=mile
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16140 all=280163 active=14548 piece=lusso
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16160 all=280325 active=14710 piece=ticità
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16180 all=280370 active=14755 piece=▁rocca
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16200 all=280419 active=14804 piece=▁storio
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=69 min_freq=29
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16220 all=280456 active=14051 piece=▁caroling
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=69 size=16240 all=280465 active=14060 piece=▁Heidelberg
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=68 size=16260 all=280447 active=14042 piece=Ab
bpe_model_trainer.cc(258) LOG(INFO) 

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=61 size=17560 all=287216 active=14474 piece=▁causate
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=61 size=17580 all=287209 active=14467 piece=▁Southern
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=61 size=17600 all=287200 active=14458 piece=▁praticare
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=61 min_freq=26
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=61 size=17620 all=287181 active=14341 piece=▁sconosciute
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=60 size=17640 all=287487 active=14647 piece=back
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=60 size=17660 all=287682 active=14842 piece=mbert
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=60 size=17680 all=287805 active=14965 piece=▁Stur
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=60 size=17700 all=287851 active=15011 piece=▁Luisa
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=60 min_freq=26
bpe_model_trainer.cc(258) L

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=54 size=19000 all=292745 active=14781 piece=▁avvenuti
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=54 min_freq=24
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=54 size=19020 all=292750 active=14643 piece=▁ceramiche
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=54 size=19040 all=292745 active=14638 piece=▁pubblicita
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=54 size=19060 all=292738 active=14631 piece=▁circonferenza
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=53 size=19080 all=292966 active=14859 piece=ioux
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=53 size=19100 all=293225 active=15118 piece=▁pel
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=53 min_freq=24
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=53 size=19120 all=293375 active=14803 piece=École
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=53 size=19140 all=293460 active=14888 piece=▁Ring
bpe_model_trainer.cc(258

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=48 min_freq=22
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20420 all=298245 active=15067 piece=▁ces
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20440 all=298404 active=15226 piece=▁1575
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20460 all=298503 active=15325 piece=▁Lisa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20480 all=298629 active=15451 piece=▁Bosco
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20500 all=298688 active=15510 piece=▁zolfo
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=48 min_freq=22
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20520 all=298711 active=14958 piece=▁basava
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20540 all=298734 active=14981 piece=▁Chimica
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=48 size=20560 all=298731 active=14978 piece=▁fermata
bpe_model_trainer.cc(258) LOG(INFO) A

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21860 all=303130 active=15128 piece=▁ripristino
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=44 size=21880 all=303117 active=15115 piece=▁sorprendente
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=21900 all=303334 active=15332 piece=mna
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=43 min_freq=21
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=21920 all=303660 active=15472 piece=inde
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=21940 all=303893 active=15705 piece=▁Usa
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=21960 all=303984 active=15796 piece=▁1618
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=21980 all=304079 active=15891 piece=▁Shir
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=43 size=22000 all=304167 active=15979 piece=tativa
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=43 min_freq=20
bpe_model_trainer.cc(258) LOG(INFO)

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=23280 all=308317 active=15439 piece=▁nascosta
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=23300 all=308310 active=15432 piece=▁combinata
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=40 min_freq=19
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=23320 all=308304 active=15410 piece=▁Eisenhower
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=23340 all=308299 active=15405 piece=▁interazione
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=40 size=23360 all=308279 active=15385 piece=▁drasticamente
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=39 size=23380 all=308452 active=15558 piece=laí
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=39 size=23400 all=308619 active=15725 piece=andò
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=39 min_freq=19
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=39 size=23420 all=308803 active=15609 piece=▁Kul
bpe_model_trainer.c

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24720 all=312654 active=15725 piece=ettico
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24740 all=312700 active=15771 piece=▁Sachs
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24760 all=312756 active=15827 piece=▁sensu
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24780 all=312788 active=15859 piece=▁Strate
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24800 all=312810 active=15881 piece=▁lipidi
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=36 min_freq=18
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24820 all=312832 active=15662 piece=▁Britain
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24840 all=312818 active=15648 piece=▁escluse
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24860 all=312816 active=15646 piece=▁tornano
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=36 size=24880 all=312806 active=15636 piece=▁confessa
bpe_model_tra

bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=33 min_freq=17
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26220 all=316806 active=15895 piece=▁spera
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26240 all=316836 active=15925 piece=▁Coloro
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26260 all=316847 active=15936 piece=▁disabi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26280 all=316871 active=15960 piece=▁vostra
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26300 all=316869 active=15958 piece=▁Jenkins
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=33 min_freq=17
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26320 all=316859 active=15834 piece=▁confuci
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26340 all=316857 active=15832 piece=▁routine
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=33 size=26360 all=316840 active=15815 piece=▁adottati
bpe_model_trainer.cc(258) 

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=27620 all=319649 active=15969 piece=▁Albuquerque
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=27640 all=319629 active=15949 piece=▁supervisore
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=31 size=27660 all=319611 active=15931 piece=▁raggiungibile
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27680 all=319797 active=16117 piece=VII
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27700 all=320041 active=16361 piece=▁FS
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=30 min_freq=16
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27720 all=320185 active=16138 piece=onis
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27740 all=320341 active=16294 piece=▁Das
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27760 all=320467 active=16420 piece=Earth
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=30 size=27780 all=320576 active=16529 piece=▁1501
bpe_model_traine

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29040 all=323653 active=16420 piece=umati
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29060 all=323662 active=16429 piece=▁1667
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29080 all=323730 active=16497 piece=▁Itur
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29100 all=323824 active=16591 piece=▁dira
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=28 min_freq=15
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29120 all=323926 active=16285 piece=distan
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29140 all=324012 active=16371 piece=▁Carte
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29160 all=324063 active=16422 piece=▁Sando
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29180 all=324081 active=16440 piece=▁climi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=28 size=29200 all=324092 active=16451 piece=▁ribos
bpe_model_trainer.cc(167) LO

bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30460 all=326950 active=16572 piece=▁June
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30480 all=327014 active=16636 piece=▁Uffi
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30500 all=327079 active=16701 piece=attiva
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=26 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30520 all=327165 active=16436 piece=vibile
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30540 all=327208 active=16479 piece=▁Maure
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30560 all=327254 active=16525 piece=▁bifor
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30580 all=327303 active=16574 piece=▁subli
bpe_model_trainer.cc(258) LOG(INFO) Added: freq=26 size=30600 all=327353 active=16624 piece=tivismo
bpe_model_trainer.cc(167) LOG(INFO) Updating active symbols. max_freq=26 min_freq=14
bpe_model_trainer.cc(258) LOG(INFO) Add

trainer_interface.cc(626) LOG(INFO) Saving vocabs: sentencepiece_models/tgt_src.vocab
INFO:tensorflow:Converting SentencePiece vocabulary to OpenNMT-tf format...


In [26]:
# Tokenize the test/train/val split of every corpus.
# `tokenize` is defined in an earlier cell; it is called as
# tokenize("<name>.txt", "<name>") — presumably producing the "<name>.tok"
# files referenced by the training configs (TODO confirm against its definition).
# The iteration order reproduces the order of the original explicit calls.
for corpus in (
  "src_tgt", "tgt_src",  # direct source-target pair (config_src_tgt / config_baseline)
  "src", "pvt_src",      # source-pivot pair (config_src_pvt)
  "pvt_tgt", "tgt",      # pivot-target pair (config_pvt_tgt)
):
  for split in ("test", "train", "val"):
    tokenize(f"{corpus}_{split}.txt", f"{corpus}_{split}")

'tgt.txt'

In [8]:
# Configuration of the source -> pivot model, passed to opennmt.Runner.
# Data and vocabulary entries are files, so they are written without a trailing
# "/" (a trailing slash denotes a directory and makes a plain open() fail);
# this also matches the path style used by config_src_tgt.
config_src_pvt = {
    "model_dir": "src_pvt_model/",  # directory that receives the checkpoints
    "data": {
        "train_features_file": "src_pvt_data/src_train.tok",
        "train_labels_file": "src_pvt_data/pvt_src_train.tok",
        "eval_features_file": "src_pvt_data/src_val.tok",
        "eval_labels_file": "src_pvt_data/pvt_src_val.tok",
        "source_vocabulary": "sentencepiece_models/src.vocab",
        "target_vocabulary": "sentencepiece_models/pvt_src.vocab",
    },
    "train": {
        "max_step": 25000,
        "save_checkpoints_steps": 500,
        "keep_checkpoint_max": 2,
    },
    "eval": {
        "save_eval_predictions": True,
        # NOTE(review): this value is larger than train.max_step (25000) —
        # confirm that skipping periodic evaluation (and therefore the early
        # stopping check below) during training is intended.
        "steps": 50000,
        "max_exports_to_keep": 2,
        "early_stopping": {
            "metric": "loss",
            "min_improvement": 0.1,
            "steps": 100,
        },
    }
}

# Configuration of the pivot -> target model, passed to opennmt.Runner.
# Data and vocabulary entries are files, so they are written without a trailing
# "/" (a trailing slash denotes a directory and makes a plain open() fail);
# this also matches the path style used by config_src_tgt.
config_pvt_tgt = {
    "model_dir": "/content/pvt_tgt_model/",  # directory that receives the checkpoints
    "data": {
        "train_features_file": "/content/pvt_tgt_data/pvt_tgt_train.tok",
        "train_labels_file": "/content/pvt_tgt_data/tgt_train.tok",
        "eval_features_file": "/content/pvt_tgt_data/pvt_tgt_val.tok",
        "eval_labels_file": "/content/pvt_tgt_data/tgt_val.tok",
        "source_vocabulary": "/content/sentencepiece_models/pvt_tgt.vocab",
        "target_vocabulary": "/content/sentencepiece_models/tgt.vocab",
    },
    "train": {
        "max_step": 25000,
        "save_checkpoints_steps": 500,
        "keep_checkpoint_max": 2,
    },
    "eval": {
        "save_eval_predictions": True,
        # NOTE(review): this value is larger than train.max_step (25000) —
        # confirm that skipping periodic evaluation (and therefore the early
        # stopping check below) during training is intended.
        "steps": 50000,
        "max_exports_to_keep": 2,
        "early_stopping": {
            "metric": "loss",
            "min_improvement": 0.1,
            "steps": 100,
        },
    }
}

# Configuration of the source -> target model that is trained from the
# transferred (pretrained) weights; passed to opennmt.Runner.
config_src_tgt = dict(
    # Directory that receives the checkpoints.
    model_dir="/content/src_tgt_model/",
    # Tokenized training/validation files and the two vocabularies.
    data=dict(
        train_features_file="/content/src_tgt_data/src_tgt_train.tok",
        train_labels_file="/content/src_tgt_data/tgt_src_train.tok",
        eval_features_file="/content/src_tgt_data/src_tgt_val.tok",
        eval_labels_file="/content/src_tgt_data/tgt_src_val.tok",
        source_vocabulary="/content/sentencepiece_models/src_tgt.vocab",
        target_vocabulary="/content/sentencepiece_models/tgt_src.vocab",
    ),
    # Training length and checkpoint retention.
    train=dict(
        max_step=25000,
        save_checkpoints_steps=500,
        keep_checkpoint_max=2,
    ),
    # Evaluation and early-stopping settings.
    eval=dict(
        save_eval_predictions=True,
        steps=50000,
        max_exports_to_keep=2,
        early_stopping=dict(
            metric="loss",
            min_improvement=0.1,
            steps=100,
        ),
    ),
)

# Configuration of the baseline source -> target model (trained without any
# transferred weights); passed to opennmt.Runner. It reads the same data and
# vocabulary files as config_src_tgt but writes to its own model directory.
# Data and vocabulary entries are files, so they are written without a trailing
# "/" (a trailing slash denotes a directory and makes a plain open() fail).
config_baseline = {
    "model_dir": "/content/baseline_model/",  # directory that receives the checkpoints
    "data": {
        "train_features_file": "/content/src_tgt_data/src_tgt_train.tok",
        "train_labels_file": "/content/src_tgt_data/tgt_src_train.tok",
        "eval_features_file": "/content/src_tgt_data/src_tgt_val.tok",
        "eval_labels_file": "/content/src_tgt_data/tgt_src_val.tok",
        "source_vocabulary": "/content/sentencepiece_models/src_tgt.vocab",
        "target_vocabulary": "/content/sentencepiece_models/tgt_src.vocab",
    },
    "train": {
        "max_step": 25000,
        "save_checkpoints_steps": 500,
        "keep_checkpoint_max": 2,
    },
    "eval": {
        "save_eval_predictions": True,
        # NOTE(review): this value is larger than train.max_step (25000) —
        # confirm that skipping periodic evaluation (and therefore the early
        # stopping check below) during training is intended.
        "steps": 50000,
        "max_exports_to_keep": 2,
        "early_stopping": {
            "metric": "loss",
            "min_improvement": 0.1,
            "steps": 100,
        },
    }
}

In [9]:
# Noam decay schedule with scale 2.0, model dimension 512 and 8000 warmup steps.
learning_rate = opennmt.schedules.NoamDecay(
    scale=2.0,
    model_dim=512,
    warmup_steps=8000,
)
# LazyAdam optimizer driven by the schedule above. Later cells hand this single
# instance to create_variables() and Checkpoint.from_config() for every model.
optimizer = tfa.optimizers.LazyAdam(learning_rate=learning_rate)

2021-12-11 13:26:20.188249: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-12-11 13:26:20.188701: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-12-11 13:26:20.188745: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (DESKTOP-3JSBOFT): /proc/driver/nvidia/version does not exist
2021-12-11 13:26:20.191701: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Training source-pivot model
# Build a TransformerBase model and a Runner bound to config_src_pvt
# (auto_config=True is requested from the Runner).
src_pvt_model = opennmt.models.TransformerBase()
src_pvt_runner = opennmt.Runner(src_pvt_model, config_src_pvt, auto_config=True)
# Keep the finalized configuration: the weight-restore cell reads its
# 'data', 'params' and 'model_dir' entries.
# NOTE(review): _finalize_config is a private Runner method — it may change
# between OpenNMT-tf releases.
sp_config = src_pvt_runner._finalize_config(training=True)

# Train on one device with evaluation enabled.
src_pvt_runner.train(num_devices=1, with_eval=True)

INFO:tensorflow:Using OpenNMT-tf version 2.23.0
INFO:tensorflow:Using model:
(model): TransformerBase(
  (examples_inputter): SequenceToSequenceInputter(
    (features_inputter): WordEmbedder()
    (labels_inputter): WordEmbedder()
    (inputters): ListWrapper(
      (0): WordEmbedder()
      (1): WordEmbedder()
    )
  )
  (encoder): SelfAttentionEncoder(
    (position_encoder): SinusoidalPositionEncoder(
      (reducer): SumReducer()
    )
    (layer_norm): LayerNorm()
    (layers): ListWrapper(
      (0): SelfAttentionEncoderLayer(
        (self_attention): TransformerLayerWrapper(
          (layer): MultiHeadAttention(
            (linear_queries): Dense(512)
            (linear_keys): Dense(512)
            (linear_values): Dense(512)
            (linear_output): Dense(512)
          )
          (input_layer_norm): LayerNorm()
        )
        (ffn): TransformerLayerWrapper(
          (layer): FeedForwardNetwork(
            (inner): Dense(2048)
            (outer): Dense(512)
  

INFO:tensorflow:Using parameters:
data:
  eval_features_file: src_pvt_data/src_val.txt/
  eval_labels_file: src_pvt_data/pvt_src_val.txt/
  source_vocabulary: sentencepiece_models/src.vocab/
  target_vocabulary: sentencepiece_models/pvt_src.vocab/
  train_features_file: src_pvt_data/src_train.txt/
  train_labels_file: src_pvt_data/pvt_src_train.txt/
eval:
  batch_size: 32
  batch_type: examples
  early_stopping:
    metric: loss
    min_improvement: 0.1
    steps: 100
  length_bucket_width: 5
  max_exports_to_keep: 2
  save_eval_predictions: true
  steps: 50000
infer:
  batch_size: 32
  batch_type: examples
  length_bucket_width: 5
model_dir: src_pvt_model/
params:
  average_loss_in_time: true
  beam_width: 4
  decay_params:
    model_dim: 512
    warmup_steps: 8000
  decay_type: NoamDecay
  label_smoothing: 0.1
  learning_rate: 2.0
  num_hypotheses: 1
  optimizer: LazyAdam
  optimizer_params:
    beta_1: 0.9
    beta_2: 0.998
score:
  batch_size: 64
  batch_type: examples
  length_buc

2021-12-11 13:29:12.141066: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 2527594 of 31750988
2021-12-11 13:29:22.141071: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 5057119 of 31750988
2021-12-11 13:29:32.141073: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 7580650 of 31750988
2021-12-11 13:29:42.141072: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 10105716 of 31750988
2021-12-11 13:29:52.141068: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 12633059 of 31750988
2021-12-11 13:30:02.141069: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:380] Filling up shuffle buffer (this may take a while): 15151844 of 31750988
2021-12-11 13:30:12.141075: I tensorflow/core/kernels/d

INFO:tensorflow:Number of model parameters: 93326081
INFO:tensorflow:Number of model weights: 260 (trainable = 260, non trainable = 0)
INFO:tensorflow:Saved checkpoint src_pvt_model/ckpt-1


KeyboardInterrupt: 

In [None]:
# Training pivot-target model
# Build a TransformerBase model and a Runner bound to config_pvt_tgt
# (auto_config=True is requested from the Runner).
pvt_tgt_model = opennmt.models.TransformerBase()
pvt_tgt_runner = opennmt.Runner(pvt_tgt_model, config_pvt_tgt, auto_config=True)
# Keep the finalized configuration: the weight-restore cell reads its
# 'data', 'params' and 'model_dir' entries.
# NOTE(review): _finalize_config is a private Runner method — it may change
# between OpenNMT-tf releases.
pt_config = pvt_tgt_runner._finalize_config(training=True)

# Train on one device with evaluation enabled.
pvt_tgt_runner.train(num_devices=1, with_eval=True)

In [None]:
# Restore both models weights
# Each pretrained model is paired with the finalized config it was trained with.
# Requires src_pvt_model/sp_config, pvt_tgt_model/pt_config and optimizer from
# the earlier cells.
_pretrained = (
  (src_pvt_model, sp_config),
  (pvt_tgt_model, pt_config),
)

# Phase 1: initialize every model and create its variables (same shared
# optimizer for all of them), before any checkpoint is touched.
for _model, _cfg in _pretrained:
  _model.initialize(data_config=_cfg['data'], params=_cfg['params'])
  _model.create_variables(optimizer=optimizer)

# Phase 2: load the weights stored under each model directory.
# `checkpoint_path` and `checkpoint` end up referring to the pivot-target model.
for _model, _cfg in _pretrained:
  checkpoint_path = _cfg['model_dir']
  checkpoint = checkpoint_util.Checkpoint.from_config(_cfg, _model, optimizer=optimizer)
  checkpoint.restore(checkpoint_path=checkpoint_path, weights_only=True)

# Phase 3: print the parameter counts of both restored models.
for _model, _ in _pretrained:
  count_weights(_model)

In [None]:
# Transfer weights to src_tgt_model
# Build a fresh source-target model and its Runner, and keep the finalized
# configuration for the initialize/checkpoint calls below.
# NOTE(review): _finalize_config is a private Runner method.
src_tgt_model = opennmt.models.TransformerBase()
src_tgt_runner = opennmt.Runner(src_tgt_model, config_src_tgt, auto_config=True)
st_config = src_tgt_runner._finalize_config(training=True)

# Initialize the model from the finalized data/params sections and create its
# variables with the shared optimizer.
src_tgt_model.initialize(data_config=st_config['data'], params=st_config['params'])
src_tgt_model.create_variables(optimizer=optimizer)

# Replace the new model's encoder with the source-pivot encoder and its decoder
# with the pivot-target decoder (the objects themselves are shared, not copied).
# NOTE(review): only the encoder and decoder attributes are swapped; everything
# else on src_tgt_model stays as created above — confirm this is the intended
# scope of the transfer and that the decoder matches the target vocabulary.
src_tgt_model.encoder = src_pvt_model.encoder
src_tgt_model.decoder = pvt_tgt_model.decoder

# Save a checkpoint of the assembled model using st_config, so the training
# cell for the source-target model can presumably start from it — TODO confirm.
new_checkpoint = checkpoint_util.Checkpoint.from_config(st_config, src_tgt_model, optimizer=optimizer)
new_checkpoint.save()

In [7]:
# Training source-target model (using pretrained models)
# A new model object and Runner are created for config_src_tgt; the names
# src_tgt_model, src_tgt_runner and st_config from the transfer cell are rebound.
# NOTE(review): presumably training picks up the checkpoint that the transfer
# cell saved in the configured model_dir — confirm before relying on it.
src_tgt_model = opennmt.models.TransformerBase()
src_tgt_runner = opennmt.Runner(src_tgt_model, config_src_tgt, auto_config=True)
# NOTE(review): _finalize_config is a private Runner method.
st_config = src_tgt_runner._finalize_config(training=True)
# Train on one device with evaluation enabled.
src_tgt_runner.train(num_devices=1, with_eval=True)

INFO:tensorflow:Using OpenNMT-tf version 2.19.0
INFO:tensorflow:Using model:
(model): TransformerBase(
  (examples_inputter): SequenceToSequenceInputter(
    (features_inputter): WordEmbedder()
    (labels_inputter): WordEmbedder()
    (inputters): ListWrapper(
      (0): WordEmbedder()
      (1): WordEmbedder()
    )
  )
  (encoder): SelfAttentionEncoder(
    (position_encoder): SinusoidalPositionEncoder(
      (reducer): SumReducer()
    )
    (layer_norm): LayerNorm()
    (layers): ListWrapper(
      (0): SelfAttentionEncoderLayer(
        (self_attention): TransformerLayerWrapper(
          (layer): MultiHeadAttention(
            (linear_queries): Dense(512)
            (linear_keys): Dense(512)
            (linear_values): Dense(512)
            (linear_output): Dense(512)
          )
          (input_layer_norm): LayerNorm()
        )
        (ffn): TransformerLayerWrapper(
          (layer): FeedForwardNetwork(
            (inner): Dense(2048)
            (outer): Dense(512)
  

In [None]:
# Training source-target model (using no models)
# Baseline: a TransformerBase model and Runner bound to config_baseline, with
# no weight transfer step before training.
baseline_model = opennmt.models.TransformerBase()
baseline_runner = opennmt.Runner(baseline_model, config_baseline, auto_config=True)

# Train on one device with evaluation enabled.
baseline_runner.train(num_devices=1, with_eval=True)

In [None]:
# Compute scores
# Both systems are scored on the same test files; only the runner and the file
# that receives the predictions differ.
# NOTE(review): these are the ".txt" files, whereas the training configs point
# at ".tok" files — confirm compute_scores expects untokenized input here.
_test_files = {
    "features_filename": "/content/src_tgt_data/src_tgt_test.txt",
    "labels_filename": "/content/src_tgt_data/tgt_src_test.txt",
}

# Baseline model trained without transferred weights.
baseline_scores = compute_scores(
    runner=baseline_runner,
    pred_filename="/content/baseline_pred.txt",
    **_test_files)

# Source-target model trained after the pivot-based weight transfer.
pivot_based_tl_scores = compute_scores(
    runner=src_tgt_runner,
    pred_filename="/content/src_to_tgt_pred.txt",
    **_test_files)

# Report the score dictionaries returned by compute_scores.
print(f"============ Baseline Source-Target NMT Evaluation ============\n {baseline_scores}")
print(f"============ Pretrain Source-Target NMT Evaluation ============\n {pivot_based_tl_scores}")