In [7]:
# List the working directory to see which corpus, split, config and log files exist.
%ls

'Sentence pairs in English-Toki Pona - 2024-04-02.tsv'   src-eval.txt
 attempt1.ipynb                                          src-train.txt
 broken_tgt_nor_in.vocab                                 src-val.txt
 config.yaml                                             src.txt
 from_scratch.slurm                                      tgt-eval.txt
 [0m[01;34mmodels[0m/                                                 tgt-train.txt
 [01;34mmodels_early_stop[0m/                                      tgt-val.txt
 slurm-28958415.out                                      tgt.txt
 slurm-28958627.out                                      train.log
 slurm-28958779.out


In [1]:
from collections import defaultdict

import string

import matplotlib.pyplot as plt

def remove_punct(s: str) -> str:
    """Strip ASCII punctuation and tabs from ``s`` and normalise its spaces.

    Steps, in order:
      1. delete every character of ``string.punctuation`` and every tab;
      2. turn no-break spaces (U+00A0) into ordinary spaces;
      3. collapse every run of spaces into a single space.

    Newlines and leading/trailing spaces are left untouched.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text.
    """
    # One translate() pass replaces the per-character replace() loop.
    s = s.translate(str.maketrans("", "", string.punctuation + "\t"))
    s = s.replace(chr(160), " ")  # no-break space -> regular space
    # Repeat until stable: a single replace() only halves a run of spaces,
    # so long runs (or runs created by deleted punctuation) need several passes.
    while "  " in s:
        s = s.replace("  ", " ")
    return s

https://link.springer.com/chapter/10.1007/978-3-031-36616-1_52

Base Model configs

![](https://media.springernature.com/lw368/springer-static/image/chp%3A10.1007%2F978-3-031-36616-1_52/MediaObjects/539942_1_En_52_Tab2_HTML.png)

In [2]:
# Full training configuration (data, vocab, optimisation and a small
# 2-layer transformer), written verbatim to config.yaml.
# Fix: the length filter previously listed `src_seq_length` twice, so the
# target-side limit was never set; the second entry is now `tgt_seq_length`.
# NOTE: batch_type is "sents", so batch_size counts sentences even though the
# inline YAML comment still says "Tokens per batch".
config_model = '''# config_base_model.yaml


## Where the samples will be written
save_data: run

# Training files
data:
    corpus_1:
        path_src: src-train.txt
        path_tgt: tgt-train.txt
    valid:
        path_src: src-val.txt
        path_tgt: tgt-val.txt

# Vocabulary files, generated by onmt_build_vocab
src_vocab: src.vocab
tgt_vocab: tgt.vocab

# Vocabulary size - should be the same as in sentence piece
src_vocab_size: 50000
tgt_vocab_size: 50000

# Filter out source/target longer than n if [filtertoolong] enabled
src_seq_length: 150
tgt_seq_length: 150

# Tokenization options
src_subword_model: source.model
tgt_subword_model: target.model

# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models_scratch/modelv1

# Stop training if it does not imporve after n validations
# early_stopping: 4

# Default: 5000 - Save a model checkpoint for each n
save_checkpoint_steps: 1000

# To save space, limit checkpoints to last n
# keep_checkpoint: 3

seed: 3435

# Default: 100000 - Train the model to max n steps
# Increase to 200000 or more for large datasets
# For fine-tuning, add up the required steps to the original steps
# decrease steps 
train_steps: 5000

# # Default: 10000 - Run validation after n steps
# # once every 78 
valid_steps: 500
valid_metrics: ["BLEU"]

# # Early Stop
# early_stopping: 10
# early_stopping_criteria: "accuracy"

# early_stopping:
#     # (optional) The target metric name (default: "loss").
#     metric: bleu
#     # (optional) The metric should improve at least by this much to be considered
#     # as an improvement (default: 0)
#     min_improvement: 0.01
#     steps: 10

# Default: 4000 - for large datasets, try up to 8000
warmup_steps: 1000
report_every: 100

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
bucket_size: 17096 # whole corpus
num_workers: 8  # Default: 2, set to 0 when RAM out of memory
batch_type: "sents"
batch_size: 128   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
optim: "adam"
learning_rate: 2
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "sents"

# Model
encoder_type: transformer
decoder_type: transformer
position_encoding: true
layers: 2
heads: 2
word_vec_size: 128
hidden_size: 128
transformer_ff: 512
dropout_steps: [0]
dropout: [0.1]
attention_dropout: [0.1]

'''
# Plain "w" is enough: the file is only written here, never read back.
with open("config.yaml", "w") as config_yaml:
    config_yaml.write(config_model)

In [None]:
# Bare-bones training config: data, vocab, batching and optimisation settings
# only (there is no model section). The text is written unchanged to
# base_config.yaml.
base_config = """
save_data: run

# Training files
data:
    corpus_1:
        path_src: src-train.txt
        path_tgt: tgt-train.txt
    valid:
        path_src: src-val.txt
        path_tgt: tgt-val.txt

# Vocabulary files, generated by onmt_build_vocab
src_vocab: src.vocab
tgt_vocab: tgt.vocab

# Where to save the log file and the output models/checkpoints
log_file: train.log
save_model: models_scratch/modelbase

train_steps: 10000
report_every: 100

# # Default: 10000 - Run validation after n steps
# # once every 78 
valid_steps: 1000
valid_metrics: ["BLEU"]

# Number of GPUs, and IDs of GPUs
world_size: 1
gpu_ranks: [0]

# Batching
bucket_size: 17096 # whole corpus
num_workers: 8  # Default: 2, set to 0 when RAM out of memory
batch_type: "sents"
batch_size: 128   # Tokens per batch, change when CUDA out of memory
valid_batch_size: 2048
max_generator_batches: 2
accum_count: [4]
accum_steps: [0]

# Optimization
optim: "adam"
learning_rate: 2
decay_method: "noam"
adam_beta1: 0.9
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.1
param_init: 0
param_init_glorot: true
normalization: "sents"

"""

# Persist the config text next to the notebook.
with open("base_config.yaml", "w+") as base_yaml_out:
    base_yaml_out.write(base_config)

The model is evaluated after every 9984 training sentences (batch size 128 × 78 steps).

In [3]:
# Map each normalised Toki Pona sentence to the set of distinct normalised
# English sentences it is paired with in the TSV (column 1 is read as English,
# column 3 as Toki Pona).
unique_tok = defaultdict(set)
# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("Sentence pairs in English-Toki Pona - 2024-04-02.tsv", "r", encoding="utf-8") as pairFile:
    for pair in pairFile:
        fields = pair.rstrip("\n").split("\t")
        en = remove_punct(fields[1].lower())
        # Keys are deliberately newline-terminated: they are later written out
        # verbatim as complete lines. Previously the newline was only present
        # by accident, when column 3 happened to be the last field of the row.
        tok = remove_punct(fields[3].lower()) + "\n"
        unique_tok[tok].add(en)

# Each entry of `pairs` is (Toki Pona sentence, English sentence).
pairs = []
for tok_sentence, en_translations in unique_tok.items():
    # Drop ambiguous sentences, i.e. Toki Pona sentences that have more than
    # one distinct English translation.
    if len(en_translations) == 1:
        pairs.append((tok_sentence, next(iter(en_translations))))

# NOTE: shuffling is disabled, so `pairs` stays in first-seen file order.
# import random

# random.shuffle(pairs)

print(len(pairs))

24424


In [4]:
# Write the parallel corpus, one sentence per line: the first element of each
# pair goes to src.txt and the second to tgt.txt, so line i of both files
# belongs to the same pair.
with open("src.txt", "w") as srcFile, open("tgt.txt", "w") as tgtFile:
    for src_sentence, tgt_sentence in pairs:
        # Normalise the line ending on both sides instead of relying on the
        # source sentence already carrying its own trailing newline.
        srcFile.write(src_sentence.rstrip("\n") + "\n")
        tgtFile.write(tgt_sentence.rstrip("\n") + "\n")

In [5]:
def splitSrcTarget(srcName, valName, trainName, evalName, train_size, val_size):
    """Split a one-sentence-per-line file into validation, training and eval files.

    Lines are consumed in file order: the first ``val_size`` lines go to
    ``valName``, the next ``train_size`` lines to ``trainName`` and all
    remaining lines to ``evalName``. The number of lines written to each file
    is printed in that order (validation, training, eval).

    Args:
        srcName: Path of the file to split.
        valName: Output path for the validation split.
        trainName: Output path for the training split.
        evalName: Output path for the eval split (the remainder).
        train_size: Number of lines for the training split.
        val_size: Number of lines for the validation split.

    Returns:
        None. Output files are created or overwritten.
    """
    from itertools import islice

    def _copy_lines(lines, out_name):
        """Write every line of ``lines`` to ``out_name`` and print the count."""
        written = 0
        with open(out_name, "w") as out_file:
            for line in lines:
                out_file.write(line)
                written += 1
        print(written)

    with open(srcName, "r") as src_file:
        # islice() draws from the shared file iterator, so each split resumes
        # exactly where the previous one stopped. A size of 0 now yields an
        # empty file (the old post-write check always emitted one line).
        _copy_lines(islice(src_file, max(val_size, 0)), valName)
        _copy_lines(islice(src_file, max(train_size, 0)), trainName)
        _copy_lines(src_file, evalName)
# Split both sides with identical sizes so line i of every src-* file still
# pairs with line i of the matching tgt-* file.
# 17096 training lines (70%), 2442 validation lines (10%); the remainder (20%)
# becomes the eval split.
for split_file_names in (
    ("src.txt", "src-val.txt", "src-train.txt", "src-eval.txt"),
    ("tgt.txt", "tgt-val.txt", "tgt-train.txt", "tgt-eval.txt"),
):
    splitSrcTarget(*split_file_names, 17096, 2442)

2442
17096
4886
2442
17096
4886


In [8]:
# Build the source/target vocabularies from the settings in config.yaml.
# "-n_sample -1" counts tokens over the full dataset rather than a sample.
!onmt_build_vocab -config config.yaml -n_sample -1 -num_threads 2

Corpus corpus_1's weight should be given. We default it to 1 for you.
[2024-04-03 19:52:54,240 INFO] Counter vocab from -1 samples.
[2024-04-03 19:52:54,240 INFO] n_sample=-1: Build vocab on full datasets.
[2024-04-03 19:52:54,576 INFO] Counters src: 934
[2024-04-03 19:52:54,576 INFO] Counters tgt: 7088


Sometimes the vocab breaks, hmm