<a href="https://colab.research.google.com/github/hashk1/english-tokipona-translator/blob/main/english_tokipona_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# English-tokipona translator on Google Colaboratory

### Install libraries

In [None]:
# Install the dependencies in a dedicated cell. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel.
%pip install tatoebatools
# Clone the tensor2tensor fork only when it is not already checked out, so that
# re-running this cell does not fail on the existing directory.
![ -d tensor2tensor ] || git clone https://github.com/hashk1/tensor2tensor.git -b hashk1-fix-trainer-and-decoder
%pip install ./tensor2tensor

### Download data from Tatoeba

In [None]:
# tatoebatools entry points: `tatoeba` carries the library configuration,
# `ParallelCorpus` iterates over sentence/translation pairs.
from tatoebatools import tatoeba
from tatoebatools import ParallelCorpus

# Use ./data as the tatoebatools data directory.
tatoeba.dir = "./data"

In [None]:
# Dump the English / toki pona sentence pairs into two line-aligned text files:
# line N of english.txt is translated by line N of tokipona.txt.
# The encoding is pinned to UTF-8 so the files do not depend on the locale of
# the machine running the notebook.
with open("english.txt", "w", encoding="utf-8") as english_file, \
     open("tokipona.txt", "w", encoding="utf-8") as tokipona_file:
  for sentence, translation in ParallelCorpus("eng", "toki"):
    english_file.write(sentence.text + "\n")
    tokipona_file.write(translation.text + "\n")

### Add scripts to generate t2t formatted data

In [None]:
# Create the ./t2t directory (passed later as --t2t_usr_dir) and give it an
# __init__.py that imports the problem module `myproblem`.
!mkdir -p t2t
!echo "from . import myproblem" > t2t/__init__.py

In [None]:
import os

# Source of the tensor2tensor problem module. It is written to disk with plain
# Python file I/O instead of `!echo '$code'`, so quotes, `$` or backslashes in
# the code can never be mangled by the shell.
code = """from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems
from tensor2tensor.utils import registry


@registry.register_problem
class eng2toki(text_problems.Text2TextProblem):
    # English -> toki pona text-to-text problem, selected with --problem=eng2toki.

    @property
    def approx_vocab_size(self):
        # Target size of the generated vocabulary (8192 entries).
        return 2**13

    @property
    def is_generate_per_split(self):
        # generate_samples() ignores dataset_split and yields every pair once;
        # False leaves the division into the shards below to tensor2tensor.
        return False

    @property
    def dataset_splits(self):
        # 9 training shards and 1 evaluation shard.
        return [{
            "split": problem.DatasetSplit.TRAIN,
            "shards": 9,
        }, {
            "split": problem.DatasetSplit.EVAL,
            "shards": 1,
        }]

    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        # Yields one dict with "inputs" (English) and "targets" (toki pona)
        # per pair of lines; pairs with an empty side are skipped.
        filename_input = "./english.txt"
        filename_output = "./tokipona.txt"

        # Read as UTF-8 explicitly so the result does not depend on the locale.
        with open(filename_input, encoding="utf-8") as f_in:
            with open(filename_output, encoding="utf-8") as f_out:
                for src, tgt in zip(f_in, f_out):
                    src = src.strip()
                    tgt = tgt.strip()
                    if not src or not tgt:
                        continue
                    yield {"inputs": src, "targets": tgt}"""

# Make sure the package directory exists, then write the module followed by a
# trailing newline (the same file layout `echo` produced).
os.makedirs("t2t", exist_ok=True)
with open("t2t/myproblem.py", "w", encoding="utf-8") as module_file:
  module_file.write(code + "\n")

### Run data generator

In [None]:
# Generate the data for the eng2toki problem. The problem class is loaded from
# the user directory ./t2t; data_dir is the current directory.
!t2t-datagen \
  --t2t_usr_dir=./t2t \
  --problem=eng2toki \
  --data_dir=. \
  --tmp_dir=./t2t

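### Train the model
- The default number of training steps is 250,000.
- To train for a different number of steps, add the option `--train_steps=XXXX`; the checkpoint step used in the final "Save model to Google Drive" cell must then be changed to match.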

In [None]:
# Train a transformer on the eng2toki problem. Data is read from the current
# directory and the trainer's output goes to ./t2t.
!t2t-trainer \
  --t2t_usr_dir=./t2t \
  --problem=eng2toki \
  --model=transformer \
  --hparams_set=transformer_base_single_gpu \
  --data_dir=. \
  --output_dir=./t2t

### Run decoder
- Input: an English sentence; output: its toki pona translation.
- Enter `q` to quit the decoder.

In [None]:
# Start the interactive decoder with the same problem, model and hparams set
# as the training run, reading the model from ./t2t.
!t2t-decoder \
  --t2t_usr_dir=./t2t \
  --problem=eng2toki \
  --model=transformer \
  --hparams_set=transformer_base_single_gpu \
  --data_dir=./ \
  --output_dir=./t2t \
  --model_dir=./t2t \
  --decode_hparams="beam_size=4,alpha=0.6" \
  --decode_interactive=true

### Save model to Google Drive



In [None]:
# Mount Google Drive at /content/drive so files can be copied out of the
# Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Create the destination folder on the mounted Drive. The path is relative, so
# this assumes the working directory is /content (where Drive is mounted).
!mkdir -p drive/MyDrive/t2t/

In [None]:
# Copy the training artefacts to Google Drive. The trainer was started with
# --output_dir=./t2t, so every file it produced lives under t2t/, not in the
# current directory.

# Step of the checkpoint to save. 250000 is the default number of training
# steps; change it if the trainer was run with --train_steps.
TRAIN_STEPS = 250000

!cp -pr t2t/eval* drive/MyDrive/t2t/
!cp -p t2t/*.py drive/MyDrive/t2t/
# Output files whose names start with c, e, f, g or h. The glob also matches
# the eval directory, hence -r.
!cp -pr t2t/[cefgh]* drive/MyDrive/t2t/
!cp -p t2t/model.ckpt-{TRAIN_STEPS}.* drive/MyDrive/t2t/
# NOTE(review): the vocabulary file generated by t2t-datagen is written to
# data_dir (.), not t2t/, and is not copied here - confirm whether it should
# be saved alongside the checkpoint.