In [0]:
# (Optional) Mount Google Drive so that model files can be written to it.
# The output directory is configurable, so using Drive is not mandatory.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [2]:
# The default tokenizer uses the SentencePiece library, so install it first.
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/00/95/7f357995d5eb1131aa2092096dca14a6fc1b1d2860bd99c22a612e1d1019/sentencepiece-0.1.82-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K    100% |████████████████████████████████| 1.0MB 18.6MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.82


In [3]:
# Clone the project repository.
# Do not forget the "--recursive" option: it clones the submodule
# (the tokenization package used by the default tokenizer) at the same time.
!git clone --recursive https://github.com/iki-taichi/tf-keras-transformer.git

Cloning into 'tf-keras-transformer'...
remote: Enumerating objects: 28, done.[K
remote: Counting objects: 100% (28/28), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 28 (delta 3), reused 23 (delta 2), pack-reused 0[K
Unpacking objects: 100% (28/28), done.
Submodule 'src/tokenization' (https://github.com/iki-taichi/tokenization.git) registered for path 'src/tokenization'
Cloning into '/content/tf-keras-transformer/src/tokenization'...
remote: Enumerating objects: 21, done.        
remote: Counting objects: 100% (21/21), done.        
remote: Compressing objects: 100% (15/15), done.        
remote: Total 21 (delta 5), reused 15 (delta 3), pack-reused 0        
Submodule path 'src/tokenization': checked out 'f3290db6b1179985b2a29b7968b8d6fcbef48f73'


In [0]:
# From here on, we work inside the repository directory; later cells use
# paths relative to it (e.g. `src/...` and `data/...`).
import os

REPO_DIR = 'tf-keras-transformer'

# Only change directory when we are not already inside the checkout.
# Without this guard, re-running the cell would look for a nested
# 'tf-keras-transformer/tf-keras-transformer' directory and raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != REPO_DIR:
    os.chdir(REPO_DIR)

In [5]:
# Prepare the demo dataset.
# It is based on https://alaginrc.nict.go.jp/WikiCorpus/index_E.html
# and contains about 0.5 million English-Japanese sentence pairs.
!python src/get_kyoto_corpus.py

start downloading...
downloaded
start editing files...
to convert 14111 xml files...
data sample:(data/kyoto_corpus/EPR/EPR00709.xml)["Ikisomimi no mikoto (or Okisomimi no mikoto, the date of birth and death unknown) was a member of the Imperial family who appears in 'Nihonshoki' (Chronicles of Japan) who lived during the Kofun period (tumulus period).", '息石耳命（いきそみみのみこと、またはおきそみみのみこと、生没年未詳）は、「日本書紀」に登場する古墳時代の皇族。']
data sample:(data/kyoto_corpus/EPR/EPR00709.xml)['He was the first Imperial prince of Emperor Annei, and his mother was Nunasokonakatsuhime no mikoto.', '安寧天皇の第一皇子で、母は渟名底仲媛命。']
data sample:(data/kyoto_corpus/EPR/EPR00709.xml)['His younger maternal half-brothers include Emperor Itoku.', '同母弟に懿徳天皇らがいる。']
data sample:(data/kyoto_corpus/EPR/EPR00709.xml)["Amatoyotsuhime no mikoto (Empress of Emperor Itoku) was his only child, and it is described in 'Sendai Kujihongi' (Ancient Japanese History) that he had no heir.", '子は天豊津媛命（懿徳天皇皇后）のみで、跡継ぎがなかったと「先代旧事本紀」に記されている。']
data/kyoto_corpus/

In [0]:
# Import the fitting module from the repository's `src` package.
from src.fitting import FitEnvironment

In [7]:
# Configure the fitting run: gather the options first, then build the
# environment from them.
fit_options = dict(
    use_tpu=True,
    batch_size=8,
    input_len=(1024, 1024),
    num_epoch=5,
    output_dir='/content/drive/My Drive/transformer_model',
    data_path=['data/kyoto_en_ja.csv'],
    valid_path=['data/kyoto_en_ja_valid.csv'],
    resume_model_path=None,
    resume_initial_epoch=None,
)
env = FitEnvironment(**fit_options)

# Start fitting. Training 5 epochs on about 0.5 million samples takes a while.
# Timing observed by the original author:
#   Total elapsed time: 6483 s
#     - before the fit_generator method started: 119 s
#     - compilation of the model on TPU for train mode: 151 s
#     - re-compilation for eval mode: 68 s
#     - all other computation: 6145 s
# (The author thinks 5 epochs is not enough for convergence.)
wrapper = env.run()

INFO:tensorflow:training starting with env={
  "model_config": "{\n  \"use_tpu\": true,\n  \"tpu_grpc_url\": \"grpc://10.27.174.2:8470\",\n  \"src_tokenizer\": \"sp_uncase_en_ja_40000\",\n  \"tar_tokenizer\": \"sp_uncase_en_ja_40000\",\n  \"use_same_embed\": true,\n  \"block_num\": [\n    6,\n    6\n  ],\n  \"embed_dim\": 768,\n  \"hidden_dim\": 3072,\n  \"head_num\": 12,\n  \"attention_activation\": \"relu\",\n  \"feed_forward_activation\": \"gelu\",\n  \"dropout_rate\": 0.1,\n  \"input_len\": [\n    1024,\n    1024\n  ],\n  \"token_num\": [\n    null,\n    null\n  ]\n}",
  "use_tpu": true,
  "input_len": [
    1024,
    1024
  ],
  "work_dir": "model/translator_en_ja",
  "output_dir": "/content/drive/My Drive/transformer_model",
  "data_path": [
    "data/kyoto_en_ja.csv"
  ],
  "valid_path": [
    "data/kyoto_en_ja_valid.csv"
  ],
  "show_model_summary": true,
  "batch_size": 8,
  "num_epoch": 5,
  "warm_step": 4000,
  "train_callbacks": [
    "<src.custom_callbacks.BatchLearningRat

In [0]:
# Prediction
# The loaded model runs on CPU because the TPU requires the batch size to be
# divisible by the number of devices (8).
# TensorFlow must be imported here: no earlier cell binds the name `tf`, so
# without this import the cell raises NameError on a fresh kernel.
import tensorflow as tf

# Clear the TPU configuration left in the current thread before creating the
# prediction model.
tf.keras.backend.clear_session()
from src.transformer import TransformerWrapper
trans = TransformerWrapper('/content/drive/My Drive/transformer_model/model.05.hdf5')

In [26]:
# Translate a short English sentence with the loaded model.
trans('Kyoto is a Japanese city.')

'京都(きょうと)は、日本の市。'

In [30]:
# Try a sentence that contains a historical proper noun (TAIRA no Kiyomori).
trans('A statue of a Buddhist monk, taken to be TAIRA no Kiyomori, holding scriptures.')

'平清盛像を安置し、経巻を施す。'

In [31]:
# Try a long sentence with quoted terms and a parenthetical remark.
trans("The expression 'Hongan-ji Temple power' is used here because 'Hongan-ji Temple school' would be confused with the current Jodo Shinshu Hongan-ji school (Nishi Hongan-ji Temple school).")

'「本願寺勢力」という表現は、現在の浄土真宗本願寺派と混同され、現在の本願寺派と混同されている。'