<a href="https://colab.research.google.com/github/notAlex2/Translation-Team08-IFT6759/blob/master/notebooks/harman_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook shows how to train a byte-level Byte-Pair Encoding (BPE) tokenizer on your own dataset. Because BPE can split unseen words into smaller known pieces, it also handles out-of-vocabulary words!

In [0]:
import os

# Project folder on Google Drive (assumes Drive is already mounted under /content/drive).
project_path = "/content/drive/My Drive/machine-translation"
# Raw text corpora live in the `data` sub-folder of the project.
data_path = os.path.join(project_path, 'data')

# Work from the project root so relative paths used later resolve against it.
os.chdir(project_path)

In [0]:
! pip install transformers===2.7.0 

try:
    import tokenizers
except ImportError as e:
    ! pip install tokenizers

In [0]:
import io
import json
from typing import List, Union

import tensorflow as tf
import transformers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import (
    PreTrainedTokenizer, 
    AutoTokenizer
)


def set_seed(seed: int) -> None:
    """Set TensorFlow's random seed (via ``tf.random.set_seed``) to ``seed``."""
    tf.random.set_seed(seed)

# Fix the seed once, before any TensorFlow work happens.
set_seed(10)

In [0]:
def train_tokenizer(save_tokenizer_path: str,
                    training_files: Union[str, List[str]],
                    VOCAB_EXIST: bool,
                    VOCAB_SIZE: int) -> None:
    """Train a byte-level BPE tokenizer and save it under ``save_tokenizer_path``.

    The tokenizer only needs to be trained once. When ``VOCAB_EXIST`` is true the
    function returns immediately without touching the file system.

    Args:
        save_tokenizer_path: Directory that receives ``merges.txt``, ``vocab.json``
            and ``config.json``. It is created (including parents) if missing.
        training_files: Path, or list of paths, of the text files to train on.
        VOCAB_EXIST: Pass True when a tokenizer has already been saved.
        VOCAB_SIZE: Vocabulary size to train for; also recorded in ``config.json``.
    """
    # Need to train tokenizer only once
    if VOCAB_EXIST:
        return

    # create vocab from scratch
    bpe_tokenizer = tokenizers.ByteLevelBPETokenizer(lowercase=True)
    # Keep this order: the outputs later in the notebook show <s>=0, <pad>=1, </s>=2.
    special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe_tokenizer.train(
        files=training_files,
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        show_progress=True,
        special_tokens=special_tokens,
    )

    # exist_ok avoids the separate exists() check and the race that comes with it.
    os.makedirs(save_tokenizer_path, exist_ok=True)

    # This saves 2 files, which are required later by the tokenizer: merges.txt and vocab.json.
    # Newer `tokenizers` releases expose the directory export as `save_model` (there,
    # `save` writes a single JSON file), so use it when available and fall back to
    # the older `save(directory)` otherwise.
    save_to_directory = getattr(bpe_tokenizer, "save_model", None)
    if save_to_directory is None:
        save_to_directory = bpe_tokenizer.save
    save_to_directory(save_tokenizer_path)

    # roBERTa model is better than BERT for language modelling
    config = {
        "vocab_size": VOCAB_SIZE,
        "model_type": "roberta",
    }

    config_path = os.path.join(save_tokenizer_path, "config.json")
    with open(config_path, "w", encoding="utf-8") as fp:
        json.dump(config, fp)

In [0]:
VOCAB_SIZE = 40000
pretrained_tokenizer_path = "tokenizer_data"

# Training is needed only once. Rather than relying on a hand-edited flag (which
# breaks a fresh run when nothing has been saved yet), skip training only when the
# three files the tokenizer loader needs are already present.
VOCAB_EXIST = all(
    os.path.isfile(os.path.join(pretrained_tokenizer_path, file_name))
    for file_name in ("config.json", "merges.txt", "vocab.json")
)

unaligned_en_path = os.path.join(data_path, 'unaligned.en')
aligned_en_path = os.path.join(data_path, 'train.lang1')
training_files = [unaligned_en_path, aligned_en_path]

# train your Byte Pair Tokenizer on your own training files!
train_tokenizer(pretrained_tokenizer_path, training_files, VOCAB_EXIST, VOCAB_SIZE)

### Load Saved Tokenizer

In [0]:
# Load the saved tokenizer from the local directory.
# make sure the path contains 3 files: config.json, merges.txt and vocab.json
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path, cache_dir=None)

### How to use Tokenizer

In [7]:
# Split a sentence into sub-word tokens (strings, not ids). In the output below,
# "Montreal" is broken into three pieces and tokens that followed a space carry a 'Ġ' prefix.
text = "Montreal is a great city".strip()
tokenizer.tokenize(text)

['M', 'ont', 'real', 'Ġis', 'Ġa', 'Ġgreat', 'Ġcity']

In [8]:
# Map the sentence straight to token ids; per the output, the ids start with 0 and end with 2.
encoded_seq = tokenizer.encode(text)
encoded_seq

[0, 225, 49, 2096, 9317, 306, 263, 805, 3195, 2]

In [9]:
# decode sequence back! With skip_special_tokens=False the <s> and </s> markers stay in the text.
tokenizer.decode(encoded_seq, skip_special_tokens=False)

'<s> Montreal is a great city</s>'

In [10]:
# Same decode with special tokens dropped; note the leading space that remains in the output.
tokenizer.decode(encoded_seq, skip_special_tokens=True)

' Montreal is a great city'

In [11]:
# encode_plus returns a dict; the output shows 'input_ids' plus an all-ones 'attention_mask'.
tokens = tokenizer.encode_plus(text)
tokens

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0, 225, 49, 2096, 9317, 306, 263, 805, 3195, 2]}

In [12]:
# Just the ids: the same list that tokenizer.encode(text) produced earlier.
tokens["input_ids"]

[0, 225, 49, 2096, 9317, 306, 263, 805, 3195, 2]

In [13]:
# Mask over the ids: in the output, 1 marks the first and last positions (<s>, </s>) and 0 the rest.
tokenizer.get_special_tokens_mask(tokens["input_ids"], already_has_special_tokens=True)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1]

In [14]:
# pad sequences! Right-pad ('post') the single sequence to length 15 using the
# tokenizer's pad id (1 in the output below).
padded_seq = pad_sequences([tokens["input_ids"]], padding='post', value=tokenizer.pad_token_id, maxlen=15)
padded_seq[0]

array([   0,  225,   49, 2096, 9317,  306,  263,  805, 3195,    2,    1,
          1,    1,    1,    1], dtype=int32)

In [15]:
# NOTE: in the output the trailing <pad> positions get 0, i.e. padding is not
# flagged as special by this call; only the <s> and </s> positions are.
tokenizer.get_special_tokens_mask(padded_seq[0], already_has_special_tokens=True)

[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]

#### Un-tokenize inputs

In [16]:
# Decode the padded ids, keeping special tokens: the output ends with one <pad> per padded position.
tokenizer.decode(padded_seq[0], skip_special_tokens=False)

'<s> Montreal is a great city</s><pad><pad><pad><pad><pad>'

In [17]:
# With skip_special_tokens=True the <s>, </s> and <pad> markers are all removed from the output.
tokenizer.decode(padded_seq[0], skip_special_tokens=True)

' Montreal is a great city'

In [18]:
# encode batch in one go! Per the output, each sentence keeps its own length
# (10 and 9 ids here), so no padding is applied by this call.
text1 = "Montreal is a great city".strip()
text2 = "California has good weather".strip()

texts = [text1, text2]
tokenizer.batch_encode_plus(texts)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[0, 225, 49, 2096, 9317, 306, 263, 805, 3195, 2],
  [0, 225, 39, 289, 8955, 407, 793, 5872, 2]]}

### Use of Tokenizer with a `tf.data.Dataset` Object

In [0]:
# dataloader helpers
def tokenize_string(tokenizer, raw_string):
    """Return the token ids that ``tokenizer.encode`` produces for ``raw_string``."""
    return tokenizer.encode(raw_string)
  
def encode_dataset(file_path, pretrained_tokenizer_path, num_examples):
    """Encode the first ``num_examples`` lines of a text file and pad them to equal length.

    Args:
        file_path: UTF-8 text file with one sentence per line.
        pretrained_tokenizer_path: Location passed to ``AutoTokenizer.from_pretrained``.
        num_examples: Number of leading sentences to keep (used as a slice bound).

    Returns:
        A tuple ``(encoded_sequences, decoded_sequences)``: the post-padded id
        sequences returned by ``pad_sequences``, and the result of decoding each
        padded row back to text. Special and pad tokens are not skipped when decoding.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path)

    # Read inside a context manager so the file handle is closed deterministically.
    with io.open(file_path, encoding='UTF-8') as text_file:
        sentences = text_file.read().strip().split('\n')

    encoded_sequences = [tokenize_string(tokenizer, sentence) for sentence in sentences[:num_examples]]
    # Pad on the right with the tokenizer's own pad id so rows share one length.
    encoded_sequences = pad_sequences(encoded_sequences, padding='post', value=tokenizer.pad_token_id)
    decoded_sequences = [tokenizer.decode(padded_ids) for padded_ids in encoded_sequences]
    return encoded_sequences, decoded_sequences


def data_generator_fn():
    """Yield ``(encoded_ids, decoded_text)`` pairs for the first few sentences of the corpus.

    Reads from ``unaligned_en_path`` using the tokenizer stored at
    ``pretrained_tokenizer_path``; both are taken from the notebook's globals.
    """
    sample_limit = 10
    id_rows, text_rows = encode_dataset(unaligned_en_path, pretrained_tokenizer_path, sample_limit)

    # Pair each padded id row with its decoded string and hand them out one by one.
    yield from zip(id_rows, text_rows)

BATCH_SIZE = 2

# Build the input pipeline step by step:
# 1) stream (int32 ids, string text) pairs from the Python generator,
dataset = tf.data.Dataset.from_generator(data_generator_fn, output_types=(tf.int32, tf.string))
# 2) group them into batches of BATCH_SIZE,
dataset = dataset.batch(BATCH_SIZE)
# 3) let tf.data choose the prefetch buffer size.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [20]:
# Pull a single batch and print it: `enc` holds the padded token ids,
# `dec` the matching decoded byte strings (see the output below).
for enc, dec in dataset.take(1):
    print(enc)
    print(dec)

tf.Tensor(
[[    0   225    42   280   266  1354  4876   288   266  8427   324   648
    728  1102 14787    16  1372    16  3012    16  1397   299  2886    17
  13103    18   225    45    88   411  1819    18     2     1     1     1
      1     1     1     1     1     1     1     1     1     1     1     1
      1     1     1     1     1]
 [    0   225    43  1443   728   491   544  3766   266  2506   225    46
   3454    16   698   225    45   438   266   767   225    57    55  5719
   4873   282  3769   225    39    76  1132    16  5122   352   290 22882
    325   225    52   357  1798   225    46   377  1326   225    39 13824
    547    87  2976    18     2]], shape=(2, 53), dtype=int32)
tf.Tensor(
[b"<s> For the second phase of the trials we just had different sizes, small, medium, large and extra-large. It's true.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>"
 b'<s> Geng had been my host the previous January, when I was the