<a href="https://colab.research.google.com/github/notAlex2/Translation-Team08-IFT6759/blob/master/notebooks/harman_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook shows how to train a byte-level Byte-Pair Encoding (BPE) tokenizer on your own dataset. Byte-level BPE also handles out-of-vocabulary words, because an unseen word can still be split into known sub-word units.

In [0]:
import os

# Project root (presumably on a mounted Google Drive — confirm the mount exists
# before running). Relative paths used later, such as the tokenizer folder,
# resolve against this directory once it becomes the working directory.
project_path = "/content/drive/My Drive/machine-translation"
# Training corpora are read from the `data` sub-folder of the project.
data_path = os.path.join(project_path, 'data')

os.chdir(project_path)

In [0]:
! pip install transformers===2.7.0 

try:
    import tokenizers
except ImportError as e:
    ! pip install tokenizers

In [0]:
import io
import json
from typing import List, Union

import tensorflow as tf
import transformers
from tensorflow.keras.preprocessing.sequence import pad_sequences

def set_seed(seed):
    """Seed TensorFlow's global random number generator.

    Args:
        seed: Value handed unchanged to ``tf.random.set_seed``.
    """
    seed_tensorflow = tf.random.set_seed
    seed_tensorflow(seed)

# Seed TensorFlow once, right after the imports, with a fixed value.
set_seed(10)

In [0]:
def train_tokenizer(save_tokenizer_path: str,
                    training_files: Union[str, List[str]],
                    special_tokens: List[str],
                    min_frequency: int,
                    lowercase: bool,
                    VOCAB_SIZE: int) -> None:
    """Train a byte-level BPE tokenizer from scratch and save it to disk.

    The tokenizer only needs to be trained once; afterwards it can be loaded
    from ``save_tokenizer_path``.

    Args:
        save_tokenizer_path: Directory that receives the tokenizer files. It is
            created (including missing parents) if it does not exist yet.
        training_files: Path, or list of paths, of the text files to train on.
        special_tokens: Special tokens to register, e.g. ``["<s>", "<pad>"]``.
        min_frequency: Passed to the trainer as ``min_frequency``.
        lowercase: Passed to ``ByteLevelBPETokenizer`` as ``lowercase``.
        VOCAB_SIZE: Target vocabulary size; also written to ``config.json``.

    Returns:
        None. Files are written into ``save_tokenizer_path``: whatever the
        tokenizer's ``save`` emits (merges.txt and vocab.json) plus
        ``config.json``.
    """
    # Create the vocabulary from scratch.
    bpe_tokenizer = tokenizers.ByteLevelBPETokenizer(lowercase=lowercase)
    bpe_tokenizer.train(
        files=training_files,
        vocab_size=VOCAB_SIZE,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=special_tokens,
    )

    # exist_ok avoids the check-then-create race of `if not exists: makedirs`.
    os.makedirs(save_tokenizer_path, exist_ok=True)

    # This saves 2 files, which are required later by the tokenizer:
    # merges.txt and vocab.json.
    # NOTE(review): newer tokenizers releases are believed to use `save_model`
    # for directory output — confirm before upgrading the pinned version.
    bpe_tokenizer.save(save_tokenizer_path)

    # config.json records the vocabulary size and declares the model type as
    # "roberta" (presumably so AutoTokenizer can pick the matching tokenizer
    # class when loading from this folder — confirm).
    config = {
        "vocab_size": VOCAB_SIZE,
        "model_type": "roberta",
    }
    config_path = os.path.join(save_tokenizer_path, "config.json")
    with open(config_path, 'w', encoding='utf-8') as fp:
        json.dump(config, fp)

### Train Tokenizer using vocab files

To train a new tokenizer, you need to provide 
* list of data-files on which you want to train your tokenizer
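* special tokens such as `"<s>", "<pad>", "</s>", "<unk>", "<mask>"`
* vocabulary size
* `min_frequency` (int): the minimum number of times a pair must occur to be merged
* `lowercase` (bool): whether the tokenizer lowercases its input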
* path to save your tokenizer

In [27]:
# --- Tokenizer hyper-parameters ---
VOCAB_SIZE = 60000
min_frequency = 2
lowercase = True
special_tokens = ["<s>","<pad>","</s>","<unk>","<mask>"]

# --- Text files (under data_path) the tokenizer is trained on ---
unaligned_en_path = os.path.join(data_path, 'unaligned.en')
aligned_en_path = os.path.join(data_path, 'train.lang1')
training_files = [unaligned_en_path, aligned_en_path]

# Folder (relative to the working directory) that holds the trained tokenizer.
pretrained_tokenizer_path = "tokenizer_data"

# Training is needed only once; flip the flag to False to train again.
ALREADY_TRAINED_ONCE = True
if not ALREADY_TRAINED_ONCE:
    # train your Byte Pair Tokenizer on your own training files!
    train_tokenizer(save_tokenizer_path=pretrained_tokenizer_path,
                    training_files=training_files,
                    special_tokens=special_tokens,
                    min_frequency=min_frequency,
                    lowercase=lowercase,
                    VOCAB_SIZE=VOCAB_SIZE)
else:
    print("Tokenizer already trained. Set ALREADY_TRAINED_ONCE=False to re-train tokenizer")

Tokenizer already trained. Set ALREADY_TRAINED_ONCE=False to re-train tokenizer


### Load Saved Tokenizer 
The tokenizer has already been trained by this point. Training (`train_tokenizer(args)`) produces 3 files: `merges.txt`, `vocab.json` and `config.json`. Please note that you only have to train the tokenizer once.

To load the tokenizer, set `pretrained_tokenizer_path` to the folder where the three files mentioned above are saved.

In [0]:
from transformers import AutoTokenizer

# The folder must contain all 3 files: config.json, merges.txt and vocab.json.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_tokenizer_path,
    cache_dir=None,
)

### How to use Tokenizer  


*   To tokenize a sentence, use `tokens = tokenizer.tokenize(sentence)`
*   To encode a sentence to integers, use `encoded_sequence = tokenizer.encode(sentence)`. Note that it also adds the start and end tokens, i.e. `<s>` and `</s>`, to the encoded output.
*   To decode/untokenize a sentence, use `tokenizer.decode(encoded_sequence, skip_special_tokens=True)`
* Padding is not built into this tokenizer, so we pad with Keras's `pad_sequences`. Pass `value=tokenizer.pad_token_id` so that the tokenizer's own pad token is used.


Usage of this tokenizer is shown in following examples.



In [29]:
# Mixed-case input: the saved output shows "Montreal" split into 'M', 'ont', 'real'.
text = "Montreal is a great city".strip()
tokenizer.tokenize(text)

['M', 'ont', 'real', 'Ġis', 'Ġa', 'Ġgreat', 'Ġcity']

Capitalized and lowercased inputs give different results, so it is up to the user to decide how to normalize the input before passing it to the tokenizer. (The tokenizer above was trained with `lowercase = True`.)

In [31]:
# Same sentence, lowercased before tokenizing; `text` stays lowercased for the cells that follow.
text = "Montreal is a great city".strip().lower()
tokenizer.tokenize(text)

['mont', 'real', 'Ġis', 'Ġa', 'Ġgreat', 'Ġcity']

In [32]:
# Convert the lowercased sentence to vocabulary ids; the saved output starts with id 0 and ends with id 2.
encoded_seq = tokenizer.encode(text)
encoded_seq

[0, 18325, 306, 263, 805, 3195, 2]

In [33]:
# Decode the ids back to text, keeping special tokens such as <s> and </s> in the result.
tokenizer.decode(encoded_seq, skip_special_tokens=False)

'<s> montreal is a great city</s>'

In [35]:
# Decode without special tokens; .strip() trims surrounding whitespace from the decoded string.
tokenizer.decode(encoded_seq, skip_special_tokens=True).strip()

'montreal is a great city'

In [37]:
# encode_plus returns a dict; the saved output shows 'input_ids' and 'attention_mask' entries.
tokens = tokenizer.encode_plus(text)
tokens

{'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'input_ids': [0, 18325, 306, 263, 805, 3195, 2]}

In [38]:
# The ids alone; the saved output matches what tokenizer.encode(text) returned.
tokens["input_ids"]

[0, 18325, 306, 263, 805, 3195, 2]

In [40]:
# Mask of special-token positions; the ids passed in already include <s> and </s>.
# In the saved output the first and last positions are 1, the rest 0.
tokenizer.get_special_tokens_mask(tokens["input_ids"], already_has_special_tokens=True)

[1, 0, 0, 0, 0, 0, 1]

In [41]:
# Same mask computed from encoded_seq, which holds the same ids as tokens["input_ids"].
tokenizer.get_special_tokens_mask(encoded_seq, already_has_special_tokens=True)

[1, 0, 0, 0, 0, 0, 1]

In [42]:
# Pad on the right with the tokenizer's own pad id up to a fixed length of 15.
# NOTE: pad_sequences truncates from the front by default (truncating='pre'),
# so an input longer than maxlen would lose its leading ids, including <s>.
padded_seq = pad_sequences([tokens["input_ids"]], padding='post', value=tokenizer.pad_token_id, maxlen=15)
padded_seq[0]

array([    0, 18325,   306,   263,   805,  3195,     2,     1,     1,
           1,     1,     1,     1,     1,     1], dtype=int32)

In [43]:
# NOTE(review): in the saved output the padded positions are reported as 0,
# i.e. <pad> ids are not flagged as special tokens by this call.
tokenizer.get_special_tokens_mask(padded_seq[0], already_has_special_tokens=True)

[1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

#### Un-tokenize inputs

In [0]:
# Decode the padded ids, keeping <s>, </s> and <pad> in the text.
# NOTE(review): the saved output looks stale — it is capitalised and shows five
# <pad> tokens, whereas padded_seq was built from the lowercased text with
# eight pad ids. Re-run this cell to refresh it.
tokenizer.decode(padded_seq[0], skip_special_tokens=False)

'<s> Montreal is a great city</s><pad><pad><pad><pad><pad>'

In [0]:
# Decode the padded ids with special tokens removed.
# NOTE(review): the saved output is capitalised although padded_seq was built
# from the lowercased text — it looks stale; re-run to refresh.
tokenizer.decode(padded_seq[0], skip_special_tokens=True)

' Montreal is a great city'

In [44]:
# Encode several sentences in a single call.
# The saved output shows sequences of different lengths (10 and 9 ids),
# i.e. no padding is applied by this call as used here.
text1 = "Montreal is a great city".strip()
text2 = "California has good weather".strip()

texts = [text1, text2]
tokenizer.batch_encode_plus(texts)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[0, 225, 49, 2096, 9317, 306, 263, 805, 3195, 2],
  [0, 225, 39, 289, 8955, 407, 793, 5872, 2]]}

### Use of Tokenizer with `tf.Dataset` Object

In [0]:
# dataloader helpers
def tokenize_string(tokenizer, raw_string):
    """Return ``raw_string`` unchanged.

    Placeholder: ``tokenizer`` is accepted but not used.
    """
    unchanged = raw_string
    return unchanged
  
def encode_dataset(file_path, pretrained_tokenizer_path, lowercase, num_examples):
    """Encode the first sentences of a text file and pad them to equal length.

    Args:
        file_path: UTF-8 text file with one sentence per line.
        pretrained_tokenizer_path: Folder passed to ``AutoTokenizer.from_pretrained``.
        lowercase: If truthy, each sentence is lowercased before encoding.
        num_examples: Number of leading sentences to use (``None`` uses all).

    Returns:
        A tuple ``(encoded_sequences, decoded_sequences)``: the post-padded ids
        returned by ``pad_sequences`` and, for each padded row, the text
        produced by ``tokenizer.decode`` (pad tokens included).
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_path)

    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with io.open(file_path, encoding='UTF-8') as corpus_file:
        sentences = corpus_file.read().strip().split('\n')

    selected = sentences[:num_examples]
    if lowercase:
        selected = [sentence.lower() for sentence in selected]
    encoded_sequences = [tokenizer.encode(sentence) for sentence in selected]

    # Pad on the right with the tokenizer's own pad id so all rows share one length.
    encoded_sequences = pad_sequences(encoded_sequences, padding='post', value=tokenizer.pad_token_id)
    decoded_sequences = [tokenizer.decode(x) for x in encoded_sequences]
    return encoded_sequences, decoded_sequences


def data_generator_fn():
    """Yield ``(encoded ids, decoded text)`` pairs from the unaligned English corpus.

    Encodes the first 10 sentences (lowercased) with ``encode_dataset`` and
    yields one pair per sentence.
    """
    example_count = 10
    use_lowercase = True
    ids_per_sentence, text_per_sentence = encode_dataset(
        unaligned_en_path,
        pretrained_tokenizer_path,
        use_lowercase,
        example_count,
    )

    yield from zip(ids_per_sentence, text_per_sentence)

BATCH_SIZE = 2

# Build the input pipeline step by step: generator -> batches -> prefetch.
dataset = tf.data.Dataset.from_generator(
    data_generator_fn,
    output_types=(tf.int32, tf.string),
)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [50]:
# Inspect the first batch only: `enc` holds the padded ids, `dec` the decoded
# strings (the saved output shows <pad> tokens kept in the shorter sentence).
for enc, dec in dataset.take(1):
    print(enc)
    print(dec)

tf.Tensor(
[[    0   325   266  1354  4876   288   266  8427   324   648   728  1102
  14787    16  1372    16  3012    16  1397   299  2886    17 13103    18
    330   411  1819    18     2     1     1     1     1     1     1     1]
 [    0   853    75   728   491   544  3766   266  2506  3470    16   698
    314   438   266   767   478  5719  4873   282  3769  1414    16  5122
    352   290 22882   325   500 22840 22578   547    87  2976    18     2]], shape=(2, 36), dtype=int32)
tf.Tensor(
[b"<s> for the second phase of the trials we just had different sizes, small, medium, large and extra-large. it's true.</s><pad><pad><pad><pad><pad><pad><pad>"
 b'<s> geng had been my host the previous january, when i was the first us defense secretary to visit china, acting as an interlocutor for president jimmy carter\xe2\x80\x99s administration.</s>'], shape=(2,), dtype=string)
