In [2]:
from tokenizers import Tokenizer, models, \
    trainers, normalizers, pre_tokenizers, processors, decoders
import sentencepiece as spm

# Prepare corpus for training tokenizer

In [2]:
with open('./data/collection.tsv', 'r') as rf, open('./data/raw_texts/collection_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [3]:
with open('./data/queries.train.tsv', 'r') as rf, open('./data/raw_texts/queries.train_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [4]:
with open('./data/queries.dev.tsv', 'r') as rf, open('./data/raw_texts/queries.dev_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

In [5]:
with open('./data/queries.eval.tsv', 'r') as rf, open('./data/raw_texts/queries.eval_corpus.txt', 'w') as wf:
    for line in rf.readlines():
        wf.write(line.split('\t')[1])

# Tokenizer training

## Huggingface tokenizer

### BPE tokenizer

In [6]:
bpe_tokenizer = Tokenizer(models.BPE())
bpe_tokenizer.normalizer = normalizers.NFC()
bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

bpe_tokenizer.decoder = decoders.WordPiece('@@')
trainer = trainers.BpeTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unknow_token='[UNK]',
    vocab_size=32000,
    min_frequency=0,
    continuing_subword_prefix='@@')

In [7]:
files = [f'./data/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
bpe_tokenizer.train(files=files, trainer=trainer)

In [None]:
print(bpe_tokenizer.encode('Hi i am a studen. My name is Hai.').tokens)
print(bpe_tokenizer.encode('Hi i am a studen. My name is Hai.').ids)

['Hi', 'i', 'am', 'a', 'stud', '@@en', '.', 'My', 'name', 'is', 'H', '@@ai', '.']
[7390, 75, 947, 67, 1253, 591, 16, 2124, 979, 609, 42, 2523, 16]


In [None]:
bpe_tokenizer.decode([42, 323, 75, 947, 67, 1253, 591, 16, 2124, 979, 609, 42, 2523, 16])

'Hi i am a studen. My name is Hai.'

### WordPiece tokenizer

In [None]:
# WP tokenizer inspired by BERT tokenizer
wp_tokenizer = Tokenizer(models.WordPiece())
wp_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC(),
                                                normalizers.Lowercase(),
                                                normalizers.StripAccents()])
wp_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
wp_tokenizer.decoder = decoders.WordPiece()
wp_tokenizer.post_processor = processors.TemplateProcessing(
    single='[CLS] $A [SEP]',
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ('[CLS]', 1),
        ('[SEP]', 2),
    ],
)
trainer = trainers.WordPieceTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unknow_token='[UNK]',
    vocab_size=32000,
    min_frequency=0,)

In [None]:
files = [f'./data/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
wp_tokenizer.train(files=files, trainer=trainer)

In [None]:
wp_tokenizer.get_vocab()

{'scams': 29694,
 'county': 1266,
 '##roll': 2880,
 'democrat': 17749,
 'tate': 27420,
 'ein': 11524,
 'alter': 6240,
 'ont': 7346,
 'considerable': 12379,
 '495': 25734,
 'analyzed': 16510,
 'bun': 8140,
 'dishes': 8239,
 'foreclosure': 14692,
 'calcit': 31453,
 '##var': 22310,
 '##woods': 26307,
 'material': 1747,
 'nodule': 23099,
 'appeared': 6539,
 '##lobacter': 31909,
 'sciences': 8548,
 '##ister': 1879,
 'espe': 2241,
 'entity': 6834,
 '##burgh': 8931,
 'secrets': 14757,
 'fault': 7164,
 'weaker': 18923,
 'eruptions': 16604,
 '##dle': 2259,
 'trent': 31698,
 'engineered': 14259,
 'flood': 6444,
 '##asso': 21734,
 'canadaâ': 26496,
 '##rof': 8870,
 'cowork': 28539,
 '##ighth': 8928,
 'potato': 6989,
 'ness': 26226,
 '##develop': 26022,
 'secretly': 27925,
 'amazon': 4493,
 'phytopl': 31015,
 'making': 1930,
 '##ublic': 3646,
 '##ridged': 20974,
 'lor': 9641,
 'yemen': 27699,
 'educator': 18698,
 'redes': 20541,
 'san': 2618,
 'lgbt': 25287,
 'infancy': 24495,
 'regulation': 7376,

In [None]:
wp_tokenizer.encode('Hi i am a studen. My name is Hai.').ids

[1, 4772, 49, 663, 41, 1035, 472, 16, 886, 857, 490, 31813, 16, 2]

In [None]:
wp_tokenizer.decode([1, 4772, 49, 663, 41, 1035, 472, 16, 886, 857, 490, 579, 257, 16, 2])

'hi i am a studen. my name is hab.'

In [None]:
wp_tokenizer.save('./save/tokenizer/wp_tokenizer_32k.json', True)

Exception: The system cannot find the path specified. (os error 3)

### Unigram tokenizer

In [None]:
unigram_tokenizer = Tokenizer(models.Unigram())

trainer = trainers.UnigramTrainer(special_tokens=[
    '[UNK]',
    '[CLS]',
    '[SEP]',
    '[PAD]'],
    unk_token='[UNK]',
    vocab_size=32000,
    min_frequency=0)
unigram_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

In [None]:
files = [f'./data/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
unigram_tokenizer.train(files=files, trainer=trainer)

In [None]:
unigram_tokenizer.__dir__()

['__new__',
 '__getstate__',
 '__setstate__',
 '__getnewargs__',
 'from_str',
 'from_file',
 'from_buffer',
 'from_pretrained',
 'to_str',
 'save',
 'num_special_tokens_to_add',
 'get_vocab',
 'get_vocab_size',
 'enable_truncation',
 'no_truncation',
 'enable_padding',
 'no_padding',
 'encode',
 'encode_batch',
 'decode',
 'decode_batch',
 'token_to_id',
 'id_to_token',
 'add_tokens',
 'add_special_tokens',
 'train',
 'train_from_iterator',
 'post_process',
 'truncation',
 'post_processor',
 'decoder',
 'pre_tokenizer',
 'padding',
 'model',
 'normalizer',
 '__dict__',
 '__doc__',
 '__module__',
 '__repr__',
 '__hash__',
 '__str__',
 '__getattribute__',
 '__setattr__',
 '__delattr__',
 '__lt__',
 '__le__',
 '__eq__',
 '__ne__',
 '__gt__',
 '__ge__',
 '__init__',
 '__reduce_ex__',
 '__reduce__',
 '__subclasshook__',
 '__init_subclass__',
 '__format__',
 '__sizeof__',
 '__dir__',
 '__class__']

In [None]:
unigram_tokenizer.encode('Hi, my name is Hai')

Encoding(num_tokens=10, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
from transformers.tokenization_utils import PreTrainedTokenizer

## SentencePiece tokenizer

### BPE tokenizer

In [None]:
files = [f'./data/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
spm.SentencePieceTrainer.Train(
    input=files,
    model_prefix='./save/tokenizer/spModel_bpe_8k.model',
    vocab_size=32000,
    pad_id=0,                
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece='[PAD]',
    unk_piece='[UNK]',
    bos_piece='[CLS]',
    eos_piece='[SEP]',
    user_defined_symbols='[MASK]',
    model_type='bpe'
)
sp = spm.SentencePieceProcessor()
sp.load('spModel.model')

In [None]:
sp.encode_as_pieces('Hi i am a student, my name is Hai')

['▁Hi', '▁i', '▁am', '▁a', '▁student', ',', '▁my', '▁name', '▁is', '▁H', 'ai']

### Unigram tokenizer

In [None]:
files = [f'./data/raw_texts/{file}.txt' for file in ['collection_corpus', 'queries.train_corpus', 'queries.dev_corpus', 'queries.eval_corpus']]
spm.SentencePieceTrainer.Train(
    input=files,
    model_prefix='./save/tokenizer/spModel_bpe_8k.model',
    vocab_size=32000,
    pad_id=0,                
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece='[PAD]',
    unk_piece='[UNK]',
    bos_piece='[CLS]',
    eos_piece='[SEP]',
    user_defined_symbols='[MASK]',
    model_type='uni'
)
sp = spm.SentencePieceProcessor()
sp.load('spModel.model')