# How to use some pre-defined tokenisers (e.g. BERT, RoBERTa)

In [3]:
import torch
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
from collections import Counter

In [None]:
# For more details - https://huggingface.co/bert-base-uncased
# Load the pretrained tokenizer published under the 'bert-base-uncased' name.
# NOTE(review): the "WP" suffix presumably stands for WordPiece - confirm.
tokenizerWP = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
def findToken(id, d):
  """Return the first key of ``d`` whose value equals ``id``.

  Used in this notebook as a reverse lookup on a tokenizer vocabulary
  (token -> id), i.e. to recover the token that owns a given id.

  Args:
    id: Value to search for. The name shadows the ``id`` builtin; it is kept
      so that existing keyword calls keep working.
    d: Mapping to search.

  Returns:
    The first matching key in the mapping's iteration order, or ``None``
    when no value equals ``id``.
  """
  # next() with a default replaces the manual loop, which left its result
  # variable unbound (UnboundLocalError) whenever nothing matched.
  return next((key for key, value in d.items() if value == id), None)

In [30]:
tokenizer = tokenizerWP

# get_vocab() returns the token -> id mapping; fetch it once and reuse it for
# every lookup below instead of calling it again for each one.
wp_vocab = tokenizer.get_vocab()

print("Vocabulary : ", wp_vocab)
print("Vocabulary size : ", tokenizer.vocab_size)
print("Tokens for 'i like riding my bike' are : ", tokenizer(['i like riding my bike']).get('input_ids'))

# Id of each word of the sample sentence, read directly from the vocabulary
for word in ('i', 'like', 'riding', 'my', 'bike'):
  print(f"Index of '{word}' : ", wp_vocab[word])
print("\n")

# Reverse lookup (id -> token) for the ids that open and close the encoded
# sentence (101, 102) and for id 100
for token_id in (101, 102, 100):
  print(f"token with id {token_id} is : ", findToken(token_id, wp_vocab) )
print("\n")

# Same sentence again, followed by a longer one ending in 'bikelane', to compare
# the two encodings side by side
print("Tokens for 'i like riding my bike' are : ", tokenizer(['i like riding my bike']).get('input_ids'))
print("Tokens for 'i like riding my bike on bikelane' are : ", tokenizer(['i like riding my bike on bikelane']).get('input_ids'))

Vocabulary size :  30522
Tokens for 'i like riding my bike' are :  [[101, 1045, 2066, 5559, 2026, 7997, 102]]
Index of 'i' :  1045
Index of 'like' :  2066
Index of 'riding' :  5559
Index of 'my' :  2026
Index of 'bike' :  7997
token with id 101 is :  [CLS]
token with id 102 is :  [SEP]
token with id 100 is :  [UNK]


Tokens for 'i like riding my bike' are :  [[101, 1045, 2066, 5559, 2026, 7997, 102]]
Tokens for 'i like riding my bike on bikelane' are :  [[101, 1045, 2066, 5559, 2026, 7997, 2006, 7997, 20644, 102]]


In [None]:
# Load the pretrained tokenizer published under the 'roberta-base' name.
# NOTE(review): the "BPE" suffix presumably refers to byte-pair encoding - confirm.
tokenizerBPE = RobertaTokenizer.from_pretrained('roberta-base')

In [31]:
tokenizer = tokenizerBPE

# get_vocab() returns the token -> id mapping; fetch it once and reuse it for
# every lookup below instead of calling it again for each one.
bpe_vocab = tokenizer.get_vocab()

print("Vocabulary : ", bpe_vocab)
print("Vocabulary size : ", tokenizer.vocab_size)
tokens = tokenizer(['i like riding my bike']).get('input_ids')
print("Tokens for 'i like riding my bike' are : ", tokens)

# Ids of the bare words. 'riding' is deliberately skipped: presumably the bare
# string is not a key of this vocabulary - confirm before re-enabling it.
for word in ('i', 'like', 'my', 'bike'):
  print(f"Index of '{word}' : ", bpe_vocab[word])
print("\n")

# Reverse lookup (id -> token): ids 0, 2 and 1 first, then 101, 102 and 100,
# the ids looked up for the BERT tokenizer earlier in the notebook
for token_id in (0, 2, 1, 101, 102, 100):
  print(f"token with id {token_id} is : ", findToken(token_id, bpe_vocab) )

# Token behind each id of the encoded sample sentence
for t in tokens[0]:
  print("token with id ", t, " is : ", findToken(t, bpe_vocab))
print("\n")

# Same sentence again, followed by a longer one ending in 'bikelane', to compare
# the two encodings side by side
print("Tokens for 'i like riding my bike' are : ", tokenizer(['i like riding my bike']).get('input_ids'))
print("Tokens for 'i like riding my bike on bikelane' are : ", tokenizer(['i like riding my bike on bikelane']).get('input_ids'))

Vocabulary size :  50265
Tokens for 'i like riding my bike' are :  [[0, 118, 101, 5793, 127, 4806, 2]]
Index of 'i' :  118
Index of 'like' :  3341
Index of 'my' :  4783
Index of 'bike' :  20974


token with id 0 is :  <s>
token with id 2 is :  </s>
token with id 1 is :  <pad>
token with id 101 is :  Ġlike
token with id 102 is :  a
token with id 100 is :  I
token with id  0  is :  <s>
token with id  118  is :  i
token with id  101  is :  Ġlike
token with id  5793  is :  Ġriding
token with id  127  is :  Ġmy
token with id  4806  is :  Ġbike
token with id  2  is :  </s>


Tokens for 'i like riding my bike' are :  [[0, 118, 101, 5793, 127, 4806, 2]]
Tokens for 'i like riding my bike on bikelane' are :  [[0, 118, 101, 5793, 127, 4806, 15, 4806, 21765, 2]]


# How to train a tokenizer

In [None]:
!pip install sentencepiece

## Word-level tokenizer (SentencePiece `model_type=word`)

In [40]:
import sentencepiece as spm

# Sentence used to exercise the trained model
sentence = "i like riding my bike on bikelane"

# Train a word-level model (--model_type=word) with 9 vocabulary entries on
# trainBikes.txt, then load the resulting mWord.model
spm.SentencePieceTrainer.Train('--input=trainBikes.txt --model_prefix=mWord --vocab_size=9 --model_type=word')
spWord = spm.SentencePieceProcessor()
spWord.load('mWord.model')

# List every (id, piece) pair of the learned vocabulary
n_pieces = spWord.vocab_size()
vocab = [(piece_id, spWord.id_to_piece(piece_id)) for piece_id in range(n_pieces)]
print("Vocabulary : ", vocab)
print("Vocabulary size : ", spWord.vocab_size())

# Split the sentence both as pieces (strings) and as their ids
encoded_pieces = spWord.encode_as_pieces(sentence)
encoded_ids = spWord.encode_as_ids(sentence)
print("Tokens' ids for '", sentence, "' : ", encoded_ids)
print("Tokens for '", sentence, "' : ", encoded_pieces)

# Round trip: rebuild text from the ids
decoded_sentence = spWord.decode_ids(encoded_ids)
print("Decoded Sentence:", decoded_sentence)

Vocabulary :  [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, '▁bicking'), (4, '▁bicycle'), (5, '▁bike'), (6, '▁bikelane'), (7, '▁biker'), (8, '▁like')]
Vocabulary size :  9
Tokens' ids for ' i like riding my bike on bikelane ' :  [0, 8, 0, 5, 0, 6]
Tokens for ' i like riding my bike on bikelane ' :  ['▁i', '▁like', '▁riding▁my', '▁bike', '▁on', '▁bikelane']
Decoded Sentence:  ⁇  like ⁇  bike ⁇  bikelane


## BPE tokenizer

In [56]:
import sentencepiece as spm

# Sample sentence
sentence = "i like riding my bike on bikelane"

# Train a BPE model twice on trainBikes.txt - first with 60 pieces, then with a
# thinner vocabulary of 20 - and report how the sample sentence is split each time.
# Both runs share the prefix mBPE, so the second training overwrites mBPE.model
# and spBPE (with the other variables) ends up describing the 20-piece model.
for run_index, bpe_vocab_size in enumerate((60, 20)):
  if run_index > 0:
    print("\n")  # blank separator between the two reports

  spm.SentencePieceTrainer.Train(f'--input=trainBikes.txt --model_prefix=mBPE --vocab_size={bpe_vocab_size} --model_type=bpe')
  spBPE = spm.SentencePieceProcessor()
  spBPE.load('mBPE.model')  # the model file written by the training call just above

  # List every (id, piece) pair of the learned vocabulary
  vocab = [(i, spBPE.id_to_piece(i)) for i in range(spBPE.vocab_size())]
  print("Vocabulary : ", vocab)
  print("Vocabulary size : ", spBPE.vocab_size())

  # Encode the sentence
  encoded_ids = spBPE.encode_as_ids(sentence)
  encoded_pieces = spBPE.encode_as_pieces(sentence)
  print("Tokens' ids for '", sentence, "' : ", encoded_ids)
  print("Tokens for '", sentence, "' : ", encoded_pieces)

  # Decode the IDs back to the sentence
  decoded_sentence = spBPE.decode_ids(encoded_ids)
  print("Decoded Sentence:", decoded_sentence)

Vocabulary :  [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, 'bi'), (4, '▁bi'), (5, 'ke'), (6, '▁bike'), (7, '▁bic'), (8, 'an'), (9, 'cl'), (10, 'in'), (11, 'li'), (12, 'ane'), (13, 'cle'), (14, 'ing'), (15, '▁li'), (16, 'king'), (17, 'lane'), (18, 'ycle'), (19, '▁like'), (20, '▁biker'), (21, '▁bicking'), (22, '▁bicycle'), (23, '▁bikelane'), (24, 'ck'), (25, 'cy'), (26, 'el'), (27, 'er'), (28, 'ic'), (29, 'ik'), (30, 'ki'), (31, 'la'), (32, 'le'), (33, 'ne'), (34, 'ng'), (35, 'yc'), (36, '▁b'), (37, '▁l'), (38, 'bic'), (39, 'bik'), (40, 'ike'), (41, 'kel'), (42, 'ker'), (43, 'kin'), (44, 'lan'), (45, 'ycl'), (46, 'like'), (47, '▁bik'), (48, 'i'), (49, 'e'), (50, '▁'), (51, 'b'), (52, 'k'), (53, 'c'), (54, 'l'), (55, 'n'), (56, 'a'), (57, 'g'), (58, 'r'), (59, 'y')]
Vocabulary size :  60
Tokens' ids for ' i like riding my bike on bikelane ' :  [50, 48, 19, 50, 58, 48, 0, 14, 50, 0, 59, 6, 50, 0, 55, 23]
Tokens for ' i like riding my bike on bikelane ' :  ['▁', 'i', '▁like', '▁', 'r', 'i', 

In [52]:
import sentencepiece as spm

# Sentence used to exercise the trained model
sentence = "i like riding my bike on bikelane"
# Alternative sample containing numbers and punctuation:
# sentence = "312 new bikes have been bought by a sport club:300 for kids and 12 for adults"

# Train a unigram model with 5000 vocabulary entries on trainGutenberg.txt.
# Alternative: a small 18-entry model trained on trainBikes.txt
# spm.SentencePieceTrainer.Train('--input=trainBikes.txt --model_prefix=mUnigram --vocab_size=18 --model_type=unigram --character_coverage=0.9995')
spm.SentencePieceTrainer.Train('--input=trainGutenberg.txt --model_prefix=mUnigram --vocab_size=5000 --model_type=unigram --character_coverage=0.9995')
spUnigram = spm.SentencePieceProcessor()
spUnigram.load('mUnigram.model')  # the model file written by the training call just above

# List every (id, piece) pair of the learned vocabulary
n_pieces = spUnigram.vocab_size()
vocab = [(piece_id, spUnigram.id_to_piece(piece_id)) for piece_id in range(n_pieces)]
print("Vocabulary : ", vocab)
print("Vocabulary size : ", spUnigram.vocab_size())

# Split the sentence both as pieces (strings) and as their ids
encoded_pieces = spUnigram.encode_as_pieces(sentence)
encoded_ids = spUnigram.encode_as_ids(sentence)
print("Tokens' ids for '", sentence, "' : ", encoded_ids)
print("Tokens for '", sentence, "' : ", encoded_pieces)

# Round trip: rebuild text from the ids
decoded_sentence = spUnigram.decode_ids(encoded_ids)
print("Decoded Sentence:", decoded_sentence)

Vocabulary :  [(0, '<unk>'), (1, '<s>'), (2, '</s>'), (3, ','), (4, '.'), (5, '▁the'), (6, '▁I'), (7, '▁to'), (8, '▁a'), (9, '▁and'), (10, '▁of'), (11, '▁in'), (12, '▁was'), (13, '▁"'), (14, 's'), (15, '▁it'), (16, '▁'), (17, '-'), (18, "'"), (19, '▁that'), (20, 'ing'), (21, '▁me'), (22, '▁with'), (23, '▁my'), (24, 'ed'), (25, '▁he'), (26, '▁not'), (27, '▁is'), (28, '▁be'), (29, '▁for'), (30, '▁as'), (31, '▁you'), (32, '▁but'), (33, '▁on'), (34, 't'), (35, '."'), (36, '▁at'), (37, '▁or'), (38, '▁had'), (39, '▁this'), (40, '▁would'), (41, '▁The'), (42, '▁his'), (43, '▁by'), (44, '▁have'), (45, 'd'), (46, '▁one'), (47, '▁so'), (48, '▁him'), (49, '▁all'), (50, '▁if'), (51, '▁they'), (52, 'ly'), (53, '▁Shirt'), (54, '▁Red'), (55, '▁from'), (56, '▁there'), (57, '▁about'), (58, '▁out'), (59, '▁said'), (60, 'n'), (61, '▁them'), (62, '▁no'), (63, ';'), (64, '▁were'), (65, '▁like'), (66, '▁are'), (67, '▁up'), (68, '▁Porcupine'), (69, '▁do'), (70, '▁school'), (71, 'y'), (72, '▁we'), (73, ',"'), 