In [1]:
import pathpiece

In [2]:
# Vocabulary file loaded by every tokenizer in this notebook.
vocab_file = "./pathpiece/data/vdump_32768.vocab"
# End-of-document token. This is the tokenizer's built-in default;
# a different one can be chosen with the `special` argument.
eos_text = "<|endoftext|>"  # default
greedy = False  # default
random_tiebreaker = True # old value

# Hand every setting above to the constructor explicitly, so that editing
# any of them really changes the tokenizer that gets built. With the values
# above (the documented defaults) this builds the same tokenizer as passing
# only `random_tiebreaker`.
tokenizer = pathpiece.Tokenizer(
    vocab_file,
    special=eos_text,
    greedy=greedy,
    random_tiebreaker=random_tiebreaker,
)

In [3]:
# The end-of-document token is added automatically as token zero;
# it is not present in the .vocab file.
# Note that it is not inserted automatically when encoding:
# it is up to the caller to append it at the end of each document.
tokenizer.encode(eos_text)

{'input_ids': [0]}

In [4]:
# A different end-of-document token can be supplied through `special`
tokenizer2 = pathpiece.Tokenizer(vocab_file, special="<eos>")
# that token is now the one with id 0
tokenizer2.encode("<eos>")

{'input_ids': [0]}

In [5]:
# Encode a single string; the token ids are under the "input_ids" key.
# `s` and `en` are reused further down for the decode round trip.
s = "The quick brown fox"
en = tokenizer.encode(s)

In [6]:
# Given a list of strings (one per document),
# this tokenizes each document in parallel.
# It returns a dict whose "input_ids" entry holds one list of token ids
# per document, in the same order as the input.
parallel = tokenizer.encode_batch(["The quick brown fox", "jumped over the lazy dog."])
parallel

{'input_ids': [[14004, 10800, 31267, 8561, 7389],
  [16259, 15656, 3800, 1306, 24019, 16452, 3865, 24659]]}

In [7]:
# There is also a parallel decode
# that takes a list of lists of token ids
# and returns one decoded string per list.
tokenizer.decode_batch(parallel["input_ids"])

['The quick brown fox', 'jumped over the lazy dog.']

In [8]:
# Matching the huggingface interface,
# you can also __call__ the tokenizer to encode.
# If you pass a list of strings, this uses encode_batch.
# Any extra parameters (often passed to a huggingface tokenizer) are ignored.
tokenizer("The quick brown fox")

{'input_ids': [14004, 10800, 31267, 8561, 7389]}

In [9]:
# Convert back to text with decode;
# the round trip must reproduce the original string exactly.
back = tokenizer.decode(en["input_ids"])
assert back == s
back

'The quick brown fox'

In [10]:
# The maximum number of bytes in any token
tokenizer.get_max_len()

16

In [11]:
# Get the token : id mapping as a dict (tokens are bytes, ids are ints).
# Note that the entries come in a random order each time the vocab is read,
# so the first ten items shown here will differ between runs.
vocab = tokenizer.get_vocab()
list(vocab.items())[:10]

[(b'6d', 14838),
 (b' OR', 254),
 (b'risto', 373),
 (b'. We are', 27207),
 (b' (see', 8459),
 (b's. To', 20120),
 (b'lush', 6195),
 (b'Na', 26799),
 (b'LD', 22995),
 (b'509', 30546)]

In [12]:
# To get the vocabulary size, take the length of the vocab mapping.
# Remember that one of these entries is the <|endoftext|> special token.
len(tokenizer.get_vocab())

32769

In [13]:
# We can get the unjoined token lists like this:
# `ids` is indexed by token id and gives back that token.
ids = tokenizer.get_ids()
def get_token_list(enc, id_to_token=None):
    """Return the individual (unjoined) tokens of an encoding.

    Args:
        enc: Encoding as returned by the tokenizer: a mapping whose
            "input_ids" entry is the sequence of token ids.
        id_to_token: Optional lookup indexed by token id, e.g. the result of
            `get_ids()` on the tokenizer that produced `enc`. When omitted,
            the notebook-level `ids` table is used.

    Returns:
        A list with one token per id, in encoding order.
    """
    # Resolve the table at call time so the notebook-level `ids` is only
    # needed when no explicit table is supplied.
    table = ids if id_to_token is None else id_to_token
    return [table[token_id] for token_id in enc["input_ids"]]

In [14]:
# Tokenize the full sentence with the default tokenizer and show the
# individual tokens: first how many there are, then the tokens themselves.
opt = get_token_list(tokenizer("The quick brown fox jumped over the lazy dog."))
print(len(opt))
opt

12


[b'The ',
 b'quick',
 b' br',
 b'own ',
 b'fox',
 b' jump',
 b'ed over',
 b' the l',
 b'az',
 b'y ',
 b'do',
 b'g.']

In [15]:
# Do the greedy version of the same sentence.
# NOTE(review): get_token_list looks tokens up in `ids`, which was taken from
# the default tokenizer; this presumably relies on both tokenizers assigning
# the same ids for the same vocab file — confirm.
gr = pathpiece.Tokenizer(vocab_file, greedy=True)
tl = get_token_list(gr("The quick brown fox jumped over the lazy dog."))
print(len(tl))
tl

13


[b'The ',
 b'quick',
 b' brown',
 b' fo',
 b'x ',
 b'ju',
 b'mp',
 b'ed over',
 b' the l',
 b'az',
 b'y d',
 b'og',
 b'.']

In [16]:
# And the longest tiebreaker version (random_tiebreaker=False).
# Note: `tl` is rebound here, replacing the token list from the greedy cell.
long = pathpiece.Tokenizer(vocab_file, random_tiebreaker=False)
tl = get_token_list(long("The quick brown fox jumped over the lazy dog."))
print(len(tl))
tl

12


[b'The',
 b' quick',
 b' br',
 b'own ',
 b'fox',
 b' jump',
 b'ed',
 b' over the',
 b' la',
 b'zy',
 b' do',
 b'g.']

In [17]:
# The same sentence with the default tokenizer, this time as raw token ids.
# Note: this rebinds `opt`, which earlier held the list of token bytes.
opt = tokenizer.encode("The quick brown fox jumped over the lazy dog.")['input_ids']
print(len(opt))
opt

12


[14004, 10800, 31267, 8561, 7389, 31747, 3800, 1306, 24019, 23329, 30199, 8916]

In [18]:
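# The following cells verify that the `greedy` option is honoured however it is passed:
# as a keyword together with `special`, positionally, or as a keyword on its own.
# An alternative end-of-document token can be passed in the same call.
# Because these options are given to __init__, the code that calls encode does not need to change.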

In [19]:
# Greedy tokenizer built with keyword arguments, naming the
# end-of-document token explicitly.
tokenizer3 = pathpiece.Tokenizer(
    vocab_file,
    special=eos_text,
    greedy=True,
)

# Keep only the token ids; show their count, then the ids themselves.
gr3 = tokenizer3.encode("The quick brown fox jumped over the lazy dog.")["input_ids"]
print(len(gr3), gr3, sep="\n")

13
[14004, 10800, 12759, 6623, 22140, 16259, 15656, 3800, 1306, 24019, 16452, 3865, 24659]


In [20]:
# Same configuration, but with the special token and the greedy flag
# passed positionally.
tokenizer4 = pathpiece.Tokenizer(
    vocab_file,
    eos_text,
    True,  # greedy positional
)

# Show the count and the ids, then check they match the keyword form.
gr4 = tokenizer4.encode("The quick brown fox jumped over the lazy dog.")["input_ids"]
print(len(gr4), gr4, sep="\n")
assert gr3 == gr4

13
[14004, 10800, 12759, 6623, 22140, 16259, 15656, 3800, 1306, 24019, 16452, 3865, 24659]


In [21]:
# Greedy given as a keyword, with no `special` argument at all.
tokenizer5 = pathpiece.Tokenizer(
    vocab_file,
    greedy=True,  # keyword without special
)

# Show the count and the ids, then check they match the explicit-special form.
gr5 = tokenizer5.encode("The quick brown fox jumped over the lazy dog.")["input_ids"]
print(len(gr5), gr5, sep="\n")
assert gr3 == gr5

13
[14004, 10800, 12759, 6623, 22140, 16259, 15656, 3800, 1306, 24019, 16452, 3865, 24659]


In [22]:
# And the longest tiebreaker version with all parameters spelled out.
# Note: `long` and `tl` are rebound here, replacing the values from the
# earlier longest-tiebreaker cell.
long = pathpiece.Tokenizer(vocab_file, special=eos_text, greedy=False, random_tiebreaker=False)
tl = get_token_list(long("The quick brown fox jumped over the lazy dog."))
print(len(tl))
tl

12


[b'The',
 b' quick',
 b' br',
 b'own ',
 b'fox',
 b' jump',
 b'ed',
 b' over the',
 b' la',
 b'zy',
 b' do',
 b'g.']