# `019` Byte-pair encoding tokenization

Requirements: 016 Transformers

⚡⚡⚡WIP! DON'T READ YET⚡⚡⚡

In [1]:
# !pip install regex
from regex import compile as regex_compile
from time import time

In [2]:
# Load the whole training corpus into memory as a single string.
with open('custom-data/spanish-novels.txt', encoding='utf-8') as fp:
	text = fp.read()

# Report the corpus size in UTF-8 bytes rather than in characters.
print(f'In total, there are {len(text.encode("utf-8")):,} UTF-8 bytes in the file.')

In total, there are 19,979,810 UTF-8 bytes in the file.


The idea is simple: we repeatedly merge the most common pair of adjacent elements into a single new element.

In [3]:
def merge(elems, pair, replacement):
	"""Return a new list where each occurrence of `pair` is replaced.

	`elems` is scanned from left to right. Whenever the current element and
	the one after it equal `pair[0]` and `pair[1]`, a single `replacement` is
	emitted and both elements are skipped, so occurrences never overlap.
	Every other element is copied unchanged. `elems` itself is not modified.
	"""
	out = []
	size = len(elems)
	pos = 0
	while pos < size:
		# a pair can only start where there is still a following element
		is_pair = pos + 1 < size and elems[pos] == pair[0] and elems[pos + 1] == pair[1]
		if is_pair:
			out.append(replacement)
			pos += 2
			continue
		out.append(elems[pos])
		pos += 1
	return out

# Example: the (' ', 'a') pair in the string is replaced by a single 'X'.
merge('How are you?', (' ', 'a'), 'X')

['H', 'o', 'w', 'X', 'r', 'e', ' ', 'y', 'o', 'u', '?']

Before settling on a tokenization method, there is a constraint we want to impose on the output tokens: a token should not contain characters from different categories, such as letters, digits, whitespace or others (e.g. punctuation). The reason is that BPE merges the most common pairs, so once a very common word like `for` has become a token, it will appear followed by many different characters, and BPE would go on to create separate tokens such as `for.`, `for?`, or `for!`.

To avoid creating too many tokens that encode the same content, we first split the text into these categories using the `regex` library, a more powerful alternative to the built-in `re` module that supports Unicode property classes. While `\w` matches any letter, digit or underscore, `\p{L}` and `\p{N}` match only letters and numeric characters respectively, and `\p{Z}` matches separators such as spaces.

In [4]:
# Pre-tokenizer: cuts a text into pieces that each hold one category of characters.
#   \p{Z}?(?:\p{L}+|\p{N}+)  a run of letters or a run of digits, optionally preceded by one separator
#   \p{Z}+                   a run of separators
#   .                        any other single character
# The leading (?s) makes `.` match newlines as well. Without it a newline matches
# none of the alternatives and findall silently drops it from the output.
split = regex_compile(r'(?s)\p{Z}?(?:\p{L}+|\p{N}+)|\p{Z}+|.').findall
split('Let\'s see how this w0rks!')

['Let', "'", 's', ' see', ' how', ' this', ' w', '0', 'rks', '!']

In [5]:
def train_tokenizer(text, max_merges=1000):
	"""Learn byte-pair merges from `text`.

	The text is cut into pieces with `split`, and each piece is turned into the
	list of its UTF-8 byte values. Then, up to `max_merges` times, the most
	frequent pair of adjacent elements (counted inside pieces only, never
	across two pieces) is replaced everywhere by a new element id. New ids are
	handed out sequentially starting at 256.

	Training stops early when no pair occurs more than once, which includes
	the case where no pair exists at all (e.g. empty text).

	Args:
		text: the training text, as a str.
		max_merges: maximum number of merges to learn.

	Returns:
		A tuple `(elems, vocabulary, merges)` where
		- `elems` is the list of pieces, each a list of element ids after all
		  learned merges have been applied,
		- `vocabulary[i]` is the list of byte values that element id `i` stands for,
		- `merges` maps each merged pair `(id1, id2)` to its new element id, in
		  the order the merges were learned.
	"""
	from collections import Counter

	pieces = split(text)
	elems = [list(piece.encode('utf-8')) for piece in pieces]
	start = time()
	next_replacement = 256
	vocabulary = [[i] for i in range(next_replacement)]
	merges = {}
	# progress is reported about ten times per run; the max() keeps the interval
	# at 1 or more so that max_merges < 10 does not cause a modulo by zero
	report_every = max(1, max_merges // 10)
	for m in range(max_merges):
		# count every pair of adjacent elements, piece by piece
		count = Counter()
		for piece in elems:
			count.update(zip(piece, piece[1:]))
		# the default covers the case where there is no pair at all
		pair, freq = max(count.items(), key=lambda x: x[1], default=(None, 0))
		# if the pair is not frequent enough, stop
		if freq <= 1:
			print('Stopping early, no pair found more than once.')
			break
		# register the new element: its bytes are those of both halves, concatenated
		vocabulary.append(vocabulary[pair[0]] + vocabulary[pair[1]])
		merges[pair] = next_replacement
		# apply the merge to every piece
		for i, piece in enumerate(elems):
			elems[i] = merge(piece, pair, next_replacement)
		next_replacement += 1
		# print progress
		if m % report_every == 0:
			# extrapolate the remaining time from the average time per merge so far
			remaining = (time() - start) * (max_merges - m) / (m + 1)
			remaining = f'{remaining//3600:02.0f}:{remaining%3600//60:02.0f}:{remaining%60:02.0f}'
			print(f'{m:4} merges done, {remaining} remaining, compression ratio: {len(text) / sum(map(len, elems)):.2f}')
	return elems, vocabulary, merges

# Train on the full corpus with the default number of merges.
compressed_text, vocabulary, merges = train_tokenizer(text)
# Ratio between the number of characters in the text and the number of elements left after merging.
print(f'Final compression ratio: {len(text) / sum(map(len, compressed_text)):.2f}')

   0 merges done, 01:59:16 remaining, compression ratio: 1.01


In [None]:
def encode(text, merges):
	"""Turn `text` into a list of element ids.

	The text is converted to its UTF-8 byte values, then every merge in
	`merges` (a mapping from pair to replacement id) is applied over the whole
	sequence, following the iteration order of the mapping.

	NOTE(review): unlike train_tokenizer, the text is not split into pieces
	first, so a merge can be applied across what would have been a piece
	boundary during training.
	"""
	tokens = list(text.encode('utf-8'))
	for pair in merges:
		tokens = merge(tokens, pair, merges[pair])
	return tokens

def decode(elems, vocabulary):
	"""Turn a sequence of element ids back into a string.

	Each id is looked up in `vocabulary` to get the byte values it stands for;
	the bytes of all ids are concatenated in order and decoded as UTF-8.
	"""
	raw = bytearray()
	for token in elems:
		raw += bytes(vocabulary[token])
	return raw.decode('utf-8')

# Round-trip check: encode a sample sentence (including some non-ASCII characters) and decode it back.
# NOTE: this rebinds the global `text`, which held the training corpus until now.
text = 'this is a test to see if it works °å^○ⁿ·'
encoded = encode(text, merges)
decoded = decode(encoded, vocabulary)
print(f'Original text: {text}')
print(f'Encoded text: {encoded}')
print(f'Decoded text: {decoded}')

Original text: this is a test to see if it works °å^○ⁿ·
Encoded text: [116, 104, 105, 115, 32, 105, 115, 259, 32, 270, 115, 116, 32, 291, 301, 101, 32, 105, 102, 32, 105, 116, 32, 119, 111, 114, 107, 115, 32, 194, 176, 195, 165, 94, 226, 151, 139, 226, 129, 191, 194, 183]
Decoded text: this is a test to see if it works °å^○ⁿ·
