# A Simple Byte Pair Encoding (BPE) Tokenizer From Scratch

## Bits and Bytes

In [1]:
# Turn a sample string into its UTF-8 bytes, held in a mutable bytearray
text = "This is some text"
byte_ary = bytearray(text.encode("utf-8"))
print(byte_ary)

bytearray(b'This is some text')


In [2]:
# Iterating a bytearray yields one integer per byte; collect them as the IDs
ids = [byte for byte in byte_ary]
print(ids)

[84, 104, 105, 115, 32, 105, 115, 32, 115, 111, 109, 101, 32, 116, 101, 120, 116]


In [3]:
# Compare the length of the string with the length of its byte-level ID list
print(
    f"Number of characters: {len(text)}",
    f"Number of token IDs: {len(ids)}",
    sep="\n",
)

Number of characters: 17
Number of token IDs: 17


In [4]:
import tiktoken
# Fetch tiktoken's "gpt2" encoding and tokenize the same sample sentence used
# for the byte-level example; the cell's last expression is shown as its output.
gpt2_tokenizer = tiktoken.get_encoding("gpt2")
gpt2_tokenizer.encode("This is some text")

[1212, 318, 617, 2420]

In [5]:
# Decode each of the first 300 GPT-2 token IDs on its own and list the result
for i in range(300):
    decoded = gpt2_tokenizer.decode([i])
    print(i, decoded, sep=": ")

0: !
1: "
2: #
3: $
4: %
5: &
6: '
7: (
8: )
9: *
10: +
11: ,
12: -
13: .
14: /
15: 0
16: 1
17: 2
18: 3
19: 4
20: 5
21: 6
22: 7
23: 8
24: 9
25: :
26: ;
27: <
28: =
29: >
30: ?
31: @
32: A
33: B
34: C
35: D
36: E
37: F
38: G
39: H
40: I
41: J
42: K
43: L
44: M
45: N
46: O
47: P
48: Q
49: R
50: S
51: T
52: U
53: V
54: W
55: X
56: Y
57: Z
58: [
59: \
60: ]
61: ^
62: _
63: `
64: a
65: b
66: c
67: d
68: e
69: f
70: g
71: h
72: i
73: j
74: k
75: l
76: m
77: n
78: o
79: p
80: q
81: r
82: s
83: t
84: u
85: v
86: w
87: x
88: y
89: z
90: {
91: |
92: }
93: ~
94: �
95: �
96: �
97: �
98: �
99: �
100: �
101: �
102: �
103: �
104: �
105: �
106: �
107: �
108: �
109: �
110: �
111: �
112: �
113: �
114: �
115: �
116: �
117: �
118: �
119: �
120: �
121: �
122: �
123: �
124: �
125: �
126: �
127: �
128: �
129: �
130: �
131: �
132: �
133: �
134: �
135: �
136: �
137: �
138: �
139: �
140: �
141: �
142: �
143: �
144: �
145: �
146: �
147: �
148: �
149: �
150: �
151: �
152: �
153: �
154: �
155: �
156: �
157: �
158:

## A Simple BPE Usage Example

In [6]:
import os
import urllib.request
from BPEImplemetation import BPETokenizerSimple

# Location of the training text, relative to this notebook. It is defined once
# so the existence check, the download target, and the read always agree.
file_path = "../main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    # First run: fetch the text from the LLMs-from-scratch repository.
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before downloading into it.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    urllib.request.urlretrieve(url, file_path)

# Read the whole file into memory; `text` is the training corpus used below.
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

In [7]:
# Create a fresh tokenizer and train it on the loaded text, requesting a
# 1000-entry vocabulary with "<|endoftext|>" as the only allowed special token.
tokenizer = BPETokenizerSimple()
tokenizer.train(
    text,
    vocab_size=1000,
    allowed_special={"<|endoftext|>"},
)

In [8]:
# Show the trained vocabulary mapping, then its size on the following line
print(tokenizer.vocab, len(tokenizer.vocab), sep="\n")

{0: '\x00', 1: '\x01', 2: '\x02', 3: '\x03', 4: '\x04', 5: '\x05', 6: '\x06', 7: '\x07', 8: '\x08', 9: '\t', 10: '\n', 11: '\x0b', 12: '\x0c', 13: '\r', 14: '\x0e', 15: '\x0f', 16: '\x10', 17: '\x11', 18: '\x12', 19: '\x13', 20: '\x14', 21: '\x15', 22: '\x16', 23: '\x17', 24: '\x18', 25: '\x19', 26: '\x1a', 27: '\x1b', 28: '\x1c', 29: '\x1d', 30: '\x1e', 31: '\x1f', 32: ' ', 33: '!', 34: '"', 35: '#', 36: '$', 37: '%', 38: '&', 39: "'", 40: '(', 41: ')', 42: '*', 43: '+', 44: ',', 45: '-', 46: '.', 47: '/', 48: '0', 49: '1', 50: '2', 51: '3', 52: '4', 53: '5', 54: '6', 55: '7', 56: '8', 57: '9', 58: ':', 59: ';', 60: '<', 61: '=', 62: '>', 63: '?', 64: '@', 65: 'A', 66: 'B', 67: 'C', 68: 'D', 69: 'E', 70: 'F', 71: 'G', 72: 'H', 73: 'I', 74: 'J', 75: 'K', 76: 'L', 77: 'M', 78: 'N', 79: 'O', 80: 'P', 81: 'Q', 82: 'R', 83: 'S', 84: 'T', 85: 'U', 86: 'V', 87: 'W', 88: 'X', 89: 'Y', 90: 'Z', 91: '[', 92: '\\', 93: ']', 94: '^', 95: '_', 96: '`', 97: 'a', 98: 'b', 99: 'c', 100: 'd', 101: 'e'

In [9]:
# Number of entries in `tokenizer.bpe_merges` after training
# (presumably one per learned merge rule; confirm against BPETokenizerSimple).
print(len(tokenizer.bpe_merges))

742


In [10]:
# Encode a sample sentence with the trained tokenizer and show the token IDs.
input_text = "Jack embraced beauty through art and life."
token_ids = tokenizer.encode(input_text)
print(token_ids)

[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]


In [11]:
# Compare the sentence length in characters with the number of IDs it encodes to
print(
    f"Number of characters: {len(input_text)}",
    f"Number of token IDs: {len(token_ids)}",
    sep="\n",
)

Number of characters: 42
Number of token IDs: 20


In [12]:
# Print the token IDs from the encode step again, right before decoding them.
print(token_ids)

[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46]


In [13]:
# Decode the full ID sequence back into a single string.
print(tokenizer.decode(token_ids))

Jack embraced beauty through art and life.


In [14]:
# Decode every ID individually to see which piece of text it stands for
for token_id in token_ids:
    print(token_id, "->", tokenizer.decode([token_id]))

424 -> Jack
256 ->  
654 -> em
531 -> br
302 -> ac
311 -> ed
256 ->  
296 -> be
97 -> a
465 -> ut
121 -> y
595 ->  through
841 ->  ar
116 -> t
287 ->  a
466 -> nd
256 ->  
326 -> li
972 -> fe
46 -> .


In [15]:
# Round trip on a new sentence: pass the output of encode straight into decode.
tokenizer.decode(tokenizer.encode("This is some text."))

'This is some text.'