# Comparing Various Byte Pair Encoding (BPE) Implementations

## Using BPE from tiktoken

In [1]:
# Report which tiktoken release is installed in this environment.
from importlib.metadata import version

tiktoken_version = version("tiktoken")
print("tiktoken version:", tiktoken_version)

tiktoken version: 0.7.0


In [2]:
import tiktoken

# Sample sentence used as tokenizer input in the cells that follow.
text = "Hello, world. Is this-- a test?"

# Look up the encoding that tiktoken registers under the name "gpt2".
tik_token = tiktoken.get_encoding("gpt2")

In [3]:
# Tokenize the sample text with tiktoken, passing "<|endoftext|>" as an
# allowed special token.
allowed_special_tokens = {"<|endoftext|>"}
integers = tik_token.encode(text, allowed_special=allowed_special_tokens)
print("Encoded integers: ", integers)

Encoded integers:  [15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [4]:
# Decode the token ids back into text; the printed result should match the
# sample input if the round trip is lossless.
strings = tik_token.decode(integers)
print("Decoded string: ", strings)

Decoded string:  Hello, world. Is this-- a test?


In [5]:
from bpe_openai_gpt2 import download_vocab, get_encoder

# download_vocab() fetches encoder.json and vocab.bpe (see its progress output);
# get_encoder() then builds the tokenizer for model "gpt2" relative to the
# current directory.
download_vocab()
orig_tokenizer = get_encoder(models_dir=".", model_name="gpt2")

Fetching encoder.json: 1.04Mit [00:00, 4.40Mit/s]                                                   
Fetching vocab.bpe: 457kit [00:00, 2.92Mit/s]                                                       


In [6]:
# Encode the same `text` with the tokenizer obtained from bpe_openai_gpt2.
# Note: this rebinds `integers`, replacing the ids produced by tiktoken earlier.
integers = orig_tokenizer.encode(text)
print("Encoded integers (original): ", integers)

Encoded integers (original):  [15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [7]:
# Decode the ids with the same tokenizer.
# Note: this rebinds `strings`, which later cells use as their input text.
strings = orig_tokenizer.decode(integers)
print("Decoded string (original):", strings)

Decoded string (original): Hello, world. Is this-- a test?


## Using BPE via Hugging Face Transformers

In [9]:
%pip install transformers>=4.33.2

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import transformers

# Bare last expression so the notebook displays the installed version string.
transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


'4.57.6'

In [13]:
from transformers import GPT2Tokenizer

# Construct the tokenizer from the local vocabulary and merges files in ./gpt2.
slow_tokenizer_files = {
    "vocab_file": "./gpt2/encoder.json",
    "merges_file": "./gpt2/vocab.bpe",
}
hf_tokenizer = GPT2Tokenizer(**slow_tokenizer_files)

In [14]:
# `strings` holds the text decoded in an earlier cell; tokenize it and show
# only the token ids.
hf_encoding = hf_tokenizer(strings)
hf_encoding["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]

In [16]:
from transformers import GPT2TokenizerFast

# Construct the fast tokenizer from the same local files in ./gpt2.
fast_tokenizer_files = {
    "vocab_file": "./gpt2/encoder.json",
    "merges_file": "./gpt2/vocab.bpe",
}
hf_tokenizer_fast = GPT2TokenizerFast(**fast_tokenizer_files)

In [17]:
# `strings` holds the text decoded in an earlier cell; tokenize it with the
# fast tokenizer and show only the token ids.
hf_fast_encoding = hf_tokenizer_fast(strings)
hf_fast_encoding["input_ids"]

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]