# Comparing Various Byte Pair Encoding (BPE) Implementations

## Using BPE from `tiktoken`

In [None]:
# Print the installed version of the tiktoken package.
from importlib.metadata import version
print("tiktoken version:", version("tiktoken"))

In [None]:
import tiktoken
# Load the encoding registered in tiktoken under the name "gpt2".
tik_tokenizer = tiktoken.get_encoding("gpt2")
# Sample sentence; later sections encode this same string with the other tokenizers.
text = "Hello, world. Is this-- a test?"

In [None]:
# Encode the sample text to token IDs, passing "<|endoftext|>" as an allowed special token.
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
# Decode the token IDs back into a string.
strings = tik_tokenizer.decode(integers)
print(strings)

In [None]:
# Print the vocabulary size reported by the tiktoken encoding.
print(tik_tokenizer.n_vocab)

## Using the original GPT-2 BPE implementation

In [None]:
from bpe_openai_gpt2 import get_encoder, download_vocab

In [None]:
# NOTE(review): presumably fetches the GPT-2 vocabulary files that get_encoder
# later reads from ./gpt2_model -- confirm against bpe_openai_gpt2.
download_vocab()

In [None]:
# Build the original GPT-2 encoder; with these arguments it presumably loads
# its files from ./gpt2_model -- confirm against bpe_openai_gpt2.get_encoder.
orig_tokenizer = get_encoder(model_name="gpt2_model", models_dir=".")

In [None]:
# Encode the same sample text with the original GPT-2 encoder.
integers = orig_tokenizer.encode(text)
print(integers)

In [None]:
# Decode the token IDs back into a string with the original GPT-2 encoder.
strings = orig_tokenizer.decode(integers)
print(strings)

## Using BPE via Hugging Face Transformers

In [None]:
import transformers
# Bare expression on the last line: a notebook displays its value as the cell output.
transformers.__version__

In [None]:
from transformers import GPT2Tokenizer
# Load the pretrained "gpt2" tokenizer using the GPT2Tokenizer class.
hf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Encode the shared sample text with the Hugging Face GPT2Tokenizer.
# `text` is used directly (as in the other tokenizer sections) so this cell
# does not depend on the decoded `strings` left behind by an earlier cell.
integers = hf_tokenizer(text)["input_ids"]
print(integers)

In [None]:
from transformers import GPT2TokenizerFast
# Load the pretrained "gpt2" tokenizer using the GPT2TokenizerFast class.
hf_tokenizer_fast = GPT2TokenizerFast.from_pretrained("gpt2")

In [None]:
# Encode the shared sample text with the Hugging Face GPT2TokenizerFast.
# `text` is used directly (as in the other tokenizer sections) so this cell
# does not depend on the decoded `strings` left behind by an earlier cell.
integers = hf_tokenizer_fast(text)["input_ids"]
print(integers)

## Using the from-scratch BPE implementation

In [None]:
import os
import sys
import io
import nbformat
import types

def import_from_notebook():
    """Load ``BPETokenizerSimple`` from the from-scratch BPE notebook.

    Reads ``../bpe-from-scratch/bpe-from-scratch.ipynb`` (resolved against the
    current working directory), executes the code cells that define the
    requested names, and returns a module object holding whatever those
    cells defined.

    Returns:
        A ``types.ModuleType`` named ``"bpe-from-scratch"``; it is also
        registered in ``sys.modules`` under that name.

    Raises:
        FileNotFoundError: If the notebook is not found at the expected path.
    """
    def import_definitions_from_notebook(fullname, names):
        """Execute the code cells of notebook *fullname* that define any of *names*.

        A cell is selected by a plain substring test for ``def <name>`` or
        ``class <name>``. The WHOLE matching cell is executed, not just the
        definition, so the notebook must be trusted code.
        """
        current_dir = os.getcwd()
        path = os.path.join(current_dir, "..", "bpe-from-scratch", fullname + ".ipynb")
        path = os.path.normpath(path)

        # Load the notebook
        if not os.path.exists(path):
            raise FileNotFoundError(f"Notebook file not found at: {path}")

        with io.open(path, "r", encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Create a module to store the imported functions and classes
        mod = types.ModuleType(fullname)
        sys.modules[fullname] = mod

        # Go through the code cells and execute those containing a requested definition
        for cell in nb.cells:
            if cell.cell_type != "code":
                continue
            cell_code = cell.source
            # any() ensures a cell runs at most once, even if it defines
            # several of the requested names.
            if any(
                f"def {name}" in cell_code or f"class {name}" in cell_code
                for name in names
            ):
                exec(cell_code, mod.__dict__)
        return mod

    fullname = "bpe-from-scratch"
    names = ["BPETokenizerSimple"]

    return import_definitions_from_notebook(fullname, names)

In [None]:
# Build a module from the from-scratch notebook and take the tokenizer class from it.
# Direct attribute access raises a clear AttributeError if the notebook did not
# define the class, instead of a later "'NoneType' object is not callable".
imported_module = import_from_notebook()
BPETokenizerSimple = imported_module.BPETokenizerSimple

# Load the OpenAI GPT-2 vocabulary and merge files from the gpt2_model directory.
tokenizer_gpt2 = BPETokenizerSimple()
tokenizer_gpt2.load_vocab_and_merges_from_openai(
    vocab_path=os.path.join("gpt2_model", "encoder.json"),
    bpe_merges_path=os.path.join("gpt2_model", "vocab.bpe")
)

In [None]:
# Encode the same sample text with the from-scratch tokenizer.
integers = tokenizer_gpt2.encode(text)
print(integers)

## A Quick Performance Comparison

In [None]:
# Read the whole text file into memory; it is the input for the timing cells below.
with open("../main/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [None]:
# Original OpenAI GPT-2 tokenizer: time encoding the full raw_text
%timeit orig_tokenizer.encode(raw_text)

In [None]:
# Tiktoken OpenAI GPT-2 tokenizer: time encoding the full raw_text
%timeit tik_tokenizer.encode(raw_text)

In [None]:
# Hugging Face OpenAI GPT-2 tokenizer (GPT2Tokenizer): time encoding the full raw_text
%timeit hf_tokenizer(raw_text)["input_ids"]

In [None]:
# Hugging Face GPT2Tokenizer again, this time with truncation enabled.
# NOTE(review): max_length=5145 presumably matches the token count of raw_text,
# so nothing is actually cut off -- confirm.
%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"]

In [None]:
# Hugging Face GPT2TokenizerFast: time encoding the full raw_text
%timeit hf_tokenizer_fast(raw_text)["input_ids"]

In [None]:
# Hugging Face GPT2TokenizerFast again, this time with truncation enabled.
# NOTE(review): max_length=5145 presumably matches the token count of raw_text,
# so nothing is actually cut off -- confirm.
%timeit hf_tokenizer_fast(raw_text, max_length=5145, truncation=True)["input_ids"]

In [None]:
# GPT-2 tokenizer from scratch (BPETokenizerSimple): time encoding the full raw_text
%timeit tokenizer_gpt2.encode(raw_text)