In [1]:
import gzip
import json
import os

from datasets import load_dataset

# Base name of the corpus; reused below to derive file names and paths.
corpus_name = "eng_latn_300mb"

# Load the "train" split of the single corpus text file as a regular
# (non-streaming) dataset.
dataset = load_dataset(
    "sanderland/monolingual-tokenizer-data",
    data_files=[f"{corpus_name}.txt"],
    split="train",
    streaming=False,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the tokenizer JSON file: written after training and re-read by later cells.
tokfile = f"{corpus_name}-hf.json"

In [3]:
from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.normalizers import NFC, Lowercase, Sequence

# Trainer: target vocabulary of 50257 entries, with "<|endoftext|>" as the only
# special token and the ByteLevel alphabet supplied as the initial alphabet.
trainer = BpeTrainer(
    vocab_size=50257,
    special_tokens=["<|endoftext|>"],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
)

# BPE model with NFC normalisation and byte-level pre-tokenisation
# (no space is prepended to the input).
tokenizer = Tokenizer(BPE(unk_token="<|endoftext|>"))
tokenizer.normalizer = Sequence([NFC()])
tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
def get_training_corpus(rows=None):
    """Yield the ``"text"`` field of each row, one value at a time.

    Args:
        rows: Iterable of mappings that have a ``"text"`` key. When omitted,
            the notebook-level ``dataset`` is used (resolved lazily, when
            iteration starts), so existing zero-argument calls keep working.

    Yields:
        The value stored under ``"text"`` for each row, in iteration order.
    """
    if rows is None:
        rows = dataset
    for row in rows:
        yield row["text"]

# Train the tokenizer on the text of every corpus row, fed through a generator.
training_texts = get_training_corpus()
tokenizer.train_from_iterator(training_texts, trainer=trainer)

# Persist the trained tokenizer so later cells can reload and inspect it.
tokenizer.save(tokfile)






In [4]:
# Reload from disk so the checks below exercise the saved tokenizer file.
tokenizer = Tokenizer.from_file(tokfile)

# Two probe sentences: plain ASCII, and one with leading spaces plus a non-ASCII character.
encoded = tokenizer.encode("Hello, this is a test of the new GPT-2 style tokenizer.")
encoded_complex = tokenizer.encode("    Multiple spaces and special characters like 'é'!")

print("Encoded string:", encoded.tokens)
print("Encoded IDs:", encoded.ids)
print("\nEncoded complex string:", encoded_complex.tokens)
print("Encoded complex IDs:", encoded_complex.ids)

Encoded string: ['Hello', ',', 'Ġthis', 'Ġis', 'Ġa', 'Ġtest', 'Ġof', 'Ġthe', 'Ġnew', 'ĠGP', 'T', '-', '2', 'Ġstyle', 'Ġtoken', 'izer', '.']
Encoded IDs: [10452, 12, 417, 315, 258, 1343, 289, 263, 617, 8929, 52, 13, 18, 2493, 17741, 6208, 14]

Encoded complex string: ['ĠĠĠ', 'ĠMultiple', 'Ġspaces', 'Ġand', 'Ġspecial', 'Ġcharacters', 'Ġlike', "Ġ'", 'Ã©', "'", '!']
Encoded complex IDs: [44242, 17800, 6320, 286, 1439, 4077, 584, 1111, 2849, 7, 1]


In [5]:
# Re-read the saved tokenizer JSON to inspect the learned merge list directly.
# The merges contain non-ASCII byte-level symbols (e.g. 'Ġ'), so the file is
# decoded explicitly as UTF-8 rather than with the platform's default encoding.
with open(tokfile, "r", encoding="utf-8") as f:
    tokenizer_data = json.load(f)
merges = tokenizer_data['model']['merges']

# Show the first n merges in the order they are stored in the file.
n = 20
print(f"\n--- Top {n} BPE Merges ---")
for i, merge in enumerate(merges[:n]):
    print(f"{i + 1}: {merge!r}")



--- Top 20 BPE Merges ---
1: ['Ġ', 't']
2: ['Ġ', 'a']
3: ['i', 'n']
4: ['h', 'e']
5: ['r', 'e']
6: ['o', 'n']
7: ['Ġt', 'he']
8: ['e', 'r']
9: ['Ġ', 's']
10: ['Ġ', 'w']
11: ['Ġ', 'o']
12: ['a', 't']
13: ['n', 'd']
14: ['Ġ', 'c']
15: ['i', 't']
16: ['e', 's']
17: ['o', 'u']
18: ['o', 'r']
19: ['i', 's']
20: ['Ġ', 'f']


In [6]:
# Load the reference tokenizer (presumably produced by the script_bpe project)
# for the same corpus. The results directory was hard-wired to one machine's
# Desktop; it can now be redirected with the SCRIPT_BPE_RESULTS_DIR environment
# variable and falls back to the original location when the variable is unset.
results_dir = os.environ.get(
    "SCRIPT_BPE_RESULTS_DIR", "/Users/sander/Desktop/script_bpe/results"
)
my_tok_path = os.path.join(
    results_dir, "tokenizers", corpus_name, "n50000", "bytes_gpt2.json.gz"
)
with gzip.open(my_tok_path) as f:
    my_tok = json.load(f)

In [7]:

def bytes_to_char():
    """Build the byte -> character table behind the byte-level token strings.

    Bytes in the ranges ``!``..``~``, ``¡``..``¬`` and ``®``..``ÿ`` map to the
    character with the same code point. Every other byte is assigned the next
    unused code point starting at 256, in ascending byte order.

    Returns:
        dict[int, str]: one entry for each of the 256 byte values.
    """
    table = {
        code: chr(code)
        for first, last in (("!", "~"), ("¡", "¬"), ("®", "ÿ"))
        for code in range(ord(first), ord(last) + 1)
    }
    next_code = 2**8
    for byte in range(2**8):
        if byte not in table:
            table[byte] = chr(next_code)
            next_code += 1
    return table


# Inverse table: character -> the byte value it stands for.
char_to_byte = {char: byte for byte, char in bytes_to_char().items()}

def demangle_hf_token(token_string: str) -> str:
    r"""Convert a byte-level token string from the HF vocab/merges into readable text.

    Each character of ``token_string`` stands for one raw byte (looked up in
    ``char_to_byte``). The recovered bytes are decoded as UTF-8; bytes that do
    not form valid UTF-8 (for example a token that ends in the middle of a
    multi-byte character) are rendered as ``\xNN`` escapes instead of raising.

    Args:
        token_string: Token text written in the byte-level alphabet.

    Returns:
        The decoded text as ``str`` (not ``bytes``).

    Raises:
        KeyError: If a character is not a key of ``char_to_byte``.

    Example:
        ``demangle_hf_token('âĢ')`` returns the 8-character string ``\xe2\x80``,
        because the bytes ``b'\xe2\x80'`` are an incomplete UTF-8 sequence.
    """
    raw_bytes = bytes(char_to_byte[char] for char in token_string)
    return raw_bytes.decode("utf-8", errors="backslashreplace")

In [8]:
# Text produced by each merge, collected separately per tokenizer.
my_vocab, hf_vocab = set(), set()

# Skip the first 256 entries of the reference token list
# (presumably the single-byte base tokens; TODO confirm against the file format).
my_merges = my_tok['metadata']['tokens'][256:]

# Walk both merge lists in lockstep; zip() stops at the shorter of the two.
for i, (hf_merge, my_merge) in enumerate(zip(merges, my_merges)):
    hf_froma, hf_fromb = hf_merge
    hf_to = demangle_hf_token(hf_froma + hf_fromb)
    my_to = my_merge['vocab']
    hf_vocab.add(hf_to)
    my_vocab.add(my_to)
    # Report same-position disagreements, but only among the first 1680 merges
    # (NOTE(review): the reason for this cutoff is not stated).
    if i < 1680 and hf_to != my_to:
        print(f"{i}: hf {hf_to!r} \tmy {my_to!r}")

In [9]:
# Merge outputs seen only in the reference tokenizer.
my_vocab.difference(hf_vocab)

set()

In [10]:
# Merge outputs seen only in the HF tokenizer.
hf_vocab.difference(my_vocab)

set()

In [11]:
# Second pass over both merge lists: print every position where the two
# tokenizers disagree, tagged with the kind of disagreement:
#   m - the reference output is absent from the HF outputs
#   h - the HF output is absent from the reference outputs
#   ? - both outputs exist on the other side, but not at this position
my_merges = my_tok['metadata']['tokens'][256:]
for i, (hf_merge, my_merge) in enumerate(zip(merges, my_merges)):
    hf_froma, hf_fromb = hf_merge
    hf_to = demangle_hf_token(hf_froma + hf_fromb)
    my_to = my_merge['vocab']
    no_match = hf_to != my_to
    if my_to not in hf_vocab:
        marker = 'm'
    elif hf_to not in my_vocab:
        marker = 'h'
    elif no_match:
        marker = '?'
    else:
        # Same output at the same position: nothing to report.
        continue
    print(f"{marker} {i}: hf {repr(hf_to):20} \tmy {repr(my_to):20}")

In [12]:
from tokenizers import Tokenizer
from tqdm import tqdm

# Corpus-wide running totals; process_batch() adds to both.
total_original_bytes = total_tokens = 0

def process_batch(batch):
    """Add one batch's UTF-8 byte count and token count to the global totals.

    Called for its side effects only: it updates ``total_original_bytes`` and
    ``total_tokens`` and returns ``None``.

    Args:
        batch: Mapping with a ``"text"`` key holding a list of texts.
    """
    global total_original_bytes, total_tokens

    # Drop falsy entries (None or "") once, up front, so both totals are computed
    # over exactly the same texts. Previously only the byte count skipped them,
    # so a None entry would still have been handed to the tokenizer.
    # NOTE(review): assumes an empty string encodes to zero tokens, so skipping
    # it leaves the token total unchanged - confirm if a post-processor is added.
    text_list = [text for text in batch["text"] if text]

    # Size of the original text, measured in UTF-8 bytes.
    total_original_bytes += sum(len(text.encode('utf-8')) for text in text_list)

    # Tokenize the whole batch in one call and count the resulting token ids.
    encoded_batch = tokenizer.encode_batch(text_list)
    total_tokens += sum(len(encoding.ids) for encoding in encoded_batch)

# Run process_batch over the dataset in batches of 1000 rows; the call is made
# for its side effect on the global counters.
dataset.map(process_batch, batched=True, batch_size=1000)

# Average number of original UTF-8 bytes per generated token.
compression_ratio = total_original_bytes / total_tokens

print("\n--- Corpus Compression Statistics ---")
print(f"Total bytes in original text: {total_original_bytes:,}")
print(f"Total number of tokens generated: {total_tokens:,}")
print(f"Compression Ratio (Bytes/Token): {compression_ratio:.4f}")

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 846440/846440 [00:33<00:00, 25077.75 examples/s]


--- Corpus Compression Statistics ---
Total bytes in original text: 299,153,665
Total number of tokens generated: 63,629,873
Compression Ratio (Bytes/Token): 4.7015



