# 01: Build the Duckling Tokenizer

Truncate the GPT-2 tokenizer to 8,000 tokens (256 byte-level tokens + 7,742 merges + 2 special tokens). These are our "live" tokens.

The embedding matrix will be padded to 10,000 rows during training,
creating 2,000 "phantom" dead tokens that can never appear in data.

---

*Jeffery Harrell & Alpha, December 1, 2025*

## Parameters

In [1]:
import json
import shutil
from pathlib import Path

from transformers import GPT2TokenizerFast

# Vocabulary budget: vocab = BASE_VOCAB + TARGET_MERGES + NUM_SPECIAL_TOKENS.
# GPT-2 itself has 256 byte tokens + 50,000 merges + 1 special token
# (<|endoftext|>) = 50,257 vocab entries.
# We want 8,000 total, so: 8000 - 256 - 2 (eos, pad) = 7,742 merges.
TARGET_VOCAB = 8_000
BASE_VOCAB = 256  # Byte-level tokens
NUM_SPECIAL_TOKENS = 2  # eos (<|endoftext|>) + pad (<|pad|>)
TARGET_MERGES = TARGET_VOCAB - BASE_VOCAB - NUM_SPECIAL_TOKENS

# Directory the truncated tokenizer files are written to (relative path).
OUTPUT_DIR = Path("data/tokenizer")

print(f"Target vocab: {TARGET_VOCAB}")
print(f"Base vocab (bytes): {BASE_VOCAB}")
print(f"Target merges: {TARGET_MERGES}")

Target vocab: 8000
Base vocab (bytes): 256
Target merges: 7742


## Load GPT-2 Tokenizer

In [2]:
# Start from the stock GPT-2 tokenizer and register a dedicated padding token,
# which is appended after the existing vocabulary.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

# Report the size and the special tokens so the ids are visible in the output.
original_size = len(tokenizer)
print(f"Original GPT-2 vocab size: {original_size}")
print(f"EOS token: {tokenizer.eos_token!r} (id={tokenizer.eos_token_id})")
print(f"PAD token: {tokenizer.pad_token!r} (id={tokenizer.pad_token_id})")

Original GPT-2 vocab size: 50258
EOS token: '<|endoftext|>' (id=50256)
PAD token: '<|pad|>' (id=50257)


## Extract and Truncate

In [3]:
# Dump the full tokenizer to a scratch directory so that its vocab.json and
# merges.txt can be read back as plain files.
tmp_dir = OUTPUT_DIR / "tmp-full"
tmp_dir.mkdir(parents=True, exist_ok=True)
tokenizer.save_pretrained(tmp_dir)

# Read both files explicitly as UTF-8. Byte-level BPE tokens contain non-ASCII
# characters, so relying on the platform default encoding can corrupt them or
# raise UnicodeDecodeError on non-UTF-8 locales.
with open(tmp_dir / "vocab.json", encoding="utf-8") as f:
    full_vocab = json.load(f)

with open(tmp_dir / "merges.txt", encoding="utf-8") as f:
    lines = f.read().splitlines()

# The first line of merges.txt is a version header; the remaining lines are
# the merge rules, in file order.
header, merge_rules = lines[0], lines[1:]

print(f"Full vocab entries: {len(full_vocab)}")
print(f"Full merge rules: {len(merge_rules)}")
print(f"Header: {header}")

Full vocab entries: 50257
Full merge rules: 50000
Header: #version: 0.2


In [4]:
# Keep only the first TARGET_MERGES merge rules
truncated_merge_rules = merge_rules[:TARGET_MERGES]

# Invert token -> id into id -> token, then collect the tokens for the lowest
# BASE_VOCAB + TARGET_MERGES ids (byte tokens followed by merged tokens)
inverse_vocab = dict(zip(full_vocab.values(), full_vocab.keys()))
keep_ids = list(range(BASE_VOCAB + TARGET_MERGES))
kept_tokens = list(map(inverse_vocab.__getitem__, keep_ids))

# Append eos and pad at the end unless they already made the cut
for special in (tokenizer.eos_token, tokenizer.pad_token):
    if special in kept_tokens:
        continue
    kept_tokens.append(special)

# Re-number the surviving tokens contiguously from 0
new_vocab = dict(zip(kept_tokens, range(len(kept_tokens))))

print(f"Truncated merges: {len(truncated_merge_rules)}")
print(f"New vocab size: {len(new_vocab)}")

Truncated merges: 7742
New vocab size: 8000


## Save Truncated Tokenizer

In [5]:
# Save new vocab and merges
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Write explicitly as UTF-8: ensure_ascii=False emits raw non-ASCII token
# characters, which the platform default encoding may be unable to encode.
with open(OUTPUT_DIR / "vocab.json", "w", encoding="utf-8") as f:
    json.dump(new_vocab, f, ensure_ascii=False)

# merges.txt keeps the original version header followed by one rule per line.
with open(OUTPUT_DIR / "merges.txt", "w", encoding="utf-8") as f:
    f.write(header + "\n")
    f.writelines(f"{merge}\n" for merge in truncated_merge_rules)

print(f"Saved vocab.json ({len(new_vocab)} entries)")
print(f"Saved merges.txt ({len(truncated_merge_rules)} rules)")

Saved vocab.json (8000 entries)
Saved merges.txt (7742 rules)


In [6]:
# Build a tokenizer object from the truncated files, then re-save it so the
# output directory also receives the files written by save_pretrained
vocab_path = OUTPUT_DIR / "vocab.json"
merges_path = OUTPUT_DIR / "merges.txt"
new_tokenizer = GPT2TokenizerFast(
    vocab_file=str(vocab_path),
    merges_file=str(merges_path),
)

# Carry the special tokens over from the original tokenizer, in this order.
# GPT-2 uses eos as bos, so bos_token is set to the eos token as well.
for attr_name, special in (
    ("pad_token", tokenizer.pad_token),
    ("eos_token", tokenizer.eos_token),
    ("bos_token", tokenizer.eos_token),
):
    setattr(new_tokenizer, attr_name, special)

new_tokenizer.save_pretrained(OUTPUT_DIR)

rule = "=" * 50
print(f"\n{rule}")
print("TOKENIZER SAVED")
print(rule)
print(f"Vocab size: {len(new_tokenizer)}")
print(f"Location: {OUTPUT_DIR.absolute()}")


TOKENIZER SAVED
Vocab size: 8000
Location: /Users/jefferyharrell/Workshop/projects/Azimuth/Duckling/data/tokenizer


## Cleanup

In [7]:
# Remove the scratch directory that held the full GPT-2 tokenizer files.
# The existence check keeps this cell safe to re-run after the directory has
# already been deleted (a bare rmtree would raise FileNotFoundError).
if tmp_dir.exists():
    shutil.rmtree(tmp_dir)
print(f"Cleaned up {tmp_dir}")

Cleaned up data/tokenizer/tmp-full


## Sanity Check

In [8]:
# Reload the saved tokenizer from disk and exercise it end to end
test_tokenizer = GPT2TokenizerFast.from_pretrained(OUTPUT_DIR)

test_text = "Once upon a time there was a little girl named Lily."
tokens = test_tokenizer.encode(test_text)
decoded = test_tokenizer.decode(tokens)

print(f"Test text: {repr(test_text)}")
print(f"Token IDs: {tokens}")
print(f"Decoded: {repr(decoded)}")
print(f"Num tokens: {len(tokens)}")
print(f"\nMax token ID in encoding: {max(tokens)}")
print(f"Vocab size: {len(test_tokenizer)}")

# The reloaded tokenizer must have exactly the vocab size planned for
assert len(test_tokenizer) == TARGET_VOCAB, "Reloaded vocab size differs from TARGET_VOCAB!"

# Verify all tokens are in range
assert max(tokens) < len(test_tokenizer), "Token ID out of range!"

# Encoding then decoding must reproduce the input text exactly
assert decoded == test_text, "Round-trip encode/decode changed the text!"
print("\n✓ All token IDs within vocab range")

Test text: 'Once upon a time there was a little girl named Lily.'
Token IDs: [7454, 2402, 257, 640, 612, 373, 257, 1310, 2576, 3706, 406, 813, 13]
Decoded: 'Once upon a time there was a little girl named Lily.'
Num tokens: 13

Max token ID in encoding: 7454
Vocab size: 8000

✓ All token IDs within vocab range


## Summary

In [9]:
# Number of rows the embedding matrix is padded to during training. Rows
# beyond the live vocabulary are "phantom" tokens that never appear in data.
EMBEDDING_ROWS = 10_000

# Derive the summary figures from the saved tokenizer instead of hard-coding
# them, so the report cannot drift if the vocab size changes.
live_tokens = len(test_tokenizer)
phantom_tokens = EMBEDDING_ROWS - live_tokens

print("="*50)
print("DUCKLING TOKENIZER READY")
print("="*50)
print(f"Live tokens: {live_tokens:,}")
print(f"Phantom tokens (added during training): {phantom_tokens:,}")
print(f"Total embedding rows: {EMBEDDING_ROWS:,}")
print(f"Dead token ratio: {phantom_tokens / EMBEDDING_ROWS:.0%}")
print(f"\nLocation: {OUTPUT_DIR.absolute()}")

DUCKLING TOKENIZER READY
Live tokens: 8,000
Phantom tokens (added during training): 2,000
Total embedding rows: 10,000
Dead token ratio: 20%

Location: /Users/jefferyharrell/Workshop/projects/Azimuth/Duckling/data/tokenizer
