<a href="https://colab.research.google.com/github/kelanmail-create/colabs/blob/main/tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Interactive tokenizer playground: https://tiktokenizer.vercel.app/


Background reading on tokenization: https://ndingwall.github.io/blog/tokenization

In [None]:
!pip install -q sentencepiece datasets

In [None]:
import os, sentencepiece as spm
from datasets import load_dataset


# Scratch directory that holds the downloaded corpus and the trained models.
WORKDIR = "/content/tokenizer_demo"

# Create it up front; exist_ok keeps this cell safe to re-run.
os.makedirs(name=WORKDIR, exist_ok=True)
print("Workdir:", WORKDIR)

In [None]:
# Karpathy's Tiny Shakespeare raw text
!curl -L -o /content/tokenizer_demo/corpus.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

corpus_path = f"{WORKDIR}/corpus.txt"
!wc -c {corpus_path}
!head -n 3 {corpus_path}

In [None]:
def train_tokenizer(
    input_file,
    model_prefix,
    vocab_size=2000,
    model_type="unigram",    # can be "bpe" or "unigram"
    character_coverage=1.0,  # 1.0 for pure English
):
    """Train a SentencePiece model and report where the model file was saved.

    Args:
        input_file: Path of the text file handed to the trainer as ``input``.
        model_prefix: Output path prefix handed to the trainer; the message
            printed after training names ``{model_prefix}.model``.
        vocab_size: Vocabulary size handed to the trainer.
        model_type: Trainer algorithm, e.g. ``"unigram"`` or ``"bpe"``.
        character_coverage: Character coverage handed to the trainer.

    Returns:
        None. Progress is reported via ``print``.
    """
    print("Training SentencePiece model...")
    # Options are passed as keyword arguments instead of one space-separated
    # flag string, so a path containing whitespace is not split into several
    # (invalid) flags.
    spm.SentencePieceTrainer.Train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type=model_type,
        character_coverage=character_coverage,
        # Fixed special-token ids: <unk>=0, <pad>=1, <bos>=2, <eos>=3.
        unk_id=0,
        pad_id=1,
        bos_id=2,
        eos_id=3,
    )
    print("Saved:", f"{model_prefix}.model")

# Train one model per algorithm on the same corpus; each run uses the output
# prefix spm_<algorithm> inside WORKDIR.
for algorithm in ("unigram", "bpe"):
    train_tokenizer(
        input_file=corpus_path,
        model_prefix=f"{WORKDIR}/spm_{algorithm}",
        model_type=algorithm,
    )

In [None]:
# Load one processor per trained model file.
unigram_model_path = f"{WORKDIR}/spm_unigram.model"
bpe_model_path = f"{WORKDIR}/spm_bpe.model"
sp_uni = spm.SentencePieceProcessor(model_file=unigram_model_path)
sp_bpe = spm.SentencePieceProcessor(model_file=bpe_model_path)

# Sentences used to compare how the two tokenizers split text.
samples = [
    "Tokenization affects efficiency and quality.",
    "Unbreakable transformations are fascinating!",
    "Let's analyze subword units produced by BPE and Unigram models.",
]

def preview(sp, name, texts=None):
    """Print how one tokenizer splits each text, plus the average token count.

    For every text this prints the text itself, its ``(piece, id)`` pairs and
    the number of tokens; a final line reports the mean token count.

    Args:
        sp: Tokenizer exposing ``encode(text, out_type=...)``. It is called
            once with ``out_type=int`` and once with ``out_type=str``.
        name: Label shown in the section header.
        texts: Strings to tokenize. When omitted, the notebook-level
            ``samples`` list is used, so existing ``preview(sp, name)`` calls
            behave as before.
    """
    if texts is None:
        texts = samples
    # Materialize once so any iterable (including a generator) can be counted.
    texts = list(texts)

    print(f"\n=== {name} ===")
    total = 0
    for s in texts:
        ids = sp.encode(s, out_type=int)
        pieces = sp.encode(s, out_type=str)
        total += len(ids)
        print(f"• {s}\n  {list(zip(pieces, ids))}\n  ({len(ids)} tokens)\n")

    # With no texts there is nothing to average; report 0.00 instead of
    # raising ZeroDivisionError.
    average = total / len(texts) if texts else 0.0
    print(f"Avg length: {average:.2f} tokens\n")

# Show both tokenizers on the same sentences, Unigram first.
for processor, label in ((sp_uni, "Unigram"), (sp_bpe, "BPE")):
    preview(processor, label)