In [1]:
import tiktoken


In [2]:
# Path of the text file that is read by load_file() and tokenized in the cells below.
source = "data/story.txt"

In [13]:
def load_file(file_path):
    """
    Read the UTF-8 text file at `file_path` and return its contents.

    This is a best-effort loader: it never raises for read problems. On failure
    it prints a message to stdout and returns None instead.

    Returns:
        The file contents as a string, or None if the file does not exist or
        could not be read.
    """
    try:
        with open(file_path, mode="r", encoding="utf-8") as handle:
            contents = handle.read()
    except FileNotFoundError:
        # Missing file gets its own, more specific message.
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as err:
        # Any other failure (permissions, decoding, path is a directory, ...).
        print(f"Error reading file: {err}")
        return None
    return contents


In [14]:
text = load_file(source)
# load_file() signals failure by printing a message and returning None. Stop here
# in that case so the tokenization cell does not fail later with an unrelated error.
if text is None:
    raise RuntimeError(f"Could not load text from {source!r}")

In [5]:
# Tokenizer shared by the rest of the notebook: tiktoken's "o200k_harmony" encoding.
# The helper functions below fall back to this global when no `enc` is passed.
encoding = tiktoken.get_encoding("o200k_harmony")

In [None]:
# allowed_special="all": special-token strings that appear in the text
# (e.g. "<|startoftext|>") are encoded as special tokens. Without it, tiktoken
# raises a ValueError when the input contains such a string.
tokens = encoding.encode(text, allowed_special="all")

ValueError: Encountered text corresponding to disallowed special token '<|startoftext|>'.
If you want this text to be encoded as a special token, pass it to `allowed_special`, e.g. `allowed_special={'<|startoftext|>', ...}`.
If you want this text to be encoded as normal text, disable the check for this token by passing `disallowed_special=(enc.special_tokens_set - {'<|startoftext|>'})`.
To disable this check for all special tokens, pass `disallowed_special=()`.


In [7]:
# Number of tokens produced for the loaded text.
len(tokens)

259

In [8]:
# Return the special token string for a given token id if it is a special token
# in the embedding model's encoding; otherwise return None.
#
# Usage:
#   name = special_token_name(token_id, enc=encoding)
#   # or rely on the global `encoding` if set earlier in the notebook
#   name = special_token_name(token_id)
import tiktoken

# Memo of special-token lookups, one entry per encoding object:
#   id(enc) -> (enc, {token_id: special_token_string})
# The encoding is stored next to its map so the object stays alive and its id()
# cannot be reused by a different encoding.
_SPECIAL_TOKEN_MAPS = {}


def _special_token_map(enc):
    """Return the {token_id: special_token_string} map for `enc`, building it once."""
    entry = _SPECIAL_TOKEN_MAPS.get(id(enc))
    if entry is not None and entry[0] is enc:
        return entry[1]

    id_to_name = {}
    # Public API only: encode each known special-token string and keep the ones
    # that come back as exactly one id.
    for s in getattr(enc, "special_tokens_set", set()):
        ids = enc.encode(s, allowed_special="all")
        if len(ids) == 1:
            # setdefault keeps the first string seen for a given id.
            id_to_name.setdefault(ids[0], s)

    _SPECIAL_TOKEN_MAPS[id(enc)] = (enc, id_to_name)
    return id_to_name


def special_token_name(token_id: int, enc=None):
    """
    Return the special-token string for `token_id`, or None if `token_id` is not
    a special token of the encoding.

    Args:
        token_id: Token id to look up.
        enc: Encoding to consult. It must expose `special_tokens_set` and
            `encode(text, allowed_special=...)`. If omitted, the global
            `encoding` is used when present (and truthy); otherwise
            tiktoken.get_encoding("o200k_harmony") is loaded.

    Returns:
        The special token's string (e.g. "<|endoftext|>"), or None.

    Note:
        The id -> name map is built on the first call for a given encoding and
        reused afterwards, so calling this once per token does not re-encode
        every special token each time. This assumes the encoding's special
        tokens do not change after first use.
    """
    # Resolve the encoding to use
    if enc is None:
        enc = globals().get("encoding") or tiktoken.get_encoding("o200k_harmony")

    return _special_token_map(enc).get(token_id)

In [9]:
# Collect the special tokens that actually occur in `tokens`.
# Result: {token_id: special_name}, one entry per distinct special id, in order
# of first appearance.
special_tokens_found = {}
# dict.fromkeys() de-duplicates while preserving order, so each distinct id is
# looked up once instead of once per occurrence.
for tid in dict.fromkeys(tokens):
    name = special_token_name(tid, enc=encoding)
    if name is not None:
        special_tokens_found[tid] = name

special_tokens_found

{}

In [11]:
# Print all special tokens in the model (excluding names containing "reserved")
import tiktoken

def print_special_tokens(enc=None):
    """
    List the encoding's special tokens, skipping "reserved" placeholders.

    Special-token strings containing "reserved" (case-insensitive) are ignored,
    as are strings that do not encode to exactly one token id. The remaining
    tokens are printed one per line as "<id><TAB><string>", in ascending id
    order.

    Args:
        enc: Encoding to inspect. If omitted, the global `encoding` is used
            when present (and truthy); otherwise
            tiktoken.get_encoding("o200k_harmony") is loaded.

    Returns:
        The printed entries as a list of (token_id, token_string) tuples,
        sorted by token id.
    """
    if enc is None:
        enc = globals().get("encoding") or tiktoken.get_encoding("o200k_harmony")

    # Drop the reserved placeholders before encoding anything.
    visible_names = [
        token_str
        for token_str in getattr(enc, "special_tokens_set", set())
        if "reserved" not in token_str.lower()
    ]
    encoded_pairs = [
        (enc.encode(token_str, allowed_special="all"), token_str)
        for token_str in visible_names
    ]
    # Keep only strings that map to a single id, ordered by that id.
    specials = sorted(
        ((ids[0], token_str) for ids, token_str in encoded_pairs if len(ids) == 1),
        key=lambda pair: pair[0],
    )

    for token_id, token_str in specials:
        print(f"{token_id}\t{token_str}")
    return specials

In [12]:

# The function already prints the table; the trailing semicolon stops the
# notebook from also echoing the returned list, which would show it twice.
print_special_tokens();

199998	<|startoftext|>
199999	<|endoftext|>
200002	<|return|>
200003	<|constrain|>
200005	<|channel|>
200006	<|start|>
200007	<|end|>
200008	<|message|>
200012	<|call|>
200018	<|endofprompt|>


[(199998, '<|startoftext|>'),
 (199999, '<|endoftext|>'),
 (200002, '<|return|>'),
 (200003, '<|constrain|>'),
 (200005, '<|channel|>'),
 (200006, '<|start|>'),
 (200007, '<|end|>'),
 (200008, '<|message|>'),
 (200012, '<|call|>'),
 (200018, '<|endofprompt|>')]