In [None]:
# Reference: OpenAI Cookbook, "How to count tokens with tiktoken"
# https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken

In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.11.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading tiktoken-0.11.0-cp313-cp313-macosx_11_0_arm64.whl (997 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m997.1/997.1 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.11.0


In [1]:
import tiktoken

# Step 1 - choose the tokenizer.
# This walkthrough runs on "o200k_base". Request "cl100k_base" instead to get
# the encoding used by GPT-4, GPT-3.5-Turbo, and others.
encoding = tiktoken.get_encoding("o200k_base")

# Step 2 - the sample string that gets round-tripped.
# Another input worth trying: "listen labs .ai /p uzzle"
text = "Hello, world! How are you?"

rule = "-" * 30  # visual separator between the sections of the printout

print("Original text: '{}'".format(text))
print(rule)

# Step 3 - text -> token IDs.
# `encode()` returns the sequence of integer token IDs for the string.
token_ids = encoding.encode(text)

print("Encoded token IDs: {}".format(token_ids))
print("Number of tokens: {}".format(len(token_ids)))


def _token_as_text(token_id):
    """Return the text of a single token; undecodable bytes are replaced."""
    raw_bytes = encoding.decode_single_token_bytes(token_id)
    return raw_bytes.decode('utf-8', errors='replace')


# Show the individual pieces so it is visible where the tokenizer split the
# input (punctuation such as the comma after "Hello" is its own token).
decoded_tokens = list(map(_token_as_text, token_ids))
print("Decoded tokens: {}".format(decoded_tokens))

print(rule)

# Step 4 - token IDs -> text.
# `decode()` is the inverse of `encode()`.
decoded_text = encoding.decode(token_ids)

print("Decoded text: '{}'".format(decoded_text))

# The round trip must reproduce the input exactly.
assert decoded_text == text

print(rule)


Original text: 'Hello, world! How are you?'
------------------------------
Encoded token IDs: [13225, 11, 2375, 0, 3253, 553, 481, 30]
Number of tokens: 8
Decoded tokens: ['Hello', ',', ' world', '!', ' How', ' are', ' you', '?']
------------------------------
Decoded text: 'Hello, world! How are you?'
------------------------------


In [2]:
# Encode text to token
#
# Interactive prompt: type some text and see the token IDs it encodes to.
# Requires the tiktoken package (pip install tiktoken).

import tiktoken

# Tokenizer for this session. "o200k_base" is selected; swap in "cl100k_base"
# for the encoding used by GPT-4 and GPT-3.5-Turbo.
encoding = tiktoken.get_encoding("o200k_base")

separator = "-" * 40
exit_words = ("quit", "exit")  # matched case-insensitively

# Keep prompting until the user enters one of the exit words.
while True:
    print(separator)
    user_input = input("Enter text to tokenize (or type 'quit' to exit): ")

    if user_input.lower() not in exit_words:
        # Ordinary input: encode it and report the IDs and their count.
        tokens = encoding.encode(user_input)
        print("\nOriginal Text: '{}'".format(user_input))
        print("Token IDs: {}".format(tokens))
        print("Number of tokens: {}".format(len(tokens)))
        continue

    # Exit word entered: leave the loop.
    print("Exiting...")
    break

----------------------------------------


Enter text to tokenize (or type 'quit' to exit):  Let there be light



Original Text: 'Let there be light'
Token IDs: [12845, 1354, 413, 4207]
Number of tokens: 4
----------------------------------------


Enter text to tokenize (or type 'quit' to exit):  quit


Exiting...


In [4]:
# Decode token to text
#
# Interactive prompt: enter one integer token ID and see the text it decodes to.

import tiktoken

# Tokenizer for this session. "o200k_base" is selected; swap in "cl100k_base"
# for the encoding used by GPT-4 and GPT-3.5-Turbo.
encoding = tiktoken.get_encoding("o200k_base")

# Keep prompting until the user types 'quit' or 'exit' (case-insensitive).
while True:
    print("-" * 40)
    user_input = input("Enter a token ID (e.g., 24912) to decode, or type 'quit' to exit: ")

    # Check for the exit command.
    if user_input.lower() in ['quit', 'exit']:
        print("Exiting...")
        break

    try:
        # Convert the user's input string into an integer.
        token_id = int(user_input)

        # Decode the token ID. The `decode` method requires a list of integers.
        decoded_text = encoding.decode([token_id])

        # Print the results.
        print(f"\nToken ID: {token_id}")
        print(f"Decoded Text: '{decoded_text}'")

    except ValueError:
        # Handle cases where the user enters non-numeric input.
        print("Invalid input. Please enter a valid number.")

    except (KeyError, OverflowError):
        # A well-formed integer can still be an ID the tokenizer cannot decode.
        # NOTE(review): tiktoken 0.11 is expected to raise KeyError for an ID
        # outside the vocabulary and OverflowError for a negative ID - confirm
        # for other versions. Without this clause such input ends the loop
        # with a traceback.
        print(f"Token ID {token_id} is not a valid token for this encoding.")


----------------------------------------


Enter a token ID (e.g., 24912) to decode, or type 'quit' to exit:  24912



Token ID: 24912
Decoded Text: 'hello'
----------------------------------------


Enter a token ID (e.g., 24912) to decode, or type 'quit' to exit:  quit


Exiting...
