In [1]:
import tiktoken

ModuleNotFoundError: No module named 'tiktoken'

In [2]:
# Load a tokenizer by the name of its encoding.
encoding = tiktoken.get_encoding("cl100k_base")

In [3]:
# Alternative lookup: ask tiktoken for the encoding that belongs to a model name.
# This rebinds `encoding`, so the cells that follow use the object returned here.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [4]:
# Turn a string into its list of token integers.
encoding.encode("tiktoken is great!")

[83, 1609, 5963, 374, 2294, 0]

In [5]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` produces under the named encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to `tiktoken.get_encoding` to select the
            tokenizer.

    Returns:
        The length of the token sequence returned by the encoder.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)
# Same example sentence as the encode demo above, this time reduced to just its token count.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

6

In [6]:
# Reverse direction: map the token integers back to text.
encoding.decode([83, 1609, 5963, 374, 2294, 0])

'tiktoken is great!'

In [8]:
# Decode every token on its own to see the byte string each integer stands for.
[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]

[b't', b'ik', b'token', b' is', b' great', b'!']

# Comparing encodings

In [9]:
def compare_encodings(example_string: str) -> None:
    """Show how the gpt2, p50k_base and cl100k_base encodings tokenize a string.

    Writes to stdout the example string and then, for each encoding, a blank
    line followed by its token count, its token integers and the bytes of
    each individual token. Returns nothing.
    """
    print(f'\nExample string: "{example_string}"')
    for encoding_name in ("gpt2", "p50k_base", "cl100k_base"):
        codec = tiktoken.get_encoding(encoding_name)
        ids = codec.encode(example_string)
        pieces = list(map(codec.decode_single_token_bytes, ids))
        # the leading empty entry yields the blank separator line before each report
        report = [
            "",
            f"{encoding_name}: {len(ids)} tokens",
            f"token integers: {ids}",
            f"token bytes: {pieces}",
        ]
        print("\n".join(report))

In [10]:
# A single long English word: shows where each encoding places its sub-word splits.
compare_encodings("antidisestablishmentarianism")


Example string: "antidisestablishmentarianism"

gpt2: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

p50k_base: 5 tokens
token integers: [415, 29207, 44390, 3699, 1042]
token bytes: [b'ant', b'idis', b'establishment', b'arian', b'ism']

cl100k_base: 6 tokens
token integers: [519, 85342, 34500, 479, 8997, 2191]
token bytes: [b'ant', b'idis', b'establish', b'ment', b'arian', b'ism']


In [11]:
# Digits, operators and spaces: shows how each encoding treats the whitespace around symbols.
compare_encodings("2 + 2 = 4")


Example string: "2 + 2 = 4"

gpt2: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

p50k_base: 5 tokens
token integers: [17, 1343, 362, 796, 604]
token bytes: [b'2', b' +', b' 2', b' =', b' 4']

cl100k_base: 7 tokens
token integers: [17, 489, 220, 17, 284, 220, 19]
token bytes: [b'2', b' +', b' ', b'2', b' =', b' ', b'4']


In [12]:
# Non-Latin (Devanagari) text: tokens here can be fragments of a character's UTF-8 bytes.
compare_encodings("किसी के अकाउंट में पैसे भेजने की जरूरत है।")


Example string: "किसी के अकाउंट में पैसे भेजने की जरूरत है।"

gpt2: 67 tokens
token integers: [11976, 243, 11976, 123, 11976, 116, 24231, 222, 28225, 243, 24231, 229, 28225, 227, 11976, 243, 48077, 11976, 231, 11976, 224, 11976, 253, 28225, 106, 24231, 229, 11976, 224, 28225, 103, 24231, 230, 11976, 116, 24231, 229, 28225, 255, 24231, 229, 11976, 250, 11976, 101, 24231, 229, 28225, 243, 24231, 222, 28225, 250, 11976, 108, 24231, 224, 11976, 108, 11976, 97, 28225, 117, 24231, 230, 24231, 97]
token bytes: [b'\xe0\xa4', b'\x95', b'\xe0\xa4', b'\xbf', b'\xe0\xa4', b'\xb8', b'\xe0\xa5', b'\x80', b' \xe0\xa4', b'\x95', b'\xe0\xa5', b'\x87', b' \xe0\xa4', b'\x85', b'\xe0\xa4', b'\x95', b'\xe0\xa4\xbe', b'\xe0\xa4', b'\x89', b'\xe0\xa4', b'\x82', b'\xe0\xa4', b'\x9f', b' \xe0\xa4', b'\xae', b'\xe0\xa5', b'\x87', b'\xe0\xa4', b'\x82', b' \xe0\xa4', b'\xaa', b'\xe0\xa5', b'\x88', b'\xe0\xa4', b'\xb8', b'\xe0\xa5', b'\x87', b' \xe0\xa4', b'\xad', b'\xe0\xa5', b'\x87', b'\xe0\xa4', b'\x9c', b'\xe

# Counting tokens for chat API calls

In [14]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a list of messages.

    Args:
        messages: Iterable of chat messages. Each message is a dict whose
            values are strings (e.g. "role", "content" and optionally
            "name"); every value is tokenized and counted.
        model: Model whose message framing is assumed. Supported snapshots
            are "gpt-3.5-turbo-0301" and "gpt-4-0314"; the unversioned names
            "gpt-3.5-turbo" and "gpt-4" are counted as those snapshots after
            printing a warning.

    Returns:
        The token count: a fixed per-message overhead, the tokens of every
        field value, a per-"name" adjustment, plus 3 for the reply primer.

    Raises:
        NotImplementedError: If `model` is not one of the supported names.
    """
    # Resolve unversioned names up front so the tokenizer is looked up once,
    # for the snapshot that is actually counted, instead of being loaded for
    # the alias and then loaded again for the snapshot.
    pinned_snapshots = {
        "gpt-3.5-turbo": "gpt-3.5-turbo-0301",
        "gpt-4": "gpt-4-0314",
    }
    if model in pinned_snapshots:
        snapshot = pinned_snapshots[model]
        print(f"Warning: {model} may change over time. Returning num tokens assuming {snapshot}.")
        model = snapshot
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name; fall back to a named encoding
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif model == "gpt-4-0314":
        tokens_per_message = 3
        tokens_per_name = 1
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

In [19]:
import openai
from config import API_KEY
# The key comes from the local `config` module imported above, so it is not written into the notebook.
openai.api_key  = API_KEY

# Sample conversation used to check the token count: one system instruction,
# four few-shot examples written as system messages carrying a "name" field
# (alternating example_user / example_assistant), and a final user request.
example_messages = [
    {
        "role": "system",
        "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English.",
    },
    {
        "role": "system",
        "name": "example_user",
        "content": "New synergies will help drive top-line growth.",
    },
    {
        "role": "system",
        "name": "example_assistant",
        "content": "Things working well together will increase revenue.",
    },
    {
        "role": "system",
        "name": "example_user",
        "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage.",
    },
    {
        "role": "system",
        "name": "example_assistant",
        "content": "Let's talk later when we're less busy about how to do better.",
    },
    {
        "role": "user",
        "content": "This late pivot means we don't have time to boil the ocean for the client deliverable.",
    },
]

# Cross-check: for each model, print the locally computed prompt-token count
# next to the prompt-token usage the API reports for the same messages.
# NOTE(review): both list entries name the same model, so the identical request
# is sent twice — confirm whether a second model was intended here.
for model in ["gpt-3.5-turbo-0301", "gpt-3.5-turbo-0301"]:
    print(model)
    # example token count from the function defined above
    print(f"{num_tokens_from_messages(example_messages, model)} prompt tokens counted by num_tokens_from_messages().")
    # example token count from the OpenAI API
    response = openai.ChatCompletion.create(
        model=model,
        messages=example_messages,
        temperature=0,
        max_tokens=1  # we're only counting input tokens here, so let's not waste tokens on the output
    )
    # the API response carries its own prompt-token count under "usage"
    print(f'{response["usage"]["prompt_tokens"]} prompt tokens counted by the OpenAI API.')
    print()

gpt-3.5-turbo-0301
127 prompt tokens counted by num_tokens_from_messages().
127 prompt tokens counted by the OpenAI API.

gpt-3.5-turbo-0301
127 prompt tokens counted by num_tokens_from_messages().
127 prompt tokens counted by the OpenAI API.

