In [2]:
from tiktoken import encoding_for_model

In [4]:
model1 = encoding_for_model("gpt-3.5")
model2 = encoding_for_model("gpt-4")

### when passing name into tiktoken encoding model it gives more than 1 token based on lenght of the name

In [55]:
text = "jaswabiju"
encoding1 = tiktoken.encoding_for_model("gpt-3.5-turbo")
encoding2 = tiktoken.encoding_for_model("gpt-4")

token1 = encoding1.encode(text)
token2 = encoding2.encode(text)

print(len(token1))  # 6
print(len(token2))  # 6
print([encoding2.decode([t]) for t in token2])


4
4
['jas', 'w', 'ab', 'iju']


In [58]:
import tiktoken

def compare_token_counts(text: str):
    enc_35 = tiktoken.encoding_for_model("gpt-3.5-turbo")
    enc_4 = tiktoken.encoding_for_model("gpt-4")

    tokens_35 = enc_35.encode(text)
    tokens_4 = enc_4.encode(text)

    print(f"Text: {text!r}")
    print(f"gpt-3.5 tokens ({len(tokens_35)}): {tokens_35}")
    print(f"gpt-4   tokens ({len(tokens_4)}): {tokens_4}")
    print([enc_35.decode([t]) for t in tokens_35])
    print([enc_4.decode([t]) for t in tokens_4])
    print("-" * 60)

# Basic sentence
compare_token_counts("Hello, how are you doing today?")

# Emoji and punctuation
compare_token_counts("🎉👍😄!!??")

# Long compound word (not in vocabulary)
compare_token_counts("bioluminescentdinoflagellates")

# Hyphenated word
compare_token_counts("state-of-the-art")

# CamelCase
compare_token_counts("ThisIsCamelCaseText")

# Code snippet
compare_token_counts("def count_tokens(text): return len(text.split())")

# Repeated characters
compare_token_counts("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")

# Non-English text
compare_token_counts("नमस्ते दुनिया")  # Hindi
compare_token_counts("こんにちは世界")   # Japanese


Text: 'Hello, how are you doing today?'
gpt-3.5 tokens (8): [9906, 11, 1268, 527, 499, 3815, 3432, 30]
gpt-4   tokens (8): [9906, 11, 1268, 527, 499, 3815, 3432, 30]
['Hello', ',', ' how', ' are', ' you', ' doing', ' today', '?']
['Hello', ',', ' how', ' are', ' you', ' doing', ' today', '?']
------------------------------------------------------------
Text: '🎉👍😄!!??'
gpt-3.5 tokens (10): [9468, 236, 231, 9468, 239, 235, 76460, 226, 3001, 7801]
gpt-4   tokens (10): [9468, 236, 231, 9468, 239, 235, 76460, 226, 3001, 7801]
['�', '�', '�', '�', '�', '�', '�', '�', '!!', '??']
['�', '�', '�', '�', '�', '�', '�', '�', '!!', '??']
------------------------------------------------------------
Text: 'bioluminescentdinoflagellates'
gpt-3.5 tokens (9): [8385, 1152, 1572, 1189, 73911, 1073, 13667, 616, 988]
gpt-4   tokens (9): [8385, 1152, 1572, 1189, 73911, 1073, 13667, 616, 988]
['bi', 'olum', 'ines', 'cent', 'din', 'of', 'lag', 'ell', 'ates']
['bi', 'olum', 'ines', 'cent', 'din', 'of', 'lag', '

### Main Function

In [47]:
from typing import Union, List, Dict
from tiktoken import encoding_for_model

def count_chat_token(model="gpt-3.5",text=Union[str, List[Dict[str,str]]]) -> int:
    encoding = encoding_for_model(model)
    if model == "gpt-3.5":
        token_per_message = 4
        token_per_name = -1
    elif model == "gpt-4":
        token_per_message = 3
        token_per_name = 1
    else:
        raise NotImplementedError(f"Token counting not implemented for {model}")

    num_token = 0
    if isinstance(text, str):
        
        num_token += token_per_message
        num_token += len(encoding.encode(text))
        num_token += 3
        return num_token

    elif isinstance(text,list):
        for msg in text:
            num_token += token_per_message
            for key,value in msg.items():

                num_token += len(encoding.encode(value))
                print(value,len(encoding.encode(value)))
                
                if key == "name":
                    num_token += token_per_name
        num_token += 3
        return num_token

    else:
        raise TypeError(f"unsupported input type")

            
        

    

In [52]:
messages = [
    {"role": "user", "content": "hello"},
    {"role": "user", "content": "how are you"},

]
count_chat_token("gpt-3.5",messages)

user 1
hello 1
user 1
how are you 3


17