In [18]:
!pip install tiktoken
!pip install openai



In [19]:
import openai
import tiktoken

# tiktoken encoding names and the OpenAI models associated with each:
#   o200k_base  -> gpt-4o, gpt-4o-mini
#   cl100k_base -> gpt-4-turbo, gpt-4, gpt-3.5-turbo, text-embedding-ada-002,
#                  text-embedding-3-small, text-embedding-3-large
#   p50k_base   -> Codex models, text-davinci-002, text-davinci-003
#   r50k_base   -> (or gpt2) GPT-3 models like davinci
#
# Order matters only for presentation: encodings are listed newest first.
encoding_names = [
    "o200k_base",
    "cl100k_base",
    "p50k_base",
    "r50k_base",
]

In [20]:
def tokenize_input(encoding_name, input_text):
    """Tokenize ``input_text`` with the tiktoken encoding called ``encoding_name``.

    Each token id produced by the encoder is decoded on its own, so the
    result shows the text fragment behind every individual token.

    Args:
        encoding_name: Name passed to ``tiktoken.get_encoding``.
        input_text: Text to encode.

    Returns:
        A ``(fragments, count)`` tuple where ``fragments`` is the list of
        per-token decoded strings and ``count`` is its length. If anything
        raises along the way, the error is printed and ``(None, None)`` is
        returned instead, so callers must check for ``None``.
    """
    try:
        enc = tiktoken.get_encoding(encoding_name)

        fragments = []
        for tid in enc.encode(input_text):
            # Decode a single-element list so each token maps to one fragment.
            # NOTE(review): a token that splits a multi-byte character may not
            # decode to clean text on its own — confirm against tiktoken docs.
            fragments.append(enc.decode([tid]))
    except Exception as e:
        # Best-effort: report the failure and signal it via the None pair.
        print(f"Error with encoding {encoding_name}: {e}")
        return None, None

    return fragments, len(fragments)

In [21]:
# JavaScript quicksort snippet used as the text to tokenize.
# Built from adjacent string literals; there is no newline after the final "}".
sample_input = (
    "function quicksort(arr) {\n"
    "    if (arr.length <= 1) return arr;\n"
    "    let pivot = arr[Math.floor(arr.length / 2)];\n"
    "    let left = arr.filter(x => x < pivot);\n"
    "    let middle = arr.filter(x => x === pivot);\n"
    "    let right = arr.filter(x => x > pivot);\n"
    "    return quicksort(left).concat(middle, quicksort(right));\n"
    "}"
)

In [22]:
# Tokenize the sample with every encoding and print a short report for each.
# Loop variable names are kept as-is: at notebook top level they remain
# globals after the cell runs.
for encoding in encoding_names:
    tokens, token_count = tokenize_input(encoding, sample_input)

    # tokenize_input returns (None, None) after printing its own error
    # message, so there is nothing further to report for that encoding.
    if tokens is None:
        continue

    print(f"Encoding: {encoding}")
    print("Tokens (Words/Subwords):", tokens)
    print("Token Count:", token_count)
    print()

Encoding: o200k_base
Tokens (Words/Subwords): ['function', ' quick', 'sort', '(arr', ')', ' {\n', '   ', ' if', ' (', 'arr', '.length', ' <=', ' ', '1', ')', ' return', ' arr', ';\n', '   ', ' let', ' pivot', ' =', ' arr', '[Math', '.floor', '(arr', '.length', ' /', ' ', '2', ')];\n', '   ', ' let', ' left', ' =', ' arr', '.filter', '(x', ' =>', ' x', ' <', ' pivot', ');\n', '   ', ' let', ' middle', ' =', ' arr', '.filter', '(x', ' =>', ' x', ' ===', ' pivot', ');\n', '   ', ' let', ' right', ' =', ' arr', '.filter', '(x', ' =>', ' x', ' >', ' pivot', ');\n', '   ', ' return', ' quick', 'sort', '(left', ').', 'concat', '(m', 'iddle', ',', ' quick', 'sort', '(right', '));\n', '}']
Token Count: 82

Encoding: cl100k_base
Tokens (Words/Subwords): ['function', ' quick', 'sort', '(arr', ')', ' {\n', '   ', ' if', ' (', 'arr', '.length', ' <=', ' ', '1', ')', ' return', ' arr', ';\n', '   ', ' let', ' pivot', ' =', ' arr', '[Math', '.floor', '(arr', '.length', ' /', ' ', '2', ')];\n', '   ',