In [15]:
!pip install --upgrade google-cloud-aiplatform[tokenization]



In [16]:
from vertexai.preview import tokenization
import http.client
import typing
import urllib.request

# Model name - Gemini models:
# gemini-1.5-flash-001: Supports text inputs with up to 1,048,576 tokens for advanced comprehension
# gemini-1.0-pro: Supports 32,760 tokens, optimized for general-purpose language tasks

# Identifiers of the Gemini models whose tokenizers are compared in this notebook.
model_names = [
    "gemini-1.5-flash-001",
    "gemini-1.0-pro",
]

In [17]:
def tokenize_input(model_name, input_text):
    """Tokenize ``input_text`` with the tokenizer registered for ``model_name``.

    Args:
        model_name: Model identifier passed to
            ``tokenization.get_tokenizer_for_model``.
        input_text: Content handed to the tokenizer.

    Returns:
        A ``(tokens, count)`` tuple holding the results of the tokenizer's
        ``compute_tokens`` and ``count_tokens`` calls, in that order. If any
        exception is raised while obtaining or running the tokenizer, the
        error is printed and ``(None, None)`` is returned instead.
    """
    try:
        model_tokenizer = tokenization.get_tokenizer_for_model(model_name)
        # Tuple elements are evaluated left to right, so compute_tokens still
        # runs before count_tokens.
        outcome = (
            model_tokenizer.compute_tokens(input_text),
            model_tokenizer.count_tokens(input_text),
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this model.
        print(f"Error with model {model_name}: {e}")
        outcome = (None, None)
    return outcome

In [19]:
# JavaScript quicksort snippet used as the text to tokenize.
sample_input = """function quicksort(arr) {
    if (arr.length <= 1) return arr;
    let pivot = arr[Math.floor(arr.length / 2)];
    let left = arr.filter(x => x < pivot);
    let middle = arr.filter(x => x === pivot);
    let right = arr.filter(x => x > pivot);
    return quicksort(left).concat(middle, quicksort(right));
}"""

# Tokenize the sample with every configured model and report the results.
for model in model_names:
    tokens, token_count = tokenize_input(model, sample_input)

    # tokenize_input returns (None, None) on failure and has already printed
    # the error, so there is nothing further to show for this model.
    if tokens is None:
        continue

    print(f"Model: {model}")
    print("Tokens (Words/Subwords):", tokens)
    print("Token Count:", token_count)
    print()

Model: gemini-1.5-flash-001
Tokens (Words/Subwords): PreviewComputeTokensResult(tokens_info=[TokensInfo(token_ids=[1929, 150524, 641, 235278, 3665, 235275, 612, 108, 141, 648, 591, 3665, 235265, 2737, 5718, 235248, 235274, 235275, 2203, 7137, 235289, 108, 141, 1243, 44591, 589, 7137, 235309, 10278, 235265, 17612, 235278, 3665, 235265, 2737, 1148, 235248, 235284, 43096, 108, 141, 1243, 2731, 589, 7137, 235265, 6395, 235278, 235297, 1236, 1141, 968, 44591, 594, 108, 141, 1243, 7185, 589, 7137, 235265, 6395, 235278, 235297, 1236, 1141, 4637, 44591, 594, 108, 141, 1243, 1833, 589, 7137, 235265, 6395, 235278, 235297, 1236, 1141, 1562, 44591, 594, 108, 141, 773, 150524, 641, 235278, 1672, 846, 24969, 235278, 19161, 235269, 150524, 641, 235278, 1331, 2260, 108, 235270], tokens=[b'function', b' quicks', b'ort', b'(', b'arr', b')', b' {', b'\n', b'    ', b'if', b' (', b'arr', b'.', b'length', b' <=', b' ', b'1', b')', b' return', b' arr', b';', b'\n', b'    ', b'let', b' pivot', b' =', b' arr',