## Topic: Tokenization

In [2]:
from transformers import AutoTokenizer

In [3]:
# Load the pretrained tokenizer registered under the "gpt2" model name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "The quick brown fox jumps over the lazy dog"
# Two steps: split the text into token strings, then look up the id of each token.
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)

In [4]:
# Show the token strings and their ids so the two lists can be compared position by position.
print("Tokens:", tokens)
print("Token IDs:", ids)

Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']
Token IDs: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290]


## Step 1: Setup — Install & Import Dependencies

In [5]:
!pip install -q transformers

from transformers import AutoTokenizer

## Step 2: Choose a Model and Tokenize Text

In [6]:
# Tokenizers already loaded in this session, keyed by model name, so repeated
# calls do not rebuild the same tokenizer.
_TOKENIZER_CACHE = {}


def tokenize_text(text, model_name="gpt2"):
    """Tokenize ``text`` with the pretrained tokenizer for ``model_name``.

    Args:
        text: The string to tokenize.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
            Defaults to ``"gpt2"``.

    Returns:
        A ``(tokens, token_ids)`` tuple: the token strings produced by
        ``tokenizer.tokenize`` and the ids that
        ``tokenizer.convert_tokens_to_ids`` assigns to those tokens.
    """
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        # Load once per model name; later calls reuse the same tokenizer object.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer

    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    return tokens, token_ids

In [7]:
# Tokenize a sample sentence; no model name is passed, so tokenize_text uses its default ("gpt2").
sample_text = "The quick brown fox jumps over the lazy dog"
tokens, token_ids = tokenize_text(sample_text)

# Report the token strings, their ids, and how many tokens the sentence produced.
print("Tokens:", tokens)
print("Token IDs:", token_ids)
print("Number of tokens:", len(tokens))

Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']
Token IDs: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290]
Number of tokens: 9


## Step 3: Add User Input and Model Selector (Interactive)

In [8]:
def interactive_tokenizer():
    """Prompt for a model and a text, then print the resulting tokenization.

    Lists the available models, reads a model number and a text with
    ``input``, tokenizes the text via ``tokenize_text`` and prints the model
    name, the tokens, the token ids and the token count.

    A model number that is not an integer, or that is outside the listed
    range, falls back to ``gpt2`` (index 0) after printing a notice.

    Returns:
        None. All results are printed.
    """
    print("Choose a model from these options:")
    models = ["gpt2", "bert-base-uncased", "distilbert-base-uncased"]
    for i, m in enumerate(models):
        print(f"{i}: {m}")

    try:
        model_idx = int(input("Enter model number (e.g. 0): "))
    except ValueError:
        # Non-numeric input (e.g. "" or "gpt2") gets the same fallback as an
        # out-of-range number instead of aborting the cell with a traceback.
        model_idx = -1
    if not 0 <= model_idx < len(models):
        print("Invalid model choice. Using gpt2.")
        model_idx = 0
    model_name = models[model_idx]

    text = input("Enter text to tokenize: ")
    tokens, token_ids = tokenize_text(text, model_name)

    print(f"\nModel: {model_name}")
    print("Tokens:", tokens)
    print("Token IDs:", token_ids)
    print("Number of tokens:", len(tokens))

# Run the prompt once; execution waits here until both inputs have been entered.
interactive_tokenizer()

Choose a model from these options:
0: gpt2
1: bert-base-uncased
2: distilbert-base-uncased
Enter model number (e.g. 0): 1
Enter text to tokenize: The quick brown fox jumps over the lazy dog


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Model: bert-base-uncased
Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Token IDs: [1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899]
Number of tokens: 9


## Step 4: Pretty-print output & add token count with cost estimation

In [10]:
def interactive_tokenizer_pretty():
    """Prompt for a model and a text, then print an aligned token/id table.

    Lists the available models, reads a model number and a text with
    ``input``, tokenizes the text via ``tokenize_text`` and prints one
    ``token -> id`` row per token, the total token count and an example cost
    estimate.

    A model number that is not an integer, or that is outside the listed
    range, falls back to ``gpt2`` (index 0) after printing a notice.

    Returns:
        None. All results are printed.
    """
    print("Choose a model from these options:")
    models = ["gpt2", "bert-base-uncased", "distilbert-base-uncased"]
    for i, m in enumerate(models):
        print(f"{i}: {m}")

    try:
        model_idx = int(input("Enter model number (e.g. 0): "))
    except ValueError:
        # Non-numeric input (e.g. "" or "gpt2") gets the same fallback as an
        # out-of-range number instead of aborting the cell with a traceback.
        model_idx = -1
    if not 0 <= model_idx < len(models):
        print("Invalid model choice. Using gpt2.")
        model_idx = 0
    model_name = models[model_idx]

    text = input("Enter text to tokenize: ")
    tokens, token_ids = tokenize_text(text, model_name)

    print(f"\nModel: {model_name}")
    print("\nTokens and IDs:")
    for t, tid in zip(tokens, token_ids):
        # Pad the token to 15 characters so the ids line up in a column.
        print(f"{t:15} -> {tid}")

    token_count = len(tokens)
    print(f"\nTotal tokens: {token_count}")

    # Example cost estimation (GPT-4: $0.03 per 1K tokens).
    # The count comes from the tokenizer selected above, so the figure is only
    # an illustration of the arithmetic, not a quote for that model.
    cost_per_1k = 0.03
    estimated_cost = token_count * (cost_per_1k / 1000)
    print(f"Estimated cost (GPT-4 style): ${estimated_cost:.6f}")

# Run the pretty-printing prompt once; execution waits here until both inputs have been entered.
interactive_tokenizer_pretty()

Choose a model from these options:
0: gpt2
1: bert-base-uncased
2: distilbert-base-uncased
Enter model number (e.g. 0): 2
Enter text to tokenize: The quick brown fox jumps over the lazy dog

Model: distilbert-base-uncased

Tokens and IDs:
the             -> 1996
quick           -> 4248
brown           -> 2829
fox             -> 4419
jumps           -> 14523
over            -> 2058
the             -> 1996
lazy            -> 13971
dog             -> 3899

Total tokens: 9
Estimated cost (GPT-4 style): $0.000270


## Step 5: Compare Multiple Model Tokenizers Side-by-Side

In [11]:
def compare_tokenizers(text, models=None):
    """Print the tokenization of ``text`` under several models, one after another.

    Args:
        text: The string to tokenize.
        models: Iterable of model names to compare. When ``None``, uses
            ``gpt2``, ``bert-base-uncased`` and ``distilbert-base-uncased``.

    Returns:
        None. For each model a header line, the tokens (with their count) and
        the token ids are printed.
    """
    if models is None:
        models = ["gpt2", "bert-base-uncased", "distilbert-base-uncased"]

    for model_name in models:
        # Reuse the notebook's tokenize_text helper rather than repeating its
        # load / tokenize / convert steps here.
        tokens, token_ids = tokenize_text(text, model_name)

        print(f"\n=== Model: {model_name} ===")
        print(f"Tokens ({len(tokens)}):")
        print(tokens)
        print("Token IDs:")
        print(token_ids)

In [12]:
# Example usage: read one text from the user and tokenize it with every default model.
input_text = input("Enter text to tokenize across models: ")
compare_tokenizers(input_text)

Enter text to tokenize across models: The quick brown fox jumps over the lazy dog

=== Model: gpt2 ===
Tokens (9):
['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog']
Token IDs:
[464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290]

=== Model: bert-base-uncased ===
Tokens (9):
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Token IDs:
[1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899]

=== Model: distilbert-base-uncased ===
Tokens (9):
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Token IDs:
[1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899]
