In [16]:
from textwrap import dedent

import altair as alt
import numpy as np
import pandas as pd
import torch

from IPython.display import display, Markdown
from transformers import AutoModel, AutoTokenizer

def displaymd(text):
    """Render ``text`` as Markdown in the cell output.

    The common leading whitespace is removed first, so an indented
    triple-quoted string can be passed in as-is.
    """
    markdown = Markdown(dedent(text))
    display(markdown)

# Ask the Altair renderer to embed charts using the dark theme.
alt.renderers.set_embed_options(theme='dark')

RendererRegistry.enable('default')

# Byte-Pair Encoding numeric tokens

As a first step, it is interesting to see how numeric quantities are handled by the model's tokenizer. It seems that each number from 0 to 999 gets its own token, while larger numbers are split into at least two tokens.

In [4]:
# Candidate model identifiers for this analysis.
model_ids = [
    "allenai/OLMo-2-1124-7B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/Phi-4-mini-instruct",
    "deepseek-ai/DeepSeek-R1",
]

# Only the first entry is used by the cells below.
model_id = model_ids[0]

In [17]:
def test_numbers_are_multitokens(model_id, upper_limit=1200):
    tokenized_numbers = [
        (num, tokenizer.tokenize(str(num))) for num in range(upper_limit)
    ]

    tokenized_0_999 = tokenized_numbers[:1000]

    multitoken_numbers = [
        num for num, tokens in tokenized_numbers if len(tokens) > 1
    ]

    displaymd(f"""
    In the range of numbers 0-1200, the numbers from {min(multitoken_numbers)} to {max(multitoken_numbers)} are split into multiple tokens.

    The numbers between 0 and 999 get represented by a single token (very likely to be hardcoded and not a result of BPE).
    """)

# Run the check for the selected model with the default upper_limit of 1200.
test_numbers_are_multitokens(model_id)


In the range of numbers 0-1200, the numbers from 1000 to 1199 are split into multiple tokens.

The numbers between 0 and 999 get represented by a single token (very likely to be hardcoded and not a result of BPE).


In [18]:
# Tokenizer of the selected model; the cells below use it to encode and decode tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id)

def load_embeddings(model_id):
    """Load a model and return only its input embedding layer.

    Args:
        model_id: Model identifier handed to ``AutoModel.from_pretrained``.

    Returns:
        The module returned by the model's ``get_input_embeddings()``.
    """
    model = AutoModel.from_pretrained(model_id)
    # `get_input_embeddings()` is the architecture-independent accessor; the
    # `embed_tokens` attribute name is specific to some model classes.
    embeddings = model.get_input_embeddings()
    # Drop the local reference to the full model; only the embedding layer
    # is handed back to the caller.
    del model
    return embeddings

# Embedding layer of the selected model, as returned by load_embeddings.
embeddings = load_embeddings(model_id)

Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00, 47.25it/s]


In [19]:
# Tokenize "0".."999" as one batch. Special tokens are disabled so each row
# contains only the number's own token id(s), even for tokenizers that would
# otherwise add BOS/EOS ids around every input.
tokenized = tokenizer(
    [str(i) for i in range(1000)],
    add_special_tokens=False,
    return_attention_mask=False,
    return_tensors="pt",
)["input_ids"]
tokenized.unique().shape # to verify we get 1000 unique token ids

torch.Size([1000])

In [20]:
# Look up the embedding vector of every number token. The module is called
# directly rather than through `.forward`, so any registered hooks still run.
with torch.no_grad():
    number_embeddings = embeddings(tokenized).squeeze()
number_embeddings.shape # 1000 x d_model

torch.Size([1000, 4096])

In [21]:
from sklearn.decomposition import PCA

def plot_pca(data, title='PCA of Embeddings for Numbers 0-999', color_scheme='viridis'):
    """Display a 2-component PCA scatter plot of the rows of ``data``.

    Args:
        data: 2-D torch tensor with one embedding per row.
        title: Chart title. Defaults to the title used for the number tokens.
        color_scheme: Name of the Altair color scheme for the ``number`` channel.

    Returns:
        None. The chart is shown with ``display``.
    """
    # Detach and move to CPU so tensors that live on an accelerator or that
    # track gradients can be converted to NumPy as well.
    embeddings_array = data.detach().cpu().numpy()

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(embeddings_array)

    pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
    # Row index; sized from the data so inputs of any length work.
    pca_df['number'] = range(len(pca_df))

    chart = alt.Chart(pca_df).mark_circle(size=60).encode(
        x='PC1:Q',
        y='PC2:Q',
        color=alt.Color('number:Q', scale=alt.Scale(scheme=color_scheme)),
        tooltip=['number:Q', 'PC1:Q', 'PC2:Q']
    ).properties(
        title=title,
        height=400,
        width='container'
    ).interactive()

    display(chart)


# Project the number-token embeddings onto two principal components and show the chart.
plot_pca(number_embeddings)

In [None]:
# Fixed seed so the same 1000 token ids are drawn on every run.
torch.manual_seed(1234)
# Sample ids uniformly over the rows of the embedding table.
random_toks = torch.randint(low=0, high=embeddings.num_embeddings, size=(1000,))

# Decode the first five sampled ids as a quick look at what was drawn.
print("Sample of the random tokens: " + tokenizer.decode(random_toks[:5]))

Sample of the random tokens:
 inconvenient phon gettext fluffy przy


In [23]:
# Look up the embedding vector of every sampled token. The module is called
# directly rather than through `.forward`, so any registered hooks still run.
with torch.no_grad():
    random_embeddings = embeddings(random_toks).squeeze()
random_embeddings.shape

torch.Size([1000, 4096])

In [None]:
# Sanity check that the random sample is not the same set of vectors as the
# number embeddings. `torch.equal` yields a single bool instead of an
# element-wise boolean mask with one entry per embedding component.
torch.equal(random_embeddings, number_embeddings)

In [None]:
# NOTE(review): no custom title is passed here, so this chart is labelled
# 'PCA of Embeddings for Numbers 0-999' and its 'number' color channel is the
# row index of the random sample, not a numeric token value.
plot_pca(random_embeddings)

In [None]:
import umap

def plot_umap(data, title, color_scheme='viridis'):
    """Build a 2-D UMAP scatter plot of the rows of ``data``.

    Args:
        data: 2-D torch tensor with one embedding per row.
        title: Chart title.
        color_scheme: Name of the Altair color scheme for the ``number`` channel.

    Returns:
        The interactive Altair chart. A notebook cell that ends with a call
        to this function renders the chart as its output.
    """
    # Detach and move to CPU so the tensor can be converted to NumPy.
    embeddings_array = data.detach().to(torch.device("cpu")).numpy()

    reducer = umap.UMAP(
        n_components=2,
        random_state=125,
        n_epochs=2000
    )
    umap_result = reducer.fit_transform(embeddings_array)

    umap_df = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2'])
    # Row index; sized from the data so inputs of any length work.
    umap_df['number'] = range(len(umap_df))

    chart = alt.Chart(umap_df).mark_circle(size=60).encode(
        x='UMAP1:Q',
        y='UMAP2:Q',
        color=alt.Color('number:Q', scale=alt.Scale(scheme=color_scheme)),
        tooltip=['number:Q', 'UMAP1:Q', 'UMAP2:Q']
    ).properties(
        title=title,
    ).interactive()

    # Return the chart itself: the previous `mo.ui.altair_chart` wrapper
    # referenced a name (`mo`) that this notebook never imports.
    return chart

# 2-D UMAP projection of the number-token embeddings.
plot_umap(number_embeddings, title='UMAP of Embeddings for Numbers 0-999')


In [None]:

# Same projection for the randomly sampled tokens; here the 'number' color
# channel is only the row index of the sample.
plot_umap(random_embeddings, title='UMAP of Randomized Embeddings', color_scheme='turbo')