In [1]:
from textwrap import dedent
from functools import cache

import altair as alt
import numpy as np
import pandas as pd
import torch
import warnings

# Silence every FutureWarning for the rest of the notebook session.
warnings.filterwarnings("ignore", category=FutureWarning)

from IPython.display import display, Markdown
from transformers import AutoModel, AutoTokenizer

def displaymd(text):
    """Render ``text`` as Markdown in the cell output.

    Common leading indentation is stripped first, so indented triple-quoted
    strings can be passed as-is.
    """
    rendered = Markdown(dedent(text))
    display(rendered)

# Lift Altair's cap on the number of rows a chart's dataset may contain.
alt.data_transformers.disable_max_rows()
# Ask the renderer to embed charts with the dark theme.
alt.renderers.set_embed_options(theme='dark')

RendererRegistry.enable('default')

# Byte-Pair Encoding numeric tokens

As a first approach, it is worth looking at how numeric quantities are tokenized. For most of the models checked below, every integer from 0 to 999 gets its own token, while larger numbers are split into at least two tokens; the Qwen3 tokenizers are the exception and already split numbers from 10 upwards.

In [2]:
# Short alias -> model identifier string. The values are what the rest of the
# notebook passes to `AutoTokenizer.from_pretrained` / `AutoModel.from_pretrained`.
model_ids = {
    "olmo": "allenai/OLMo-2-1124-7B",
    "llama": "meta-llama/Llama-3.2-3B-Instruct",
    "phi": "microsoft/Phi-4-mini-instruct",
    "deepseek": "deepseek-ai/DeepSeek-R1",
    "qwen3-0.6b": "Qwen/Qwen3-0.6B",
    "qwen3-1.7b": "Qwen/Qwen3-1.7B",
    "qwen3-32b": "Qwen/Qwen3-32B",
}

In [3]:
@cache
def smallest_multitoken_number(model_id, upper_limit=1200):
    """Return the first integer in ``range(upper_limit)`` that needs more than one token.

    The tokenizer for ``model_id`` is loaded with ``AutoTokenizer.from_pretrained``
    and each candidate is tokenized from its decimal string form, in increasing
    order. Results are memoized by ``functools.cache``.

    Returns:
        The smallest such integer, or ``None`` when every integer below
        ``upper_limit`` tokenizes to at most one token.
    """
    tok = AutoTokenizer.from_pretrained(model_id)

    # Lazy generator: tokenization stops at the first multi-token candidate.
    multitoken = (
        candidate
        for candidate in range(upper_limit)
        if len(tok.tokenize(str(candidate))) > 1
    )
    return next(multitoken, None)

# For every configured model, record the first integer that is split into
# several tokens (keyed by the full model identifier, not the short alias).
smallest_multitokens = dict(
    (model_id, smallest_multitoken_number(model_id))
    for model_id in model_ids.values()
)

smallest_multitokens

{'allenai/OLMo-2-1124-7B': 1000,
 'meta-llama/Llama-3.2-3B-Instruct': 1000,
 'microsoft/Phi-4-mini-instruct': 1000,
 'deepseek-ai/DeepSeek-R1': 1000,
 'Qwen/Qwen3-0.6B': 10,
 'Qwen/Qwen3-1.7B': 10,
 'Qwen/Qwen3-32B': 10}

In [4]:
def load_embeddings(model_id):
    """Load ``model_id`` and return only its input token-embedding module.

    The full model is instantiated with ``AutoModel.from_pretrained`` and put
    in eval mode; only the embedding module is returned, so the remaining
    model weights become unreferenced as soon as this function returns.

    Args:
        model_id: Identifier accepted by ``AutoModel.from_pretrained``.

    Returns:
        The module that maps token ids to embedding vectors.
    """
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    # Use the architecture-independent accessor instead of reaching for the
    # `embed_tokens` attribute, which not every model class exposes under
    # that name. For models that do define `embed_tokens`, this returns it.
    return model.get_input_embeddings()

class EmbeddingsLoader:
    """Pairs a tokenizer with its model's input-embedding module for direct lookups.

    Attributes:
        tokenizer: Tokenizer loaded with ``AutoTokenizer.from_pretrained(model_id)``.
        embeddings: Embedding module returned by ``load_embeddings(model_id)``.
    """

    def __init__(self, model_id):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.embeddings = load_embeddings(model_id)

    def __repr__(self):
        return f'EmbeddingsLoader("{self.tokenizer.name_or_path}")'

    def tokenize(self, words):
        """Return the ``input_ids`` tensor for ``words`` without special tokens.

        No padding is requested, so this presumably requires every entry of a
        list input to tokenize to the same number of tokens — TODO confirm.
        """
        return self.tokenizer(
            words,
            add_special_tokens=False,
            return_attention_mask=False,
            return_tensors="pt",
        )["input_ids"]

    def from_tokens(self, tokens):
        """Look up the embeddings for a tensor of token ids, without gradients.

        The result is passed through ``.squeeze()``, which drops *every*
        size-1 dimension: an ``(N, 1)`` id tensor yields ``(N, D)``, and a
        single token id yields a 1-D vector.
        """
        with torch.no_grad():
            # Call the module itself rather than `.forward()` so that any
            # registered module hooks are honoured.
            return self.embeddings(tokens).squeeze()

    def from_words(self, words):
        """Tokenize ``words`` and return the corresponding embeddings."""
        return self.from_tokens(self.tokenize(words))

    def numbers(self, upper_limit=None):
        """Return the embeddings of the integers ``0 .. upper_limit - 1``.

        When ``upper_limit`` is ``None`` it defaults to
        ``smallest_multitoken_number`` for this tokenizer, i.e. only numbers
        that are a single token are included.

        NOTE: if that lookup finds no multi-token number it returns ``None``,
        and ``range(None)`` below raises ``TypeError``.
        """
        if upper_limit is None:
            upper_limit = smallest_multitoken_number(self.tokenizer.name_or_path)

        return self.from_words([str(i) for i in range(upper_limit)])

    def random(self, n=1000, seed=None):
        """Return the embeddings of ``n`` token ids drawn uniformly at random.

        Ids are sampled (with replacement) from
        ``[0, self.embeddings.num_embeddings)``.

        Args:
            n: Number of token ids to sample.
            seed: Optional seed. When given, sampling uses a private
                ``torch.Generator`` so the result is reproducible *without*
                reseeding the process-wide torch RNG as a side effect. When
                ``None``, the global RNG is used.
        """
        generator = None
        if seed is not None:
            generator = torch.Generator().manual_seed(seed)

        random_tokens = torch.randint(
            0,
            self.embeddings.num_embeddings,
            (n,),
            generator=generator,
        )
        return self.from_tokens(random_tokens)


In [9]:
from sklearn.manifold import TSNE
import umap
from sklearn.decomposition import PCA


class EmbeddingsAnalyzer:
    """Projects a set of embeddings to 2-D and renders Altair scatter plots.

    Args:
        embeddings: Array-like with one row per embedding; it is handed
            unchanged to the ``fit_transform`` of each reducer.
        model_id: Model identifier, used only in chart titles.
        label: Human-readable description, used only in chart titles.
        color_scheme: Name of the color scheme for the ``number`` color scale.
        **alt_props: Extra chart properties forwarded to ``.properties(...)``;
            ``width`` defaults to 500.
    """

    def __init__(self, embeddings, model_id, label, color_scheme='viridis', **alt_props):
        self.embeddings = embeddings
        self.model_id = model_id
        self.label = label
        self.color_scheme = color_scheme

        alt_props.setdefault('width', 500)
        self.alt_props = alt_props

    def __str__(self):
        return f'({self.model_id}) {self.label}'

    @staticmethod
    def _projection_frame(coords, columns):
        """Wrap 2-D coordinates in a DataFrame and add a ``number`` row index.

        ``number`` runs from 0 to the number of rows minus one, so it follows
        the actual sample count instead of assuming exactly 1000 rows.
        """
        frame = pd.DataFrame(coords, columns=columns)
        frame['number'] = range(len(frame))
        return frame

    def _scatter(self, frame, method_name):
        """Build the interactive scatter chart shared by all projections."""
        x_col, y_col = frame.columns[:2]
        return alt.Chart(frame).mark_circle(size=60).encode(
            x=f'{x_col}:Q',
            y=f'{y_col}:Q',
            color=alt.Color('number:Q', scale=alt.Scale(scheme=self.color_scheme)),
            tooltip=['number:Q', f'{x_col}:Q', f'{y_col}:Q']
        ).properties(
            title=f'{self}: {method_name}',
            **self.alt_props
        ).interactive()

    def pca(self):
        """Return a scatter chart of the first two principal components."""
        coords = PCA(n_components=2).fit_transform(self.embeddings)
        return self._scatter(self._projection_frame(coords, ['PC1', 'PC2']), 'PCA')

    def tsne(self, n_iter=1000, random_state=None, perplexity=50, **kwargs):
        """Return a scatter chart of a 2-D t-SNE projection.

        Args:
            n_iter: Number of optimisation iterations, forwarded as ``n_iter``.
                NOTE(review): scikit-learn 1.5 renamed this TSNE argument to
                ``max_iter``; confirm against the installed version.
            random_state: Forwarded to ``TSNE``.
            perplexity: Requested perplexity. It is capped at
                ``n_samples - 1`` because scikit-learn rejects a perplexity
                that is not smaller than the number of samples.
            **kwargs: Extra ``TSNE`` keyword arguments; ``verbose`` defaults
                to 1 but may be overridden.
        """
        embeddings = self.embeddings

        # Defaults go through kwargs so callers can override them without
        # triggering a duplicate-keyword TypeError.
        kwargs.setdefault('verbose', 1)
        effective_perplexity = min(perplexity, max(len(embeddings) - 1, 1))

        tsne = TSNE(
            n_components=2,
            random_state=random_state,
            n_iter=n_iter,
            perplexity=effective_perplexity,
            **kwargs,
        )
        coords = tsne.fit_transform(embeddings)

        return self._scatter(self._projection_frame(coords, ['t-SNE1', 't-SNE2']), 't-SNE')

    def umap(self, n_epochs=2000, random_state=None):
        """Return a scatter chart of a 2-D UMAP projection.

        ``UserWarning``s raised while fitting are suppressed.
        """
        # setting random_state will disable parallelization

        embeddings = self.embeddings

        # `action="ignore"` is required: without an action, catch_warnings
        # only saves/restores the filter state and suppresses nothing.
        with warnings.catch_warnings(action="ignore", category=UserWarning):
            reducer = umap.UMAP(
                n_components=2,
                random_state=random_state,
                n_epochs=n_epochs,
            )
            coords = reducer.fit_transform(embeddings)

        return self._scatter(self._projection_frame(coords, ['UMAP1', 'UMAP2']), 'UMAP')

def plot_dimensional_reduction_for_number_and_random_embeddings(model_id, pca=True, tsne=True, umap=True):
    """Plot 2-D projections of number-token embeddings next to random-token embeddings.

    For each enabled method, one row is produced with the number embeddings
    on the left and the random embeddings on the right; rows are stacked
    vertically in the order PCA, t-SNE, UMAP.

    Args:
        model_id: Model identifier used to load tokenizer and embeddings.
        pca: Include the PCA row.
        tsne: Include the t-SNE row.
        umap: Include the UMAP row.

    Returns:
        The vertically concatenated Altair chart.
    """
    loader = EmbeddingsLoader(model_id)

    number_embeddings = loader.numbers()
    # Derive the range in the title from the embeddings actually loaded;
    # tokenizers differ in how many numbers are single tokens.
    numeric_label = f'Number embeddings between 0-{len(number_embeddings) - 1}'

    numeric_analyzer = EmbeddingsAnalyzer(number_embeddings, model_id, numeric_label)
    random_analyzer = EmbeddingsAnalyzer(loader.random(), model_id, 'Random embeddings', color_scheme='plasma')
    # Only the extracted embeddings are needed from here on.
    del loader

    enabled_methods = [
        method
        for method, wanted in (('pca', pca), ('tsne', tsne), ('umap', umap))
        if wanted
    ]
    charts = [
        alt.hconcat(getattr(numeric_analyzer, method)(), getattr(random_analyzer, method)())
        for method in enabled_methods
    ]

    return alt.vconcat(*charts)

In [10]:
# OLMo 2 (7B): PCA, t-SNE and UMAP of number-token vs. random-token embeddings.
plot_dimensional_reduction_for_number_and_random_embeddings(model_ids['olmo'])

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.001s...
[t-SNE] Computed neighbors for 1000 samples in 0.128s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 1.965958
[t-SNE] KL divergence after 250 iterations with early exaggeration: 112.890503
[t-SNE] KL divergence after 1000 iterations: 2.122033
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1000 samples in 0.001s...
[t-SNE] Computed neighbors for 1000 samples in 0.101s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1000
[t-SNE] Mean sigma: 2.304992
[t-SNE] KL divergence after 250 iterations with early exaggeration: 121.525894
[t-SNE] KL divergence after 1000 iterations: 2.152879


In [11]:
# Llama 3.2 (3B Instruct): same three projections as for the previous model.
plot_dimensional_reduction_for_number_and_random_embeddings(model_ids['llama'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [None]:
# Phi-4 mini (Instruct): same three projections as for the previous models.
plot_dimensional_reduction_for_number_and_random_embeddings(model_ids['phi'])

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# plot_dimensional_reduction_for_number_and_random_embeddings(model_ids['qwen3-0.6b'])

NameError: name 'plot_pca_and_umap_for_numeric_and_random_embeddings' is not defined