In [43]:
from textwrap import dedent

import altair as alt
import numpy as np
import pandas as pd
import torch

from IPython.display import display, Markdown
from transformers import AutoModel, AutoTokenizer

def displaymd(text):
    """Render ``text`` as Markdown in the cell output.

    Common leading indentation is stripped first (via ``textwrap.dedent``) so
    callers can pass indented triple-quoted strings.
    """
    rendered = Markdown(dedent(text))
    display(rendered)

# Ask Altair's renderer to embed every chart in this notebook with the 'dark' theme.
alt.renderers.set_embed_options(theme='dark')

RendererRegistry.enable('default')

# Byte-Pair Encoding numeric tokens

As a first step, it is interesting to see how numeric quantities are handled by the model's tokenizer. It seems that the numbers from 0 to 999 each get their own token, while larger numbers are split into at least two tokens.

In [44]:
# Hugging Face Hub identifiers of the checkpoints under consideration.
model_ids = [
    "allenai/OLMo-2-1124-7B",
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/Phi-4-mini-instruct",
    "deepseek-ai/DeepSeek-R1",
]

# The cells below run against a single checkpoint; change the index to switch.
model_id = model_ids[0]

In [98]:
def smallest_multitoken_number(model_id, upper_limit=1200):
    """Find the first integer the tokenizer splits into more than one token.

    Args:
        model_id: identifier passed to ``AutoTokenizer.from_pretrained``.
        upper_limit: exclusive upper bound of the integers tried, starting at 0.

    Returns:
        The smallest ``num`` in ``range(upper_limit)`` for which
        ``tokenizer.tokenize(str(num))`` yields more than one token, or
        ``None`` when no such number exists below ``upper_limit``.
    """
    tok = AutoTokenizer.from_pretrained(model_id)

    # Lazy scan in ascending order: stops at the first multi-token number.
    multitoken_numbers = (
        candidate
        for candidate in range(upper_limit)
        if len(tok.tokenize(str(candidate))) > 1
    )
    return next(multitoken_numbers, None)

smallest_multitoken = smallest_multitoken_number(model_id)

# smallest_multitoken_number returns None when no number below its upper limit
# is split into several tokens; `None - 1` would raise a TypeError, so that
# case is reported separately.
if smallest_multitoken is None:
    displaymd("""
    In the range of numbers 0-1200, every number is represented by a single token.
    """)
else:
    displaymd(f"""
    In the range of numbers 0-1200, the numbers from {smallest_multitoken} onwards are split into multiple tokens.

    The numbers between 0 and {smallest_multitoken - 1} get represented by a single token (very likely to be hardcoded and not a result of BPE).
    """)



In the range of numbers 0-1200, the numbers from 1000 onwards are split into multiple tokens.

The numbers between 0 and 999 get represented by a single token (very likely to be hardcoded and not a result of BPE).


In [99]:
def load_embeddings(model_id):
    """Load a model and return only its input-embedding module.

    Args:
        model_id: identifier passed to ``AutoModel.from_pretrained``.

    Returns:
        The module returned by ``model.get_input_embeddings()``.
    """
    model = AutoModel.from_pretrained(model_id)
    model.eval()
    # get_input_embeddings() is the architecture-independent accessor; reading
    # `model.embed_tokens` directly only works for model classes that happen to
    # name the attribute that way.
    # The local `model` reference is dropped when the function returns, so no
    # explicit `del` is needed.
    return model.get_input_embeddings()

class EmbeddingsExtractor:
    """Looks up input-embedding vectors for a model, by token id or by word.

    Attributes (set in ``__init__``):
        tokenizer: result of ``AutoTokenizer.from_pretrained(model_id)``.
        embeddings: result of ``load_embeddings(model_id)``. The methods below
            call it with a tensor of token ids and read its ``num_embeddings``
            attribute.
    """

    def __init__(self, model_id):
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.embeddings = load_embeddings(model_id)

    def from_tokens(self, tokens):
        """Return the embeddings for a tensor of token ids, without tracking gradients.

        All size-1 dimensions of the result are removed by ``squeeze()``, so a
        ``(N, 1)`` id tensor yields an ``(N, dim)`` result. Note that a single
        id therefore yields a 1-D vector.
        """
        with torch.no_grad():
            # Call the module itself rather than `.forward` so that any
            # registered hooks run as usual.
            extracted_embeddings = self.embeddings(tokens).squeeze()
        return extracted_embeddings

    def from_words(self, words):
        """Tokenize ``words`` and return the embeddings of the resulting ids.

        No padding is requested, so every word must tokenize to the same
        number of ids for the tokenizer to build a tensor.
        """
        tokenized = self.tokenizer(
            words,
            # Without this, tokenizers that prepend/append special tokens
            # (e.g. a BOS id) return extra ids per word, and those tokens'
            # embeddings end up mixed into the result.
            add_special_tokens=False,
            return_attention_mask=False,
            return_tensors="pt",
        )["input_ids"]
        return self.from_tokens(tokenized)

    def numbers(self, upper_limit=1000):
        """Return the embeddings of the decimal strings ``"0"`` .. ``str(upper_limit - 1)``."""
        return self.from_words([str(i) for i in range(upper_limit)])

    def random(self, n=1000, seed=1234):
        """Return the embeddings of ``n`` token ids drawn uniformly from the vocabulary.

        The draw is reproducible for a given ``seed`` and uses a private
        generator, so the global torch RNG state is left untouched.
        """
        generator = torch.Generator().manual_seed(seed)
        random_tokens = torch.randint(
            0, self.embeddings.num_embeddings, (n,), generator=generator
        )
        return self.from_tokens(random_tokens)


In [104]:
import umap
from sklearn.decomposition import PCA


class EmbeddingsAnalyzer:
    """Builds 2-D scatter charts (PCA, UMAP) for a set of embedding vectors.

    Args:
        embeddings: the vectors to project; passed unchanged to
            ``PCA.fit_transform`` / ``UMAP.fit_transform``
            (presumably one row per embedding -- confirm against the caller).
        model_id: model identifier, used only in chart titles.
        label: human-readable description, used only in chart titles.
        color_scheme: Altair colour scheme name applied to the points.
    """

    def __init__(self, embeddings, model_id, label, color_scheme='viridis'):
        self.embeddings = embeddings
        self.model_id = model_id
        self.label = label
        self.color_scheme = color_scheme

    def __str__(self):
        return f'({self.model_id}) {self.label}'

    @staticmethod
    def _projection_frame(projection, columns):
        """Wrap a 2-D projection in a DataFrame and add a 0-based ``number`` row index."""
        frame = pd.DataFrame(projection, columns=columns)
        # Derived from the data instead of a fixed range(1000), so any number
        # of embeddings can be plotted.
        frame['number'] = range(len(frame))
        return frame

    def _scatter(self, frame, x, y, title, **properties):
        """Build the interactive scatter chart shared by ``pca`` and ``umap``."""
        return alt.Chart(frame).mark_circle(size=60).encode(
            x=f'{x}:Q',
            y=f'{y}:Q',
            color=alt.Color('number:Q', scale=alt.Scale(scheme=self.color_scheme)),
            tooltip=['number:Q', f'{x}:Q', f'{y}:Q'],
        ).properties(
            title=title,
            width='container',
            **properties,
        ).interactive()

    def pca(self):
        """Return a chart of the embeddings projected onto their first two principal components."""
        pca_result = PCA(n_components=2).fit_transform(self.embeddings)
        pca_df = self._projection_frame(pca_result, ['PC1', 'PC2'])
        return self._scatter(pca_df, 'PC1', 'PC2', f'{self}: PCA', height=400)

    def umap(self, n_epochs=2000, random_state=125):
        """Return a chart of the embeddings projected to two dimensions with UMAP.

        Args:
            n_epochs: forwarded to ``umap.UMAP``.
            random_state: forwarded to ``umap.UMAP``.
        """
        # Inside the method body `umap` resolves to the imported module, not to
        # this method: class attributes are not visible as bare names here.
        reducer = umap.UMAP(
            n_components=2,
            random_state=random_state,
            n_epochs=n_epochs,
        )
        umap_result = reducer.fit_transform(self.embeddings)
        umap_df = self._projection_frame(umap_result, ['UMAP1', 'UMAP2'])
        return self._scatter(umap_df, 'UMAP1', 'UMAP2', f'{self}: UMAP')
    
# One extractor (tokenizer + embedding module) feeds both analyzers.
extractor = EmbeddingsExtractor(model_id)

numeric_analyzer = EmbeddingsAnalyzer(
    embeddings=extractor.numbers(),
    model_id=model_id,
    label='Number embeddings between 0-999',
)
random_analyzer = EmbeddingsAnalyzer(
    embeddings=extractor.random(),
    model_id=model_id,
    label='Random embeddings',
)

# Side-by-side PCA charts: number tokens first, random tokens second.
alt.hconcat(*(analyzer.pca() for analyzer in (numeric_analyzer, random_analyzer)))

Loading checkpoint shards: 100%|██████████| 6/6 [00:00<00:00, 56.35it/s]


In [96]:
# Side-by-side UMAP charts: number tokens first, random tokens second.
alt.hconcat(*(analyzer.umap() for analyzer in (numeric_analyzer, random_analyzer)))

  warn(
  warn(
