In [1]:
import altair as alt
from embeddings_analysis import smallest_multitoken_number, plot_embeddings

# Turn off Altair's max-rows check so charts built from large datasets are not rejected.
alt.data_transformers.disable_max_rows()
# Ask the renderer to embed charts using the dark theme.
alt.renderers.set_embed_options(theme='dark')

RendererRegistry.enable('default')

# Byte-Pair Encoding numeric tokens

As a first approach, it would be interesting to see how numeric quantities are handled by the tokenizer. It seems that numbers from 0 to 999 each get their own token, while from 1000 onward a number is split into at least two tokens.

In [2]:
# Short alias -> model id string for every model compared in this notebook.
# Built from ordered pairs so the iteration order stays olmo, llama, phi.
model_ids = dict(
    [
        ("olmo", "allenai/OLMo-2-1124-7B"),
        ("llama", "meta-llama/Llama-3.2-3B-Instruct"),
        ("phi", "microsoft/Phi-4-mini-instruct"),
    ]
)

# Default model for the cells that work on a single model.
model_id = model_ids["olmo"]

In [3]:
# Map every model id to the value smallest_multitoken_number reports for it.
# NOTE(review): 1200 is presumably the upper bound of the search range — confirm
# against embeddings_analysis.smallest_multitoken_number.
# The comprehension variable is deliberately not called `model_id`, so it cannot be
# confused with the notebook-level `model_id` selected earlier.
smallest_multitokens = {
    repo_id: smallest_multitoken_number(repo_id, 1200)
    for repo_id in model_ids.values()
}

# Display the per-model results.
smallest_multitokens

{'allenai/OLMo-2-1124-7B': 1000,
 'meta-llama/Llama-3.2-3B-Instruct': 1000,
 'microsoft/Phi-4-mini-instruct': 1000}

In [4]:
# Plot the embeddings for the currently selected model (`model_id`).
# NOTE(review): the captured output of this cell contains huggingface/tokenizers
# fork warnings; they recommend explicitly setting the TOKENIZERS_PARALLELISM
# environment variable.
plot_embeddings(model_id)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [5]:
from embeddings_analysis import EmbeddingsLoader
from embeddings_svd import analyze_embeddings

def analyze_model(model_id):
    """Run the embeddings analysis on the number data of one model.

    Args:
        model_id: Model id string; it must be a key of the notebook-level
            ``smallest_multitokens`` mapping, otherwise a ``KeyError`` is raised.

    Returns:
        Whatever ``analyze_embeddings`` returns for the model's number data and
        its ``smallest_multitokens`` entry.
    """
    # The loader is only needed to extract the number data; using it as a
    # temporary drops the last reference to it before the analysis starts
    # (presumably to release the loaded model early — confirm).
    numbers = EmbeddingsLoader(model_id).numbers()
    return analyze_embeddings(numbers, smallest_multitokens[model_id])

# Analyze the OLMo model and keep the result for the display cells that follow.
plots = analyze_model(model_ids['olmo'])

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# Display the analysis result to see which charts are available and under which key.
plots

{'explained_variance': alt.Chart(...),
 'cumulative_variance': alt.LayerChart(...),
 'projection': alt.Chart(...),
 'consecutive_distances': alt.Chart(...),
 'component_patterns': alt.FacetChart(...),
 'correlation_heatmap': alt.Chart(...),
 'ones_digit': alt.Chart(...),
 'tens_digit': alt.Chart(...),
 'hundreds_digit': alt.Chart(...),
 'digit_length': alt.Chart(...),
 'special_numbers': alt.LayerChart(...)}

In [8]:
# Stack the explained-variance chart on top of the cumulative-variance chart.
alt.vconcat(plots['explained_variance'], plots['cumulative_variance'])

In [10]:
# Stack the projection chart above the per-digit and digit-length charts,
# in the order listed here.
alt.vconcat(
    *(
        plots[chart_key]
        for chart_key in (
            'projection',
            'ones_digit',
            'tens_digit',
            'hundreds_digit',
            'digit_length',
        )
    )
)

In [11]:
# Show the projection chart on its own.
plots['projection']

In [13]:
# Show the component-patterns chart.
plots['component_patterns']