# Numeric embedding analysis

In [1]:
import altair as alt
from embeddings_analysis import EmbeddingsLoader

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP

import warnings

# Lift Altair's row limit so charts built from large tables are not rejected.
alt.data_transformers.disable_max_rows()
# Render embedded charts with the dark theme. This is the last expression of
# the cell, so its return value is what gets echoed as the cell output.
alt.renderers.set_embed_options(theme="dark")

RendererRegistry.enable('default')

In [2]:
# Default model to analyse; the "# Parameters" cell that follows reassigns
# model_id, so this value is only used when that cell is absent.
model_id = "allenai/OLMo-2-1124-7B"

In [3]:
# Parameters
# NOTE(review): looks like a papermill-injected parameters cell; it replaces
# the default model_id assigned in the previous cell - confirm.
model_id = "meta-llama/Llama-3.2-3B-Instruct"


In [4]:
# Build the project's embeddings loader for the selected model.
loader = EmbeddingsLoader(model_id)
# Echoed as the cell output.
# NOTE(review): presumably the smallest number the tokenizer splits into
# several tokens - confirm against EmbeddingsLoader.
loader.smallest_multitoken_number()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

1000

In [5]:
# Load the number embeddings and a set of random embeddings; the random set is
# plotted next to the number set in every section below as a comparison.
# NOTE(review): the random set is said to contain 1000 embeddings - confirm in
# loader.random().

number_embeddings = loader.numbers()
random_embeddings = loader.random()

# Echoed as the cell output: shape of the number-embedding data.
number_embeddings.data.shape

(1000, 3072)

# Linear Dimensionality Reduction

## Principal Component Analysis

In [6]:
# Fit a PCA on each embedding set and show both projections side by side.
#
# PCA raises a ValueError when n_components exceeds min(n_samples, n_features),
# so the former hard-coded 1000 is now capped by the actual data shape. For the
# (1000, 3072) data shown above this still yields 1000 components; a model_id
# whose embedding set has fewer rows no longer breaks the cell.
# NOTE(review): assumes random_embeddings exposes the same `.data` attribute as
# number_embeddings - confirm.
number_pca = number_embeddings.dim_reduction(
    PCA(n_components=min(1000, *number_embeddings.data.shape))
)
random_pca = random_embeddings.dim_reduction(
    PCA(n_components=min(1000, *random_embeddings.data.shape))
)

# Independent colour scales: the two charts do not share one colour encoding.
alt.hconcat(number_pca.plot(), random_pca.plot()).resolve_scale(color="independent")

### Explained variance

In [7]:
# Variance overview of the PCA fitted on the number embeddings; the value of
# this last expression is displayed as the cell output.
number_pca.plot_variance_overview()

## Singular Value Decomposition

In [8]:
# Project both embedding sets onto 100 truncated-SVD components.
#
# TruncatedSVD uses a randomized solver by default, so it is seeded here with
# the same value (42) that the t-SNE and UMAP cells use; otherwise the
# components could change between runs of the notebook.
number_svd = number_embeddings.dim_reduction(
    TruncatedSVD(n_components=100, random_state=42)
)
random_svd = random_embeddings.dim_reduction(
    TruncatedSVD(n_components=100, random_state=42)
)

# Side-by-side projections, each chart with its own colour scale.
alt.hconcat(number_svd.plot(), random_svd.plot()).resolve_scale(color="independent")

In [9]:
# Digit overview for the SVD projection of the number embeddings; the value of
# this last expression is displayed as the cell output.
number_svd.plot_digit_overview()

### Correlation Heatmap

In [10]:
# Correlation heatmaps for the number and random SVD projections.
#
# random_svd is reused from the SVD cell above rather than refitted here: a
# second fit is redundant work, and a separately initialised randomized solver
# could make this heatmap describe a different decomposition than the scatter
# plot shown earlier.
# NOTE(review): 20 is presumably the number of leading components included in
# the heatmap - confirm in plot_correlation_heatmap.
alt.hconcat(
    number_svd.plot_correlation_heatmap(20),
    random_svd.plot_correlation_heatmap(20),
)

### Components with maximum correlation

In [11]:
# Plot the top correlated SVD components of the number embeddings.
# NOTE(review): what the components are correlated with is defined inside
# plot_top_correlated_components - presumably the numeric value of each token;
# confirm.
number_svd.plot_top_correlated_components()

# Non-Linear Dimensionality Reduction

## t-SNE

In [12]:
# One shared set of t-SNE hyper-parameters, so the number and the random
# embeddings are projected under identical settings.
tsne_kwargs = {
    "perplexity": 75,
    "max_iter": 3000,
    "learning_rate": 500,
    "early_exaggeration": 20,
    "random_state": 42,  # fixed seed for a repeatable layout
}

number_tsne = number_embeddings.dim_reduction(
    TSNE(**tsne_kwargs),
)
random_tsne = random_embeddings.dim_reduction(
    TSNE(**tsne_kwargs),
)

# Both projections next to each other, each with its own colour scale.
alt.hconcat(
    number_tsne.plot(),
    random_tsne.plot(),
).resolve_scale(color="independent")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
# Digit overview for the t-SNE projection of the number embeddings; the value
# of this last expression is displayed as the cell output.
number_tsne.plot_digit_overview()

## UMAP

### Cosine similarity

In [14]:
# UMAP settings shared by the number and the random embeddings.
umap_kwargs = {
    # Raised from the default of 15 to preserve more global structure.
    "n_neighbors": 50,
    # Lowered from the default of 0.1 for tighter local clusters.
    "min_dist": 0.05,
    "metric": "cosine",
    # Raised from the default of 1.0 to spread out the visualization.
    "spread": 1.5,
    # Raised to enhance local structure preservation.
    "local_connectivity": 2,
    "random_state": 42,
}

# Silence every warning emitted while the two projections are fitted; the
# previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    number_umap_cos = number_embeddings.dim_reduction(
        UMAP(**umap_kwargs),
    )
    random_umap_cos = random_embeddings.dim_reduction(
        UMAP(**umap_kwargs),
    )

# Both projections next to each other, each with its own colour scale.
alt.hconcat(
    number_umap_cos.plot(),
    random_umap_cos.plot(),
).resolve_scale(color="independent")

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [15]:
# Digit overview for the cosine-metric UMAP projection of the number
# embeddings; the value of this last expression is displayed as the cell output.
number_umap_cos.plot_digit_overview()

### Euclidean distance

In [16]:
# Switch the shared UMAP settings to Euclidean distance. umap_kwargs is
# modified in place, so it keeps metric="euclidean" after this cell has run.
umap_kwargs["metric"] = "euclidean"

# Silence every warning emitted while the two projections are fitted; the
# previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    number_umap_euc = number_embeddings.dim_reduction(
        UMAP(**umap_kwargs),
    )
    random_umap_euc = random_embeddings.dim_reduction(
        UMAP(**umap_kwargs),
    )

# Both projections next to each other, each with its own colour scale.
alt.hconcat(
    number_umap_euc.plot(),
    random_umap_euc.plot(),
).resolve_scale(color="independent")


In [17]:
# Digit overview for the Euclidean-metric UMAP projection of the number
# embeddings; the value of this last expression is displayed as the cell output.
number_umap_euc.plot_digit_overview()