In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import altair as alt
from embeddings_analysis import EmbeddingsLoader

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP

# Remove Altair's row limit on chart data.
alt.data_transformers.disable_max_rows()
# Pass the dark theme as an embed option to the active renderer.
alt.renderers.set_embed_options(theme="dark")

# Numeric embedding analysis

In [None]:
# Identifier of the model whose embeddings are analysed; passed to EmbeddingsLoader.
model_id = "allenai/OLMo-2-1124-7B"

In [None]:
# Build the embeddings loader for the selected model.
loader = EmbeddingsLoader(model_id)
# Shown as the cell output. Presumably the smallest number that the tokenizer
# splits into more than one token — confirm in embeddings_analysis.
loader.smallest_multitoken_number()

In [None]:
# Load the number embeddings and a set of random embeddings used as a comparison baseline.
# NOTE(review): the random set is described as 1000 embeddings, but no count is
# passed here, so it would have to be loader.random()'s default — confirm.

number_embeddings = loader.numbers()
random_embeddings = loader.random()

# Display the shape of the number-embedding data as the cell output.
number_embeddings.data.shape

# Linear Dimensionality Reduction

## PCA

In [None]:
# Fit an independent 1000-component PCA on each embedding set
# (number embeddings first, then the random baseline).
number_pca, random_pca = (
    embeddings.dim_reduction(PCA(n_components=1000))
    for embeddings in (number_embeddings, random_embeddings)
)

# Show both projections side by side; each chart resolves its own colour scale.
alt.hconcat(
    number_pca.plot(),
    random_pca.plot(),
).resolve_scale(color="independent")

## Explained variance

In [None]:
# Variance overview for the PCA fitted on the number embeddings (cell output).
number_pca.plot_variance_overview()

## SVD

In [None]:
# TruncatedSVD uses a randomized solver by default, so the seed is fixed
# (same value as the t-SNE settings) to keep the components reproducible
# across kernel restarts.
number_svd = number_embeddings.dim_reduction(
    TruncatedSVD(n_components=100, random_state=42)
)
random_svd = random_embeddings.dim_reduction(
    TruncatedSVD(n_components=100, random_state=42)
)

# Side-by-side comparison; each chart resolves its own colour scale.
alt.hconcat(number_svd.plot(), random_svd.plot()).resolve_scale(color="independent")

In [None]:
# Digit overview plot for the SVD projection of the number embeddings (cell output).
number_svd.plot_digit_overview()

In [None]:
# Reuse number_svd / random_svd from the SVD cell above rather than refitting,
# so the heatmaps describe the same projections that were plotted there.
# The argument 20 is passed straight to plot_correlation_heatmap
# (presumably the number of components shown — confirm in embeddings_analysis).
alt.hconcat(
    number_svd.plot_correlation_heatmap(20),
    random_svd.plot_correlation_heatmap(20),
)

In [None]:
# Plot the top correlated components of the number-embedding SVD (cell output);
# what the components are correlated against is defined in embeddings_analysis.
number_svd.plot_top_correlated_components()

# Non-Linear Dimensionality Reduction

In [None]:
# Shared t-SNE settings; the fixed random_state keeps both layouts reproducible.
tsne_kwargs = {
    "perplexity": 75,
    "max_iter": 3000,
    "learning_rate": 500,
    "early_exaggeration": 20,
    "random_state": 42,
}

# Each embedding set gets its own TSNE estimator built from the same settings.
number_tsne = number_embeddings.dim_reduction(TSNE(**tsne_kwargs))
random_tsne = random_embeddings.dim_reduction(TSNE(**tsne_kwargs))

# Side-by-side comparison; each chart resolves its own colour scale.
alt.hconcat(
    number_tsne.plot(),
    random_tsne.plot(),
).resolve_scale(color="independent")


In [None]:
# Digit overview plot for the t-SNE projection of the number embeddings (cell output).
number_tsne.plot_digit_overview()

In [None]:

# UMAP is stochastic; fix the seed (same value as the t-SNE settings) so the
# layout is reproducible across kernel restarts.
umap_kwargs = dict(random_state=42)

# Keep the UMAP projections under their own names so the t-SNE results are not
# overwritten and earlier t-SNE cells stay valid when re-run.
number_umap = number_embeddings.dim_reduction(UMAP(**umap_kwargs))
random_umap = random_embeddings.dim_reduction(UMAP(**umap_kwargs))

# Side-by-side comparison; each chart resolves its own colour scale.
alt.hconcat(number_umap.plot(), random_umap.plot()).resolve_scale(color="independent")
