# Numeric embedding analysis

In [None]:
import altair as alt
from embanalysis.duckdb_loader import DuckDBLoader
from embanalysis.constants import PROJECT_ROOT, HF_MODEL_ALIASES
from embanalysis.analyzer import EmbeddingsAnalyzer

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP

import warnings

# Lift Altair's default row limit for chart data.
alt.data_transformers.disable_max_rows()
# Pass theme="dark" as an embed option to the active renderer.
alt.renderers.set_embed_options(theme="dark")


In [None]:
# Run parameters for this notebook.
# Model identifier; the next cell resolves it through HF_MODEL_ALIASES.
model_id = "allenai/OLMo-2-1124-7B"
# Tag appended to the file name of every saved plot.
version = "v1"
# Directory that receives the saved plots (created on demand by save_plot).
plots_path = PROJECT_ROOT / "plots"
# When True, save_plot also returns the chart so the cell displays it.
display_plot = False

In [None]:
# Map a known alias to its target id; ids without an alias pass through unchanged.
model_id = HF_MODEL_ALIASES.get(model_id, model_id)
# Keep only the part after the last "/" for use in plot file names.
model_name = model_id.rsplit("/", 1)[-1]
# Running index that save_plot embeds in each file name.
counter = 0

def save_plot(name, plot):
    """Save ``plot`` as a numbered PNG inside ``plots_path``.

    The file is named ``{model_name}_{counter:02d}_{name}_{version}.png``.
    The module-level ``counter`` is incremented after every save, so files
    sort in the order the plots were produced.

    Args:
        name: Short label placed in the file name.
        plot: Chart object exposing a ``save(path)`` method.

    Returns:
        ``plot`` when the module-level ``display_plot`` flag is truthy (so it
        becomes the cell output), otherwise ``None``.
    """
    # The declaration has to come before the first use of ``counter``;
    # declaring it after the name was already read is a SyntaxError.
    global counter

    plots_path.mkdir(parents=True, exist_ok=True)
    plot.save(plots_path / f"{model_name}_{counter:02d}_{name}_{version}.png")
    counter += 1

    if display_plot:
        return plot
    return None

In [None]:
# Create the default DuckDBLoader with read_only=True.
loader = DuckDBLoader.default(read_only=True)

# Load samples for the specified model
samples = loader.get_model_samples(model_id)
# Bare expression: shown as the cell output in the notebook.
samples

In [None]:
# Create analyzers for number embeddings and random embeddings
# (built from the 'integers' and 'random' entries of `samples`).
number_analyzer = EmbeddingsAnalyzer.from_sample(samples['integers'])
random_analyzer = EmbeddingsAnalyzer.from_sample(samples['random'])

# Report the shape of each analyzer's embeddings_df.
print(f"Number embeddings shape: {number_analyzer.embeddings_df.shape}")
print(f"Random embeddings shape: {random_analyzer.embeddings_df.shape}")
# Bare expression: preview of the number embeddings shown as the cell output.
number_analyzer.embeddings_df.head()

# Linear Dimensionality Reduction

## Principal Component Analysis

In [None]:
# Fit PCA on each embedding set: 1000 components for the number embeddings,
# 100 for the random embeddings.
number_pca = number_analyzer.run_estimator(PCA(n_components=1000))
random_pca = random_analyzer.run_estimator(PCA(n_components=100))

# Gradient plots side by side (number first, random second), each keeping
# its own colour scale.
plot = alt.hconcat(
    *(
        result.plot_components(plot_type="gradient")
        for result in (number_pca, random_pca)
    )
).resolve_scale(color="independent")

save_plot("pca_components_gradient", plot)

### Explained variance

In [None]:
# Variance overview chart for the PCA fitted on the number embeddings.
plot = number_pca.plot_variance_overview()
# Last expression of the cell: displays the chart when display_plot is True.
save_plot("pca_variance_overview", plot)

## Singular Value Decomposition

In [None]:
def colstack(*charts, columns=2):
    """Arrange ``charts`` in a grid with ``columns`` charts per row.

    Consecutive groups of ``columns`` charts are concatenated horizontally,
    each row resolving its colour scale independently, and the rows are then
    concatenated vertically. The last row may hold fewer charts.

    Args:
        *charts: Charts to lay out, in reading order.
        columns: Number of charts per row.

    Returns:
        The vertical concatenation of the rows.
    """
    grid_rows = [
        alt.hconcat(*charts[start:start + columns]).resolve_scale(color="independent")
        for start in range(0, len(charts), columns)
    ]
    return alt.vconcat(*grid_rows)


In [None]:
# Fit a 100-component truncated SVD on each embedding set.
# TruncatedSVD's default solver is randomized, so the seed is fixed to keep
# the saved plots reproducible, in line with the seeded t-SNE and UMAP cells.
number_svd = number_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)
random_svd = random_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)

# Gradient plots side by side, each keeping its own colour scale.
plot = alt.hconcat(
    number_svd.plot_components(plot_type="gradient"),
    random_svd.plot_components(plot_type="gradient"),
).resolve_scale(color="independent")

save_plot("svd_components_gradient", plot)

In [None]:
# Digit-based colourings of the first two SVD components of the number
# embeddings, laid out as two rows of two charts.
top_row = (
    number_svd.plot_components(plot_type="digit_length", x_component=0, y_component=1)
    | number_svd.plot_components(plot_type="digit", x_component=0, y_component=1, digit_position=2)
)
bottom_row = (
    number_svd.plot_components(plot_type="digit", x_component=0, y_component=1, digit_position=1)
    | number_svd.plot_components(plot_type="digit", x_component=0, y_component=1, digit_position=0)
)
# Colour scales are resolved independently at the level of the stacked rows.
plot = alt.vconcat(top_row, bottom_row).resolve_scale(color="independent")

save_plot("svd_digit_visualizations", plot)

### Correlation Heatmap

In [None]:
# Compare correlation heatmaps between number and random embeddings.
# Both SVD results come from the Singular Value Decomposition cell above;
# refitting random_svd here would repeat the work and, because the solver is
# randomized, could describe a different fit than the gradient plot.
plot = alt.hconcat(
    number_svd.plot_correlation_heatmap(20),
    random_svd.plot_correlation_heatmap(20),
)

save_plot("svd_correlation_heatmap", plot)

### Components with maximum correlation

In [None]:
# NOTE(review): the PCA plot is driven by a correlation table computed from
# the SVD result — confirm this pairing is intended rather than
# number_pca.top_correlations_df(10).
svd_top_correlations = number_svd.top_correlations_df(10)
plot = number_pca.plot_top_correlated_components(
    n_vectors=10,
    corr_df=svd_top_correlations,
)
save_plot("pca_top_correlated_components", plot)

In [None]:
# Top correlated components of the SVD result, limited to 10 vectors.
plot = number_svd.plot_top_correlated_components(n_vectors=10)
# Last expression of the cell: displays the chart when display_plot is True.
save_plot("svd_top_correlated_components", plot)

# Non-Linear Dimensionality Reduction

## t-SNE

In [None]:
# Shared t-SNE settings; the fixed random_state keeps both runs repeatable.
tsne_kwargs = {
    "perplexity": 75,
    "max_iter": 3000,
    "learning_rate": 50,
    "early_exaggeration": 20,
    "random_state": 42,
}

# Run t-SNE with identical settings on the number and random embeddings.
number_tsne = number_analyzer.run_estimator(TSNE(**tsne_kwargs))
random_tsne = random_analyzer.run_estimator(TSNE(**tsne_kwargs))

# Gradient plots side by side, each keeping its own colour scale.
plot = alt.hconcat(
    *(
        result.plot_components(plot_type="gradient")
        for result in (number_tsne, random_tsne)
    )
).resolve_scale(color="independent")

save_plot("tsne_components_gradient", plot)

In [None]:
# Digit-based colourings of the t-SNE projection: digit length first, then
# digit positions 2, 1 and 0, arranged in a two-column grid.
plot = colstack(
    number_tsne.plot_components(plot_type="digit_length", x_component=0, y_component=1),
    *(
        number_tsne.plot_components(
            plot_type="digit", x_component=0, y_component=1, digit_position=position
        )
        for position in (2, 1, 0)
    ),
)

save_plot("tsne_digit_visualizations", plot)

## UMAP

### Cosine similarity

In [None]:
umap_kwargs = {
    # Increase from default 15 to preserve more global structure
    "n_neighbors": 50,
    # Decrease from default 0.1 for tighter local clusters
    "min_dist": 0.05,
    "metric": "cosine",
    # Increase from default 1.0 to spread out the visualization
    "spread": 1.5,
    # Increase to enhance local structure preservation
    "local_connectivity": 2,
    "random_state": 42,
}

# Silence every warning raised while the two UMAP fits run; the previous
# warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_cos = number_analyzer.run_estimator(UMAP(**umap_kwargs))
    random_umap_cos = random_analyzer.run_estimator(UMAP(**umap_kwargs))

# Gradient plots side by side, each keeping its own colour scale.
plot = alt.hconcat(
    *(
        result.plot_components(plot_type="gradient")
        for result in (number_umap_cos, random_umap_cos)
    )
).resolve_scale(color="independent")

save_plot("umap_cosine_components_gradient", plot)

In [None]:
# Digit-based colourings of the cosine UMAP projection: digit length first,
# then digit positions 2, 1 and 0, arranged in a two-column grid.
plot = colstack(
    number_umap_cos.plot_components(plot_type="digit_length", x_component=0, y_component=1),
    *(
        number_umap_cos.plot_components(
            plot_type="digit", x_component=0, y_component=1, digit_position=position
        )
        for position in (2, 1, 0)
    ),
)

save_plot("umap_cosine_digit_visualizations", plot)

### Euclidean distance

In [None]:
# Reuse the UMAP settings from the cosine run, switching only the metric
# (this modifies the shared umap_kwargs dict in place).
umap_kwargs["metric"] = "euclidean"

# Silence every warning raised while the two UMAP fits run.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_euc = number_analyzer.run_estimator(UMAP(**umap_kwargs))
    random_umap_euc = random_analyzer.run_estimator(UMAP(**umap_kwargs))

# Gradient plots side by side, each keeping its own colour scale.
plot = alt.hconcat(
    *(
        result.plot_components(plot_type="gradient")
        for result in (number_umap_euc, random_umap_euc)
    )
).resolve_scale(color="independent")

save_plot("umap_euclidean_components_gradient", plot)

In [None]:
# Digit-based colourings of the Euclidean UMAP projection: digit length
# first, then digit positions 2, 1 and 0, arranged in a two-column grid.
# resolve_legend() is called on every chart and the finished grid resolves
# its colour scale independently.
plot = colstack(
    number_umap_euc.plot_components(
        plot_type="digit_length", x_component=0, y_component=1
    ).resolve_legend(),
    *(
        number_umap_euc.plot_components(
            plot_type="digit", x_component=0, y_component=1, digit_position=position
        ).resolve_legend()
        for position in (2, 1, 0)
    ),
).resolve_scale(color="independent")

save_plot("umap_euclidean_digit_visualizations", plot)