# Numeric embedding analysis

In [1]:
import warnings

import altair as alt
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP

from embanalysis.analyzer import EmbeddingsAnalyzer
from embanalysis.constants import PROJECT_ROOT, HF_MODEL_ALIASES
from embanalysis.duckdb_loader import DuckDBLoader
from embanalysis.feature_analysis import make_sequences

# Lift Altair's row limit so the larger embedding frames can be charted.
alt.data_transformers.disable_max_rows()
# Ask the renderer to embed charts with the dark theme.
alt.renderers.set_embed_options(theme="dark")

RendererRegistry.enable('default')

In [2]:
# Model whose embeddings are analysed; looked up in HF_MODEL_ALIASES in the next cell.
model_id = "allenai/OLMo-2-1124-7B"
# Suffix appended to every saved plot file name.
version = "v1"
# Directory that receives the PNG exports.
plots_path = PROJECT_ROOT / "plots"
# Directory that receives the SVG exports.
svg_plots_path = PROJECT_ROOT / "thesis" / "src" / "plots"
# When True, register_plot returns the chart so the calling cell displays it.
enable_display_plot = True
# When True, register_plot writes each chart to both directories above.
enable_save_plot = False

In [3]:
# Look model_id up in HF_MODEL_ALIASES; ids without an entry pass through unchanged.
model_id = HF_MODEL_ALIASES.get(model_id, model_id)
# Keep only the part after the last "/" for use in plot file names.
model_name = model_id.rsplit("/", 1)[-1]
# Running index that numbers saved plots in the order they are registered.
counter = 0


def register_plot(name, plot):
    """Optionally save ``plot`` to disk and optionally return it for display.

    When ``enable_save_plot`` is true, both output directories are created if
    missing and the plot is written as SVG (to ``svg_plots_path``) and PNG (to
    ``plots_path``) under the name
    ``{model_name}_{counter:02d}_{name}_{version}``; the module-level
    ``counter`` is then incremented.

    Args:
        name: Short label embedded in the saved file name.
        plot: Object exposing ``save(path)``.

    Returns:
        ``plot`` when ``enable_display_plot`` is true, otherwise ``None``.
    """
    global counter

    if enable_save_plot:
        plots_path.mkdir(parents=True, exist_ok=True)
        svg_plots_path.mkdir(parents=True, exist_ok=True)

        stem = f"{model_name}_{counter:02d}_{name}_{version}"
        svg_target = svg_plots_path / f"{stem}.svg"
        png_target = plots_path / f"{stem}.png"

        plot.save(svg_target)
        plot.save(png_target)
        # Only advance the index once both exports have succeeded.
        counter += 1

    return plot if enable_display_plot else None

In [4]:
# Open the default loader in read-only mode.
loader = DuckDBLoader.default(read_only=True)

# Load samples for the specified model
samples = loader.get_model_samples(model_id)
samples

{'integers': EmbeddingsSample(sample_id=1, meta=IntegerSampleMeta(model_id='allenai/OLMo-2-1124-7B', tag='integers')),
 'random': EmbeddingsSample(sample_id=2, meta=RandomSampleMeta(model_id='allenai/OLMo-2-1124-7B', sample_size=1000, seed=1234, tag='random'))}

In [5]:
# Build one analyzer per sample: the integer tokens and the random baseline.
number_analyzer = EmbeddingsAnalyzer.from_sample(samples["integers"])
random_analyzer = EmbeddingsAnalyzer.from_sample(samples["random"])

# Report both frame shapes, then preview the number embeddings.
print(
    f"Number embeddings shape: {number_analyzer.embeddings_df.shape}",
    f"Random embeddings shape: {random_analyzer.embeddings_df.shape}",
    sep="\n",
)
number_analyzer.embeddings_df.head()

Number embeddings shape: (1000, 4098)
Random embeddings shape: (1000, 4098)


Unnamed: 0,token_id,token,embeddings_0,embeddings_1,embeddings_2,embeddings_3,embeddings_4,embeddings_5,embeddings_6,embeddings_7,...,embeddings_4086,embeddings_4087,embeddings_4088,embeddings_4089,embeddings_4090,embeddings_4091,embeddings_4092,embeddings_4093,embeddings_4094,embeddings_4095
0,15,0,0.011222,-0.053304,0.027681,0.017035,-0.031508,0.035597,0.005884,0.12104,...,-0.006816,0.008134,0.011557,0.02093,0.007472,0.022228,-0.01271,-0.024415,-0.009979,0.060893
1,16,1,0.005024,-0.017246,0.015205,0.030968,-0.001669,-0.000502,-0.010488,0.097102,...,-0.032912,0.035343,0.021819,-0.0145,-0.030072,0.020858,-0.019381,0.000477,-0.049117,0.033351
2,17,2,0.007985,0.060892,0.027107,0.050701,0.017511,0.019558,0.004031,0.089646,...,0.017936,0.009002,0.019765,-0.03735,0.028122,0.032685,0.031882,0.008905,-0.032327,0.031943
3,18,3,0.003406,-0.001149,0.021003,0.032835,0.038525,0.012474,0.004353,0.155187,...,0.05186,0.017438,-0.025527,-0.012722,0.047611,0.000185,-0.004836,0.014018,-0.008631,-0.007731
4,19,4,-0.0083,0.044511,0.022762,0.068484,-0.01854,0.031693,0.005468,0.177626,...,0.033702,0.042859,-0.011499,-0.035958,-0.034807,-0.011618,-0.022247,0.008732,0.022598,0.029843


# Linear Dimensionality Reduction

## Principal Component Analysis

In [None]:
# Fit PCA on both samples. random_state pins the randomized SVD solver that
# scikit-learn may pick automatically for the 100-component fit, so the
# projection is reproducible across kernel restarts (same seed as the t-SNE and
# UMAP cells further down).
number_pca = number_analyzer.run_estimator(PCA(n_components=1000, random_state=42))
random_pca = random_analyzer.run_estimator(PCA(n_components=100, random_state=42))

# Number embeddings on the left, random baseline on the right; each panel keeps
# its own colour scale.
plot = alt.hconcat(
    number_pca.plot.components(plot_type="gradient"),
    random_pca.plot.components(plot_type="gradient"),
).resolve_scale(color="independent")

register_plot("pca_components_gradient", plot)

### Explained variance

In [None]:
# Variance overview for the PCA fitted on the number embeddings.
plot = number_pca.plot.variance_overview()
register_plot("pca_variance_overview", plot)

## Singular Value Decomposition

In [None]:
# TruncatedSVD uses a randomized algorithm by default; seeding it makes the
# decomposition (and every plot derived from it) reproducible across runs,
# matching the seed used for t-SNE and UMAP further down.
number_svd = number_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)
random_svd = random_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)

# Number embeddings on the left, random baseline on the right; each panel keeps
# its own colour scale.
plot = alt.hconcat(
    number_svd.plot.components(plot_type="gradient"),
    random_svd.plot.components(plot_type="gradient"),
).resolve_scale(color="independent")

register_plot("svd_components_gradient", plot)

In [None]:
# Side-by-side "digit_length" and "digit" component plots of the number
# embeddings over SVD components 0 and 1.
svd_length_panel = number_svd.plot.components(
    plot_type="digit_length", x_component=0, y_component=1
)
svd_digit_panel = number_svd.plot.components(
    plot_type="digit", x_component=0, y_component=1, digit_position=2
)
plot = alt.hconcat(svd_length_panel, svd_digit_panel).resolve_scale(
    color="independent"
)

register_plot("svd_digit_visualizations", plot)

### Correlation Heatmap

In [None]:
# Compare correlation heatmaps between number and random embeddings.
# Reuses the number_svd / random_svd results fitted in the SVD cell above, so
# both SVD figures describe the same decomposition and no fit is repeated.
plot = alt.hconcat(
    number_svd.plot.correlation_heatmap(20),
    random_svd.plot.correlation_heatmap(20),
)

register_plot("svd_correlation_heatmap", plot)

### Components with maximum correlation

In [None]:
# Plot the PCA components selected by a top-correlations table.
# NOTE(review): the table passed as corr_df comes from number_svd, not
# number_pca, although the plot is saved as "pca_top_correlated_components".
# Confirm this cross-use is intentional; if not, the PCA result's own
# top_correlations_df(10) is presumably what was meant.
plot = number_pca.plot.top_correlated_components(
    n_vectors=10, corr_df=number_svd.top_correlations_df(10)
)
register_plot("pca_top_correlated_components", plot)

In [None]:
# Top correlated components for the SVD of the number embeddings (10 vectors).
plot = number_svd.plot.top_correlated_components(n_vectors=10)
register_plot("svd_top_correlated_components", plot)

# Non-Linear Dimensionality Reduction

## t-SNE

In [None]:
# Shared t-SNE settings so both samples are embedded under identical conditions.
tsne_kwargs = {
    "perplexity": 75,
    "max_iter": 3000,
    "learning_rate": 50,
    "early_exaggeration": 20,
    "random_state": 42,
}

# Fit a fresh t-SNE estimator per sample.
number_tsne = number_analyzer.run_estimator(TSNE(**tsne_kwargs))
random_tsne = random_analyzer.run_estimator(TSNE(**tsne_kwargs))

# Number embeddings first, random baseline second; independent colour scales.
plot = alt.hconcat(
    *(
        result.plot.components(plot_type="gradient")
        for result in (number_tsne, random_tsne)
    )
).resolve_scale(color="independent")

register_plot("tsne_components_gradient", plot)

In [None]:
# t-SNE digit visualizations: "digit_length" and "digit" plots side by side,
# each with its own colour legend.
tsne_length_panel = number_tsne.plot.components(
    plot_type="digit_length", x_component=0, y_component=1
)
tsne_digit_panel = number_tsne.plot.components(
    plot_type="digit", x_component=0, y_component=1, digit_position=2
)
plot = alt.hconcat(tsne_length_panel, tsne_digit_panel).resolve_legend(
    color="independent"
)

register_plot("tsne_digit_visualizations", plot)

## UMAP

### Cosine similarity

In [None]:
# UMAP settings shared by both samples (cosine metric).
umap_kwargs = {
    # Increase from default 15 to preserve more global structure
    "n_neighbors": 50,
    # Decrease from default 0.1 for tighter local clusters
    "min_dist": 0.05,
    "metric": "cosine",
    # Increase from default 1.0 to spread out the visualization
    "spread": 1.5,
    # Increase to enhance local structure preservation
    "local_connectivity": 2,
    "random_state": 42,
}

# Silence all warnings raised while the two UMAP models are fitted; the
# previous warning filters are restored when the block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_cos = number_analyzer.run_estimator(UMAP(**umap_kwargs))
    random_umap_cos = random_analyzer.run_estimator(UMAP(**umap_kwargs))

# Number embeddings first, random baseline second; independent colour scales.
plot = alt.hconcat(
    *(
        result.plot.components(plot_type="gradient")
        for result in (number_umap_cos, random_umap_cos)
    )
).resolve_scale(color="independent")

register_plot("umap_cosine_components_gradient", plot)

In [None]:
# UMAP cosine digit visualizations: "digit_length" and "digit" plots side by
# side, each with its own colour legend.
umap_cos_length_panel = number_umap_cos.plot.components(
    plot_type="digit_length", x_component=0, y_component=1
)
umap_cos_digit_panel = number_umap_cos.plot.components(
    plot_type="digit", x_component=0, y_component=1, digit_position=2
)
plot = alt.hconcat(umap_cos_length_panel, umap_cos_digit_panel).resolve_legend(
    color="independent"
)

register_plot("umap_cosine_digit_visualizations", plot)

### Euclidean distance

In [None]:
# Derive the Euclidean settings as a copy instead of mutating umap_kwargs in
# place: the shared dict keeps its cosine configuration, so its meaning no
# longer depends on which cells have already been executed.
umap_euclidean_kwargs = {**umap_kwargs, "metric": "euclidean"}

# Silence all warnings raised while the two UMAP models are fitted.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_euc = number_analyzer.run_estimator(UMAP(**umap_euclidean_kwargs))
    random_umap_euc = random_analyzer.run_estimator(UMAP(**umap_euclidean_kwargs))

# Number embeddings on the left, random baseline on the right; each panel keeps
# its own colour scale.
plot = alt.hconcat(
    number_umap_euc.plot.components(plot_type="gradient"),
    random_umap_euc.plot.components(plot_type="gradient"),
).resolve_scale(color="independent")

register_plot("umap_euclidean_components_gradient", plot)

In [None]:
# UMAP Euclidean digit visualizations: "digit_length" and "digit" plots side by
# side, each with its own colour scale.
umap_euc_length_panel = number_umap_euc.plot.components(
    plot_type="digit_length", x_component=0, y_component=1
)
umap_euc_digit_panel = number_umap_euc.plot.components(
    plot_type="digit", x_component=0, y_component=1, digit_position=2
)
plot = alt.hconcat(umap_euc_length_panel, umap_euc_digit_panel).resolve_scale(
    color="independent"
)

register_plot("umap_euclidean_digit_visualizations", plot)

# Feature Analysis

In [10]:
# Display the feature-to-sequence analysis table; judging by the rendered
# output it has one row per (Dimension, Property) pair with Correlation,
# P_Value and Abs_Correlation columns.
number_analyzer.feature_to_sequence_analysis_df()

Unnamed: 0,Dimension,Property,Correlation,P_Value,Abs_Correlation
0,514,log/direct,-0.672870,8.446593e-133,0.672870
1,514,numbers/direct,-0.643011,9.390179e-118,0.643011
2,3085,numbers/direct,-0.609901,6.384090e-103,0.609901
3,3085,log/direct,-0.606534,1.646729e-101,0.606534
4,665,numbers/direct,-0.573848,1.201617e-88,0.573848
...,...,...,...,...,...
53243,4091,even/direct,,,
53244,4092,even/direct,,,
53245,4093,even/direct,,,
53246,4094,even/direct,,,


In [None]:
# Bar chart of the strong property correlations found for the number embeddings.
plot = number_analyzer.plot.strong_property_correlation_bar_chart()
register_plot("strong_property_correlations", plot)

In [11]:
# pandas and make_sequences are imported in the notebook's first cell, so every
# dependency is visible up front and this cell no longer hides imports.
# Build the reference sequences and inspect the Gaussian-smoothed Fibonacci one.
sequences = make_sequences(999)
sequences['fibonacci/gauss']

array([9.61084160e-01, 9.21910000e-01, 8.42312634e-01, 7.29422202e-01,
       6.03294393e-01, 4.88528418e-01, 4.00699025e-01, 3.37718703e-01,
       2.84482422e-01, 2.32746903e-01, 1.95014996e-01, 1.88030306e-01,
       2.03468066e-01, 2.08372785e-01, 1.78688071e-01, 1.23639802e-01,
       7.35911573e-02, 5.39919159e-02, 7.35242410e-02, 1.23203453e-01,
       1.76472108e-01, 1.99541564e-01, 1.76035759e-01, 1.20987490e-01,
       6.47599366e-02, 2.69959580e-02, 8.83122065e-03, 2.65231219e-03,
       2.65231219e-03, 8.83122065e-03, 2.69959580e-02, 6.47599366e-02,
       1.20987490e-01, 1.76035759e-01, 1.99474648e-01, 1.76035759e-01,
       1.20987490e-01, 6.47599366e-02, 2.69959580e-02, 8.76430436e-03,
       2.21596317e-03, 4.36349021e-04, 6.69162896e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.69162896e-05,
       4.36349021e-04, 2.21596317e-03, 8.76430436e-03, 2.69959580e-02,
       6.47599366e-02, 1.20987490e-01, 1.76035759e-01, 1.99474648e-01,
      

In [19]:
# Wrap the 'fibonacci/gauss' sequence values in a frame and expose the
# positional index as an 'index' column for plotting.
numbers_df = pd.DataFrame({'number': sequences['fibonacci/gauss']}).reset_index()


In [21]:
# Interactive line chart of the smoothed sequence value against its position.
plot = (
    alt.Chart(numbers_df)
    .mark_line()
    .encode(alt.X('index'), alt.Y('number:Q'))
    .properties(title='Gaussian Smoothed Fibonacci Sequence')
    .interactive()
)

plot