# Numeric embedding analysis

In [15]:
import altair as alt
from embanalysis.duckdb_loader import DuckDBLoader
from embanalysis.constants import PROJECT_ROOT, HF_MODEL_ALIASES
from embanalysis.analyzer import EmbeddingsAnalyzer

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from umap import UMAP

import warnings

# Lift Altair's row limit so charts built from large DataFrames are not rejected.
alt.data_transformers.disable_max_rows()
# Pass theme="light" as an embed option for the active renderer.
alt.renderers.set_embed_options(theme="light")

RendererRegistry.enable('default')

In [None]:
# Notebook parameters.
# Model whose embedding samples are analysed; may be an alias that is resolved
# through HF_MODEL_ALIASES in the next cell.
model_id = "allenai/OLMo-2-1124-7B"
# Suffix appended to every exported file name.
version = "v1"
# Directory that receives PNG exports.
plots_path = PROJECT_ROOT / "plots"
# Directory that receives SVG exports.
svg_plots_path = PROJECT_ROOT / "thesis" / "src" / "plots"
# Directory that receives CSV exports.
csv_path = PROJECT_ROOT / "thesis" / "src" / "csv"
# When true, register_plot / register_df return their input so the cell shows it.
enable_display = True
# When true, register_plot / register_df write files to disk.
enable_save = False

In [None]:
# Map an alias to its full model id; ids without an alias pass through unchanged.
model_id = HF_MODEL_ALIASES.get(model_id, model_id)
# Keep only the text after the last "/" for use in exported file names.
model_name = model_id.rsplit("/", 1)[-1]


def register_plot(name, plot, save=enable_save, display=enable_display):
    """Optionally export a chart to disk and return it for inline display.

    Args:
        name: Short identifier placed between the model name and the version
            in the exported file names.
        plot: Chart object exposing ``save(path)``.
        save: When true, write ``<model_name>_<name>_<version>.svg`` into
            ``svg_plots_path`` and the matching ``.png`` into ``plots_path``.
            The default is the value ``enable_save`` had when this function
            was defined, so re-run this cell after changing that flag.
        display: When true, return ``plot`` so that a notebook cell ending in
            this call renders it. The default is captured from
            ``enable_display`` in the same way.

    Returns:
        ``plot`` if ``display`` is true, otherwise ``None``.
    """
    if save:
        # Both output directories are created on demand.
        plots_path.mkdir(parents=True, exist_ok=True)
        svg_plots_path.mkdir(parents=True, exist_ok=True)

        file_name = f"{model_name}_{name}_{version}"

        plot.save(svg_plots_path / f"{file_name}.svg")
        plot.save(plots_path / f"{file_name}.png")

    if display:
        return plot
    return None

def register_df(name, df, save=None, display=None):
    """Optionally export a DataFrame as CSV and return it for inline display.

    Args:
        name: Short identifier placed between the model name and the version
            in the exported file name.
        df: Object exposing ``to_csv(path, index=...)``.
        save: When true, write ``<model_name>_<name>_<version>.csv`` into
            ``csv_path``. ``None`` (the default) falls back to the current
            value of ``enable_save``.
        display: When true, return ``df`` so that a notebook cell ending in
            this call renders it. ``None`` (the default) falls back to the
            current value of ``enable_display``.

    Returns:
        ``df`` if display is enabled, otherwise ``None``.
    """
    # Resolve the notebook-level flags at call time, as the original did.
    if save is None:
        save = enable_save
    if display is None:
        display = enable_display

    if save:
        # to_csv does not create missing directories, so make sure it exists.
        csv_path.mkdir(parents=True, exist_ok=True)
        df.to_csv(csv_path / f"{model_name}_{name}_{version}.csv", index=False)

    if display:
        return df
    return None

In [None]:
# Obtain the default DuckDB loader; read_only=True is passed, presumably so the
# notebook cannot modify the database — confirm against DuckDBLoader.
loader = DuckDBLoader.default(read_only=True)

# Load samples for the specified model
samples = loader.get_model_samples(model_id)
# Trailing expression: shown as the cell output.
samples

In [None]:
# Build one analyzer per sample set: "integers" first, then "random".
number_analyzer, random_analyzer = (
    EmbeddingsAnalyzer.from_sample(samples[sample_key])
    for sample_key in ("integers", "random")
)

# Report the size of each embeddings table, then preview the number embeddings.
print(f"Number embeddings shape: {number_analyzer.embeddings_df.shape}")
print(f"Random embeddings shape: {random_analyzer.embeddings_df.shape}")
number_analyzer.embeddings_df.head()

# Linear Dimensionality Reduction

## Principal Component Analysis

In [None]:
# Fit PCA separately on the number and the random embeddings.
# random_state is pinned to the same seed the t-SNE and UMAP cells use, so the
# result is reproducible whenever scikit-learn selects a randomized solver.
# NOTE(review): the two fits request different component counts (1000 vs 100);
# confirm that this asymmetry is intended.
number_pca = number_analyzer.run_estimator(PCA(n_components=1000, random_state=42))
random_pca = random_analyzer.run_estimator(PCA(n_components=100, random_state=42))

# Side-by-side gradient plots, each with its own colour scale.
plot = alt.hconcat(
    number_pca.plot.components(plot_type="gradient"),
    random_pca.plot.components(plot_type="gradient"),
).resolve_scale(color="independent")

register_plot("pca_components_gradient", plot)

### Explained variance

In [None]:
# Variance overview of the PCA fitted on the number embeddings.
plot = number_pca.plot.variance_overview()
register_plot("pca_variance_overview", plot)

## Singular Value Decomposition

In [None]:
# TruncatedSVD uses a randomized algorithm by default, so the seed is pinned
# (same value as the t-SNE and UMAP cells) to make the figures reproducible.
number_svd = number_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)
random_svd = random_analyzer.run_estimator(
    TruncatedSVD(n_components=100, random_state=42)
)

# Side-by-side gradient plots, each with its own colour scale.
plot = alt.hconcat(
    number_svd.plot.components(plot_type="gradient"),
    random_svd.plot.components(plot_type="gradient"),
).resolve_scale(color="independent")

register_plot("svd_components_gradient", plot)

In [None]:
# Both panels plot component 0 against component 1 of the number SVD; they
# differ only in colouring ("digit_length" vs "digit" at position 2).
component_axes = {"x_component": 0, "y_component": 1}
plot = alt.hconcat(
    number_svd.plot.components(plot_type="digit_length", **component_axes),
    number_svd.plot.components(plot_type="digit", digit_position=2, **component_axes),
).resolve_scale(color="independent")

register_plot("svd_digit_visualizations", plot)

### Correlation heatmap

In [None]:
# Compare correlation heatmaps between number and random embeddings.
# number_svd and random_svd are reused from the SVD cell above rather than
# refitted, so every SVD figure is drawn from the same decomposition.
plot = alt.hconcat(
    number_svd.plot.correlation_heatmap(20),
    random_svd.plot.correlation_heatmap(20),
)

register_plot("svd_correlation_heatmap", plot)

### Components with maximum correlation

In [None]:
# NOTE(review): this PCA plot is given the top-correlation table computed from
# number_svd, whereas the SVD cell that follows passes no corr_df at all —
# confirm that mixing the two decompositions here is intentional.
plot = number_pca.plot.top_correlated_components(
    n_vectors=10, corr_df=number_svd.top_correlations_df(10)
)
register_plot("pca_top_correlated_components", plot)

In [None]:
# Top-correlated components of the number SVD (n_vectors=10, no explicit corr_df).
plot = number_svd.plot.top_correlated_components(n_vectors=10)
register_plot("svd_top_correlated_components", plot)

# Non-Linear Dimensionality Reduction

## t-SNE

In [None]:
# Settings shared by both t-SNE fits; the seed is fixed.
tsne_kwargs = {
    "perplexity": 75,
    "max_iter": 3000,
    "learning_rate": 50,
    "early_exaggeration": 20,
    "random_state": 42,
}

# Fit a separate t-SNE estimator, with identical settings, on each embedding set.
number_tsne = number_analyzer.run_estimator(TSNE(**tsne_kwargs))
random_tsne = random_analyzer.run_estimator(TSNE(**tsne_kwargs))

# Side-by-side gradient plots, each with its own colour scale.
gradient_panels = (
    number_tsne.plot.components(plot_type="gradient"),
    random_tsne.plot.components(plot_type="gradient"),
)
plot = alt.hconcat(*gradient_panels).resolve_scale(color="independent")

register_plot("tsne_components_gradient", plot)

In [None]:
# t-SNE digit visualizations: component 0 vs component 1, coloured by
# "digit_length" (left) and by "digit" at position 2 (right).
# NOTE(review): this cell resolves the colour *legend* independently, while the
# SVD digit cell resolves the colour *scale* — confirm which is intended.
component_axes = {"x_component": 0, "y_component": 1}
plot = alt.hconcat(
    number_tsne.plot.components(plot_type="digit_length", **component_axes),
    number_tsne.plot.components(plot_type="digit", digit_position=2, **component_axes),
).resolve_legend(color="independent")

register_plot("tsne_digit_visualizations", plot)

## UMAP

### Cosine similarity

In [None]:
# Settings shared by the UMAP fits; the seed is fixed.
umap_kwargs = {
    # Raised from the default of 15 to preserve more global structure
    "n_neighbors": 50,
    # Lowered from the default of 0.1 for tighter local clusters
    "min_dist": 0.05,
    "metric": "cosine",
    # Raised from the default of 1.0 to spread out the visualization
    "spread": 1.5,
    # Raised to enhance local structure preservation
    "local_connectivity": 2,
    "random_state": 42,
}

# Every warning raised while fitting is silenced inside this context only.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_cos = number_analyzer.run_estimator(UMAP(**umap_kwargs))
    random_umap_cos = random_analyzer.run_estimator(UMAP(**umap_kwargs))

# Side-by-side gradient plots, each with its own colour scale.
gradient_panels = (
    number_umap_cos.plot.components(plot_type="gradient"),
    random_umap_cos.plot.components(plot_type="gradient"),
)
plot = alt.hconcat(*gradient_panels).resolve_scale(color="independent")

register_plot("umap_cosine_components_gradient", plot)

In [None]:
# UMAP cosine digit visualizations: component 0 vs component 1, coloured by
# "digit_length" (left) and by "digit" at position 2 (right).
# NOTE(review): this cell resolves the colour *legend* independently, while the
# UMAP Euclidean digit cell resolves the colour *scale* — confirm which is intended.
component_axes = {"x_component": 0, "y_component": 1}
plot = alt.hconcat(
    number_umap_cos.plot.components(plot_type="digit_length", **component_axes),
    number_umap_cos.plot.components(
        plot_type="digit", digit_position=2, **component_axes
    ),
).resolve_legend(color="independent")

register_plot("umap_cosine_digit_visualizations", plot)

### Euclidean distance

In [None]:
# Reuse the UMAP settings from the cosine cell, switching only the metric.
# This mutates umap_kwargs in place.
umap_kwargs["metric"] = "euclidean"

# Every warning raised while fitting is silenced inside this context only.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    number_umap_euc = number_analyzer.run_estimator(UMAP(**umap_kwargs))
    random_umap_euc = random_analyzer.run_estimator(UMAP(**umap_kwargs))

# Side-by-side gradient plots, each with its own colour scale.
gradient_panels = (
    number_umap_euc.plot.components(plot_type="gradient"),
    random_umap_euc.plot.components(plot_type="gradient"),
)
plot = alt.hconcat(*gradient_panels).resolve_scale(color="independent")

register_plot("umap_euclidean_components_gradient", plot)

In [None]:
# UMAP Euclidean digit visualizations: component 0 vs component 1, coloured by
# "digit_length" (left) and by "digit" at position 2 (right).
component_axes = {"x_component": 0, "y_component": 1}
plot = alt.hconcat(
    number_umap_euc.plot.components(plot_type="digit_length", **component_axes),
    number_umap_euc.plot.components(
        plot_type="digit", digit_position=2, **component_axes
    ),
).resolve_scale(color="independent")

register_plot("umap_euclidean_digit_visualizations", plot)

# Feature Analysis

In [11]:
# Display the feature-to-sequence analysis table; the captured output below has
# the columns Dimension, Property, Encoding, Correlation and P_Value.
number_analyzer.feature_to_sequence_analysis_df()



Unnamed: 0,Dimension,Property,Encoding,Correlation,P_Value
0,514,log,direct,-0.672870,8.446593e-133
1,514,numbers,direct,-0.643011,9.390179e-118
2,514,even,direct,-0.643011,9.390179e-118
3,3085,even,direct,-0.609901,6.384090e-103
4,3085,numbers,direct,-0.609901,6.384090e-103
...,...,...,...,...,...
217083,4094,factorials,fourier_cos_T1,,
217084,4095,primes,fourier_cos_T1,,
217085,4095,fibonacci,fourier_cos_T1,,
217086,4095,triangular,fourier_cos_T1,,


In [1]:
# Bar chart of strong property correlations for the number embeddings.
# Requires number_analyzer from the analyzer cell earlier in the notebook.
plot = number_analyzer.plot.strong_property_correlation_bar_chart()
register_plot("strong_property_correlations", plot)

NameError: name 'number_analyzer' is not defined

In [None]:
# Number of rows in the number-embeddings table.
len(number_analyzer.embeddings_df)

In [None]:
# Keep the feature-to-sequence analysis table for the filtering cell that follows.
df = number_analyzer.feature_to_sequence_analysis_df()

In [None]:
# The Fourier labels are values of the "Encoding" column (the "Property" column
# holds sequence names such as "primes"), so the filter must use "Encoding".
df[df["Encoding"].isin(["fourier_cos_T1", "fourier_sin_T1"])].head(10)

In [None]:
import pandas as pd
from embanalysis.feature_analysis import make_encoded_sequences

# Build the encoded sequences, sized to the number of embedding rows.
n_rows = len(number_analyzer.embeddings_df)
df = pd.DataFrame(make_encoded_sequences(n_rows))
# Join each multi-part column label into a single "a/b" string.
df.columns = df.columns.map("/".join)

# Expose the row index as a regular column so charts can reference it.
df = df.reset_index()
df

In [None]:
import altair as alt

# Dashed dark-red rule marks: x is the "index" column, y the
# "fibonacci/binary" column read as quantitative.
rule_chart = alt.Chart(df).mark_rule(strokeDash=[10, 5], color='darkred')
rule_chart.encode(x='index', y='fibonacci/binary:Q')

In [None]:
# The analysis table labels this sequence "primes" (plural); "prime" does not
# appear among its Property values, so the plural name is used here.
number_analyzer.plot.feature_with_discrete_sequences(184, ["fibonacci", "primes"])

In [None]:
import pandas as pd
from embanalysis.feature_analysis import make_encoded_sequences

# NOTE(review): the earlier cell passes len(number_analyzer.embeddings_df) to
# this function; confirm that the literal 999 is the intended size here.
sequences = make_encoded_sequences(999)
# Select the sequence stored under the ('fibonacci', 'gauss') key.
seq = sequences['fibonacci', 'gauss']
seq

In [None]:
# Wrap the selected sequence in a DataFrame and turn its index into a regular
# column so the chart can use it as the x axis.
numbers_df = pd.DataFrame({'number': seq}).reset_index()

# Interactive Altair line chart of the sequence values against the index column.
line_chart = alt.Chart(numbers_df).mark_line().encode(x='index', y='number:Q')
plot = line_chart.properties(
    title='Gaussian Smoothed Fibonacci Sequence'
).interactive()

plot