# Latent Space Topologies - Embedding Exploration

**Experiment:** Explore and visualize the geometry of embedding spaces

**Date:** 2025-11-05

**Research Question:** How can we understand and experience high-dimensional latent representations?

**Goals:**
- Load embedding model
- Create embedding space from text corpus
- Visualize topology and geometry
- Identify concept clusters and boundaries
- Prepare data for mobile app

## Setup

In [None]:
# 1. Add repository root to path to import harness
# NOTE(review): this is a relative path, so it is resolved against the current
# working directory of the kernel, not against this notebook's location —
# confirm the number of '../' hops matches where the notebook is launched from.
import sys
sys.path.append('../../../../')  # Go up to repo root from topologies/notebooks/

# 2. Import standard libraries
import numpy as np
import matplotlib.pyplot as plt

# 3. Import dimensionality reduction algorithms
from sklearn.decomposition import PCA       # Principal Component Analysis
from sklearn.manifold import TSNE            # t-SNE
import umap                                  # UMAP

# 4. Import distance and clustering utilities
from scipy.spatial.distance import pdist, squareform  # Distance calculations
from scipy.cluster.hierarchy import linkage, dendrogram  # Hierarchical clustering

# 5. Import harness experiment tracking
# Presumably resolved through the sys.path entry appended in step 1 — verify.
from harness import ExperimentConfig, ExperimentResult, get_tracker

# 6. Confirm successful imports
# Only reached when every import above succeeded.
print("Imports successful")

## 1. Load Embedding Model

Load a sentence embedding model (e.g., from sentence-transformers)

In [None]:
# Try to bring up the real embedding backend. An ImportError anywhere in the
# attempt switches the notebook to its synthetic-embedding fallback.
try:
    from sentence_transformers import SentenceTransformer

    model_name = 'all-MiniLM-L6-v2'  # Lightweight model
    embedding_model = SentenceTransformer(model_name)

    print(f"Loaded model: {model_name}")
    print(f"Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")
except ImportError:
    # Record the fallback first, then tell the user how to enable real embeddings.
    use_embeddings = False
    print("sentence-transformers not available")
    print("Install with: pip install sentence-transformers")
else:
    # Reached only when the import, model load and both prints succeeded.
    use_embeddings = True

## 2. Create Text Corpus

Define a diverse set of concepts to embed and explore

In [None]:
# Sample concepts grouped by semantic domain. Dict insertion order fixes the
# order in which the concepts appear in the flattened corpus below.
concepts_by_domain = {
    "Emotions": [
        "happiness", "sadness", "anger", "fear",
        "surprise", "disgust", "joy", "anxiety",
    ],
    "Animals": [
        "dog", "cat", "bird", "fish",
        "elephant", "tiger", "whale", "butterfly",
    ],
    "Colors": [
        "red", "blue", "green", "yellow",
        "orange", "purple", "black", "white",
    ],
    "Abstract concepts": [
        "freedom", "justice", "truth", "beauty",
        "wisdom", "courage", "compassion", "creativity",
    ],
    "Technology": [
        "computer", "internet", "artificial intelligence", "smartphone",
        "robot", "algorithm", "data", "network",
    ],
    "Nature": [
        "mountain", "ocean", "forest", "desert",
        "river", "sun", "moon", "stars",
    ],
    "Actions": [
        "running", "jumping", "thinking", "learning",
        "creating", "communicating", "exploring", "discovering",
    ],
}

# Flatten the domain groups into the single ordered list that gets embedded.
corpus = [
    concept
    for domain_concepts in concepts_by_domain.values()
    for concept in domain_concepts
]

# Plot labels: an independent copy, so edits to one list never affect the other.
labels = list(corpus)

print(f"Corpus size: {len(corpus)} concepts")

## 3. Generate Embeddings

In [None]:
# Build the (n_concepts, embedding_dim) matrix that every later cell consumes.
if use_embeddings:
    # Encode every concept in the corpus with the loaded model.
    embeddings = embedding_model.encode(corpus, show_progress_bar=True)

    # Quick sanity summary of the encoded values.
    print(f"Embeddings shape: {embeddings.shape}")
    print("Embedding statistics:")
    print(f"  Mean: {np.mean(embeddings):.4f}")
    print(f"  Std: {np.std(embeddings):.4f}")
    print(f"  Min: {np.min(embeddings):.4f}")
    print(f"  Max: {np.max(embeddings):.4f}")
else:
    # Synthetic fallback for demonstration. The generator is seeded so that the
    # plots, exported JSON and tracked metrics derived from these values are
    # reproducible across runs, in line with the fixed random_state=42 used by
    # the t-SNE and UMAP cells.
    # 384 columns: presumably chosen to match the real model's width — confirm.
    synthetic_rng = np.random.default_rng(42)
    embeddings = synthetic_rng.standard_normal((len(corpus), 384))
    print("Using synthetic embeddings for demonstration")

## 4. Visualize with PCA

In [None]:
# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
embeddings_2d_pca = pca.fit_transform(embeddings)
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_

# Scatter plot of the 2D projection, drawn through the explicit Axes API.
fig, ax = plt.subplots(figsize=(14, 10))
ax.scatter(embeddings_2d_pca[:, 0], embeddings_2d_pca[:, 1], alpha=0.6, s=100)

# Tag each point with its concept label, nudged slightly off the marker.
for idx, label in enumerate(labels):
    x, y = embeddings_2d_pca[idx]
    ax.annotate(
        label,
        (x, y),
        fontsize=9,
        alpha=0.8,
        xytext=(5, 5),
        textcoords='offset points'
    )

# Axis labels carry the share of variance captured by each component.
ax.set_xlabel(f'PC1 ({pc1_ratio:.1%} variance)')
ax.set_ylabel(f'PC2 ({pc2_ratio:.1%} variance)')
ax.set_title('Concept Space Topology (PCA)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.1%}")

## 5. Visualize with t-SNE

In [None]:
# Embed the concepts in 2D with t-SNE; random_state is pinned so repeated runs
# start from the same seed.
tsne = TSNE(n_components=2, perplexity=15, random_state=42)
embeddings_2d_tsne = tsne.fit_transform(embeddings)

# Scatter plot of the 2D layout, drawn through the explicit Axes API.
fig, ax = plt.subplots(figsize=(14, 10))
ax.scatter(embeddings_2d_tsne[:, 0], embeddings_2d_tsne[:, 1], alpha=0.6, s=100)

# Tag each point with its concept label, nudged slightly off the marker.
for idx, label in enumerate(labels):
    x, y = embeddings_2d_tsne[idx]
    ax.annotate(
        label,
        (x, y),
        fontsize=9,
        alpha=0.8,
        xytext=(5, 5),
        textcoords='offset points'
    )

ax.set_xlabel('t-SNE Dimension 1')
ax.set_ylabel('t-SNE Dimension 2')
ax.set_title('Concept Space Topology (t-SNE)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Visualize with UMAP

In [None]:
# Embed the concepts in 2D with UMAP; random_state is pinned so repeated runs
# start from the same seed.
reducer = umap.UMAP(n_components=2, random_state=42)
embeddings_2d_umap = reducer.fit_transform(embeddings)

# Scatter plot of the 2D layout, drawn through the explicit Axes API.
fig, ax = plt.subplots(figsize=(14, 10))
ax.scatter(embeddings_2d_umap[:, 0], embeddings_2d_umap[:, 1], alpha=0.6, s=100)

# Tag each point with its concept label, nudged slightly off the marker.
for idx, label in enumerate(labels):
    x, y = embeddings_2d_umap[idx]
    ax.annotate(
        label,
        (x, y),
        fontsize=9,
        alpha=0.8,
        xytext=(5, 5),
        textcoords='offset points'
    )

ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.set_title('Concept Space Topology (UMAP)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Compute Distance Matrix

Analyze pairwise cosine similarity between concepts (cosine distance is 1 − similarity)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of concept embeddings.
similarity_matrix = cosine_similarity(embeddings)

# Heatmap of the full matrix; the colour scale is pinned to [-1, 1].
tick_positions = range(len(labels))
plt.figure(figsize=(16, 14))
plt.imshow(similarity_matrix, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Cosine Similarity')
plt.xticks(tick_positions, labels, rotation=90, fontsize=8)
plt.yticks(tick_positions, labels, fontsize=8)
plt.title('Concept Similarity Matrix')
plt.tight_layout()
plt.show()

# Collect each unordered pair exactly once (row < col, diagonal excluded) as
# (label_a, label_b, similarity), then rank from most to least similar.
n_concepts = len(labels)
similarities = [
    (labels[row], labels[col], similarity_matrix[row, col])
    for row in range(n_concepts)
    for col in range(row + 1, n_concepts)
]
similarities.sort(key=lambda pair: pair[2], reverse=True)

# Head of the ranking: the five highest-scoring pairs (fewer if unavailable).
print("\nMost similar concept pairs:")
for first, second, score in similarities[:5]:
    print(f"  {first} <-> {second}: {score:.3f}")

# Tail of the ranking: the five lowest-scoring pairs, still in descending order.
print("\nLeast similar concept pairs:")
for first, second, score in similarities[-5:]:
    print(f"  {first} <-> {second}: {score:.3f}")

## 8. Hierarchical Clustering

Identify concept clusters and boundaries

In [None]:
# Condensed vector of pairwise cosine distances between concept embeddings.
distances = pdist(embeddings, metric='cosine')

# Ward linkage is only correctly defined for Euclidean distances, so feeding it
# cosine distances produces merge heights with no valid interpretation.
# Average linkage (UPGMA) is well defined for any precomputed distance vector.
linkage_matrix = linkage(distances, method='average')

# Plot dendrogram: leaves are concepts, merge height is the cluster distance.
plt.figure(figsize=(16, 8))
dendrogram(linkage_matrix, labels=labels, leaf_font_size=10)
plt.xlabel('Concepts')
plt.ylabel('Distance')
plt.title('Concept Hierarchy (Hierarchical Clustering)')
plt.tight_layout()
plt.show()

## 9. Export for Mobile App

Prepare data for the Latent Topologies mobile application

In [None]:
import json
import os

# Export concept constellation data: one record per concept holding its full
# embedding plus its 2D position under each projection, and a metadata header.
constellation_data = {
    'concepts': [
        {
            'id': i,
            'label': label,
            'embedding': embeddings[i].tolist(),
            'position_pca': embeddings_2d_pca[i].tolist(),
            'position_tsne': embeddings_2d_tsne[i].tolist(),
            'position_umap': embeddings_2d_umap[i].tolist(),
        }
        for i, label in enumerate(labels)
    ],
    'metadata': {
        'n_concepts': len(labels),
        'embedding_dim': embeddings.shape[1],
        # model_name only exists when the real model was loaded.
        'model': model_name if use_embeddings else 'synthetic',
    }
}

# Save to data directory
output_path = '../data/constellation_example.json'
# Create the target directory first so a fresh checkout does not fail with
# FileNotFoundError on open().
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Explicit UTF-8 keeps the file independent of the platform default encoding.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(constellation_data, f, indent=2)

print(f"Exported constellation data to: {output_path}")
print(f"  {len(labels)} concepts")
print(f"  {embeddings.shape[1]}-dimensional embeddings")
print(f"  {len(labels)} concepts")
print(f"  {embeddings.shape[1]}-dimensional embeddings")

## 10. Track Experiment

In [None]:
# Describe the run; provider and model reflect whether the real embedding model
# or the synthetic fallback produced the embeddings.
provider_name = "sentence_transformers" if use_embeddings else "synthetic"
model_label = model_name if use_embeddings else "random"
config = ExperimentConfig(
    experiment_name="latent_topologies_embedding_exploration",
    task_type="embedding_analysis",
    strategy="dimensionality_reduction",
    provider=provider_name,
    model=model_label,
)

# Open the tracked run before assembling its result.
tracker = get_tracker()
run_dir = tracker.start_experiment(config)

# Average similarity is taken over the strict upper triangle only, so each pair
# is counted once and the diagonal is ignored.
upper_triangle = np.triu_indices_from(similarity_matrix, k=1)
run_metadata = {
    'n_concepts': len(labels),
    'embedding_dim': embeddings.shape[1],
    'pca_variance': float(sum(pca.explained_variance_ratio_)),
    'avg_similarity': float(np.mean(similarity_matrix[upper_triangle])),
}

result = ExperimentResult(
    config=config,
    task_input=f"Analyze {len(corpus)} concepts",
    output=f"Created constellation with {len(labels)} concepts",
    eval_scores={},
    eval_metadata=run_metadata,
    success=True,
)

# Record the result, close the run, and report where it was stored.
tracker.log_result(result)
summary = tracker.finish_experiment()
print(f"Experiment logged in: {run_dir}")

## Key Insights

**Observations:**
- Different dimensionality reduction methods reveal different aspects of topology
- PCA preserves global structure but loses local relationships
- t-SNE emphasizes local clusters
- UMAP balances global and local structure
- Semantic clusters emerge naturally (emotions, animals, colors, etc.)

**Research Questions:**
- How stable are these topologies across different models?
- Can we navigate latent space intuitively on mobile?
- What audio/haptic mappings best represent semantic distances?
- How do concept boundaries shift with context?

## Next Steps

1. **Mobile Integration**: Load constellation data into React Native app
2. **Audio Mapping**: Design sound synthesis for semantic navigation
3. **Haptic Feedback**: Map concept boundaries to vibration patterns
4. **Interactive Annotation**: Allow users to reshape/annotate space

## Mobile App Features

Based on this exploration, the mobile app should:
- Display concepts as a visual constellation
- Allow touch navigation through semantic space
- Provide audio feedback for semantic distance
- Give haptic feedback when crossing concept boundaries
- Support annotation and space manipulation