# 04: Embedding Space

Build the embedding space from CLAP embeddings: kNN graph construction, Leiden community detection, UMAP projection with parameter sweep, and per-community valence-arousal analysis with audio playback.

## Background

CLAP embeddings (512-d) are transformed into an interactive 2D visualization through three stages:

1. **kNN graph**: Encodes local similarity structure as a sparse affinity matrix.
2. **Leiden communities**: Partitions the graph into coherent clusters via modularity optimization.
3. **UMAP projection**: Reduces dimensionality while preserving neighborhood structure.

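We evaluate projection quality with three metrics: trustworthiness (are points that are neighbors in the 2D projection also neighbors in the original space?), continuity (do neighbors in the original space remain neighbors in the projection?), and Shepard correlation (how well pairwise distances in the projection track pairwise distances in the original space).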

## Setup

In [None]:
import sys
from pathlib import Path

# Project root is taken to be two directory levels above the notebook's
# current working directory.
project_root = Path.cwd().parent.parent

# Put the project root on the import path exactly once: re-running this cell
# would otherwise keep stacking duplicate entries at the front of sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.patches import Rectangle
from IPython.display import Audio, display

from core.embed import load_embeddings
from core.space.knn import build_knn, KNNConfig
from core.space.leiden import leiden_partition, LeidenConfig
from core.space.umap import umap_layout, UMAPConfig
from eval.manifold import score_projection_quality
from eval.cluster import score_cluster_quality

## Load Data

In [None]:
# Location of the saved CLAP embedding archive, relative to the project root.
embeddings_path = project_root / "notebooks/data/embeddings/clap_embeddings_normalized.npz"
embeddings_list = load_embeddings(embeddings_path)

# Split the loaded records into two parallel arrays built in the same order:
# entry i of track_ids identifies the track whose vector is stacked at
# position i of embeddings.
track_ids = np.array([e.track_id for e in embeddings_list])
embeddings = np.vstack([e.embedding for e in embeddings_list])

print(f"Loaded {len(track_ids)} tracks")
print(f"Embedding shape: {embeddings.shape}")

In [None]:
metadata_path = project_root / "notebooks/data/merge_preprocessed.csv"
df_meta = pd.read_csv(metadata_path)

# Lookup from track id to its row position in the embedding arrays.
track_id_to_idx = dict(zip(track_ids, range(len(track_ids))))

# Attach the embedding row to every metadata record, keep only the records
# that actually have an embedding, and store the row position as an integer.
df_meta = (
    df_meta.assign(emb_idx=df_meta["song_id"].map(track_id_to_idx))
    .dropna(subset=["emb_idx"])
    .reset_index(drop=True)
    .astype({"emb_idx": int})
)

print(f"Metadata: {len(df_meta)} tracks with embeddings")
df_meta[["song_id", "artist", "title", "quadrant", "arousal", "valence"]].head()

## Build kNN Graph

Construct a k-nearest neighbor graph where edge weights encode cosine similarity. The graph is symmetrized using max-pooling to ensure undirected edges.

In [None]:
# Request a 15-nearest-neighbor graph, symmetrized in "max" mode.
knn_config = KNNConfig(k=15, symmetrize=True, symmetrize_mode="max")
knn = build_knn(embeddings, knn_config)

print(f"kNN Graph:")
print(f"  Nodes: {knn.n_nodes}")
# The stored-entry count is halved on the assumption that the symmetrized
# adjacency holds each undirected edge twice (once per direction) and has
# no self-loops -- TODO confirm against build_knn.
print(f"  Edges: {knn.adjacency.nnz // 2}")
print(f"  Reciprocity: {knn.reciprocity():.3f}")
# Share of the n_nodes x n_nodes adjacency matrix with no stored entry.
print(f"  Sparsity: {1.0 - knn.adjacency.nnz / (knn.n_nodes ** 2):.6f}")

## Community Detection

Partition the kNN graph using the Leiden algorithm with resolution parameter 1.0. Modularity above 0.3 indicates meaningful community structure.

In [None]:
# Partition the kNN graph with Leiden at resolution 1.0; the seed is pinned
# so every run of this cell passes the same seed to the algorithm.
leiden_config = LeidenConfig(resolution=1.0, seed=42)
leiden_result = leiden_partition(knn, leiden_config)

# Report how many communities were found and the partition's modularity.
print(f"Leiden Partition:")
print(f"  Communities: {leiden_result.n_communities}")
print(f"  Modularity: {leiden_result.modularity:.4f}")

In [None]:
# Tally how many tracks fall into each community label.
membership = leiden_result.membership
community_ids, community_sizes = np.unique(membership, return_counts=True)
# Positions that order the communities from largest to smallest.
size_order = np.argsort(community_sizes)[::-1]

# Bar chart of the community sizes in descending order.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(
    np.arange(community_sizes.size),
    community_sizes[size_order],
    color="steelblue",
    alpha=0.8,
)
ax.set(
    xlabel="Community (sorted by size)",
    ylabel="Number of tracks",
    title="Community Size Distribution",
)
fig.tight_layout()
plt.show()

In [None]:
# size statistics: smallest, largest, mean and median number of tracks per
# community, computed from the per-community counts
print(f"Community sizes:")
print(f"  Min: {community_sizes.min()}")
print(f"  Max: {community_sizes.max()}")
print(f"  Mean: {community_sizes.mean():.1f}")
print(f"  Median: {np.median(community_sizes):.1f}")

## UMAP Parameter Sweep

UMAP has two key parameters:

- **`n_neighbors`**: Controls the balance between local and global structure. Lower values emphasize local clusters; higher values preserve more global topology.
- **`min_dist`**: Controls cluster compactness. Lower values create tighter clusters; higher values spread points more uniformly.

We sweep both parameters, record trustworthiness, continuity, and Shepard correlation for every combination, and compare configurations by trustworthiness (neighborhood preservation).

In [None]:
# Grid of UMAP settings to evaluate: the sweep loop pairs every n_neighbors
# value with every min_dist value (5 x 4 = 20 projections).
n_neighbors_values = [5, 10, 15, 25, 50]
min_dist_values = [0.0, 0.1, 0.25, 0.5]

In [None]:
# One record per (n_neighbors, min_dist) combination.
sweep_results = []

for n_neighbors in n_neighbors_values:
    for min_dist in min_dist_values:
        # Project with this parameter pair; the seed is fixed and the quality
        # metrics are requested with quality_subsample_n=2000.
        result = umap_layout(
            embeddings,
            UMAPConfig(
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                seed=42,
                compute_quality=True,
                quality_subsample_n=2000,
            ),
        )
        quality = result.quality

        sweep_results.append(
            dict(
                n_neighbors=n_neighbors,
                min_dist=min_dist,
                trustworthiness=quality.trustworthiness,
                continuity=quality.continuity,
                shepard_rho=quality.shepard_rho,
            )
        )

        # Progress line so long sweeps show which combination just finished.
        print(f"n_neighbors={n_neighbors:2d}, min_dist={min_dist:.2f} -> trust={result.quality.trustworthiness:.3f}")

df_sweep = pd.DataFrame(sweep_results)

In [None]:
# show the full sweep table: one row per (n_neighbors, min_dist) combination
df_sweep

## Visualize Parameter Sweep

In [None]:
# pivot for heatmap: rows are min_dist values, columns are n_neighbors values
pivot = df_sweep.pivot(index="min_dist", columns="n_neighbors", values="trustworthiness")

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(pivot, annot=True, fmt=".3f", cmap="RdYlGn", ax=ax, cbar_kws={"label": "Trustworthiness"})
ax.set_xlabel("n_neighbors")
ax.set_ylabel("min_dist")
ax.set_title("UMAP Trustworthiness by Parameters")

# highlight default params (n_neighbors=15, min_dist=0.1) with a red outline.
# The outline is drawn only when the sweep grid contains both values, so
# editing the grids does not make this cell fail with a lookup error.
default_n_neighbors, default_min_dist = 15, 0.1
if default_n_neighbors in pivot.columns and default_min_dist in pivot.index:
    # Heatmap cell (row r, column c) occupies the unit square whose lower-left
    # corner is at (c, r) in axes data coordinates.
    default_col = pivot.columns.get_loc(default_n_neighbors)
    default_row = pivot.index.get_loc(default_min_dist)
    ax.add_patch(Rectangle((default_col, default_row), 1, 1, fill=False, edgecolor="red", lw=3))

plt.tight_layout()
plt.show()

In [None]:
# find best configuration: the sweep row with the highest trustworthiness
# (idxmax returns the first such row if several are tied)
best_idx = df_sweep["trustworthiness"].idxmax()
best_row = df_sweep.loc[best_idx]

print(f"Best configuration:")
# n_neighbors is cast back to int for display because selecting a row that
# also holds float columns yields float values
print(f"  n_neighbors: {int(best_row['n_neighbors'])}")
print(f"  min_dist: {best_row['min_dist']:.2f}")
print(f"  trustworthiness: {best_row['trustworthiness']:.4f}")

## Final UMAP Projection

Run UMAP with default parameters (n_neighbors=15, min_dist=0.1) for the final visualization.

In [None]:
# Final 2D layout with n_neighbors=15, min_dist=0.1 and the same seed (42)
# used throughout the sweep.
# NOTE(review): the sweep sets quality_subsample_n=2000 but this call does
# not, so the metrics below use whatever UMAPConfig defaults to -- confirm
# they are comparable with the sweep values.
umap_config = UMAPConfig(n_neighbors=15, min_dist=0.1, seed=42, compute_quality=True)
umap_result = umap_layout(embeddings, umap_config)
coords = umap_result.coords

# Report the extent of the layout along both axes and its quality metrics.
print(f"UMAP Projection:")
print(f"  X range: [{coords[:, 0].min():.2f}, {coords[:, 0].max():.2f}]")
print(f"  Y range: [{coords[:, 1].min():.2f}, {coords[:, 1].max():.2f}]")
print(f"  Trustworthiness: {umap_result.quality.trustworthiness:.4f}")
print(f"  Continuity: {umap_result.quality.continuity:.4f}")
print(f"  Shepard rho: {umap_result.quality.shepard_rho:.4f}")

In [None]:
# 2D layout with every track colored by its community label
fig, ax = plt.subplots(figsize=(10, 8))
# Transposing the first two coordinate columns yields the x and y series.
scatter = ax.scatter(*coords[:, :2].T, c=membership, cmap="tab20", s=10, alpha=0.7)
ax.set(
    xlabel="UMAP 1",
    ylabel="UMAP 2",
    title=f"Embedding Space ({leiden_result.n_communities} communities)",
)
fig.colorbar(scatter, ax=ax, label="Community")
fig.tight_layout()
plt.show()

## Community Summary Table

In [None]:
# Attach each track's community label and 2D position, looked up through the
# track's row position in the embedding arrays.
emb_rows = df_meta["emb_idx"].to_numpy()
df_meta["community"] = membership[emb_rows]
df_meta["umap_x"] = coords[emb_rows, 0]
df_meta["umap_y"] = coords[emb_rows, 1]

# Output column header for each quadrant code, in table order.
quadrant_headers = {
    "Q1": "Q1 (happy)",
    "Q2": "Q2 (tense)",
    "Q3": "Q3 (sad)",
    "Q4": "Q4 (calm)",
}

# One row per community: its size plus the number of tracks in each quadrant
# (0 when a quadrant does not occur in that community).
summary_rows = []
for cid in sorted(df_meta["community"].unique()):
    subset = df_meta[df_meta["community"] == cid]
    q_counts = subset["quadrant"].value_counts()
    record = {"community": cid, "size": len(subset)}
    record.update({header: q_counts.get(q, 0) for q, header in quadrant_headers.items()})
    summary_rows.append(record)

# Largest communities first.
df_summary = (
    pd.DataFrame(summary_rows)
    .sort_values("size", ascending=False)
    .reset_index(drop=True)
)
df_summary

## Community Selection

In [None]:
# modify this list to explore different communities
# entries are matched against the `community` column of df_meta by the cells
# below; an id with no tracks simply produces an empty panel / empty listing
selected_communities = [5, 6, 7]

## Per-Community Valence-Arousal Analysis

Visualize the valence-arousal distribution for each selected community. Points are colored by quadrant:

| Quadrant | Condition | Color | Label |
|----------|-----------|-------|-------|
| Q1 | arousal >= 0.5, valence >= 0.5 | green | Excited/Happy |
| Q2 | arousal >= 0.5, valence < 0.5 | red | Tense/Angry |
| Q3 | arousal < 0.5, valence < 0.5 | blue | Sad/Depressed |
| Q4 | arousal < 0.5, valence >= 0.5 | orange | Calm/Relaxed |

In [None]:
# Fixed color per valence-arousal quadrant.
quadrant_colors = {
    "Q1": "tab:green",
    "Q2": "tab:red",
    "Q3": "tab:blue",
    "Q4": "tab:orange",
}

# One panel per selected community in a single row. squeeze=False always
# returns a 2D grid of axes, so the row can be taken the same way for any count.
n_selected = len(selected_communities)
fig, axes = plt.subplots(1, n_selected, figsize=(5 * n_selected, 5), squeeze=False)
axes = axes[0]

for ax, cid in zip(axes, selected_communities):
    subset = df_meta[df_meta["community"] == cid]

    # One scatter call per quadrant (even when empty) keeps the legend
    # entries and their order the same in every panel.
    for q, color in quadrant_colors.items():
        in_quadrant = subset["quadrant"] == q
        ax.scatter(
            subset.loc[in_quadrant, "valence"],
            subset.loc[in_quadrant, "arousal"],
            c=color,
            label=q,
            s=30,
            alpha=0.7,
        )

    # Dashed guides at 0.5 on both axes mark the quadrant boundaries.
    for draw_guide in (ax.axhline, ax.axvline):
        draw_guide(0.5, color="gray", linestyle="--", linewidth=0.8)

    ax.set(
        xlim=(0, 1),
        ylim=(0, 1),
        xlabel="Valence",
        ylabel="Arousal",
        title=f"Community {cid} (n={len(subset)})",
    )
    ax.legend(loc="upper left", fontsize=8)

fig.tight_layout()
plt.show()

## Representative Tracks per Community

For each selected community, sample up to 5 tracks from each quadrant and display metadata with audio playback.

In [None]:
def display_community_samples(df: pd.DataFrame, community_id: int, samples_per_quadrant: int = 5) -> None:
    """Print and display a per-quadrant sample of tracks from one community.

    For each quadrant Q1-Q4 that has at least one track in the community, a
    reproducible random sample is shown as a table, followed by an audio
    player for every sampled track whose audio file exists on disk.

    Args:
        df: Track table with at least ``community`` and ``quadrant`` columns.
            Any of the metadata columns listed in ``display_cols`` are shown
            when present; ``audio_path`` is optional, but when a playable
            path is found ``artist`` and ``title`` are read for its caption.
        community_id: Value of the ``community`` column to select.
        samples_per_quadrant: Maximum number of tracks sampled per quadrant.
    """
    # Loop-invariant lookups, built once rather than once per quadrant.
    quadrant_labels = {
        "Q1": "Excited/Happy",
        "Q2": "Tense/Angry",
        "Q3": "Sad/Depressed",
        "Q4": "Calm/Relaxed",
    }
    display_cols = ["song_id", "artist", "title", "arousal", "valence", "genre", "mood_all", "theme", "style"]

    subset = df[df["community"] == community_id]
    print(f"COMMUNITY {community_id} ({len(subset)} tracks)")

    for q, label in quadrant_labels.items():
        q_data = subset[subset["quadrant"] == q]
        if q_data.empty:
            continue

        # Fixed random_state so the same tracks are shown on every run.
        n_sample = min(samples_per_quadrant, len(q_data))
        sampled = q_data.sample(n=n_sample, random_state=42)

        print(f"\n{q} - {label} ({len(q_data)} tracks in quadrant, showing {n_sample})")

        # Only show the metadata columns this table actually has.
        available_cols = [c for c in display_cols if c in sampled.columns]
        display(sampled[available_cols].reset_index(drop=True))

        for _, row in sampled.iterrows():
            audio_path = row.get("audio_path", None)
            # A missing path read from CSV arrives as float NaN, which is
            # truthy and would make Path() raise TypeError, so only real,
            # non-empty path values are accepted.
            if not isinstance(audio_path, (str, Path)) or not audio_path:
                continue
            if Path(audio_path).exists():
                print(f"\n{row['artist']} - {row['title']}")
                display(Audio(audio_path))

In [None]:
# Show sampled tracks (up to 5 per quadrant) for every selected community.
for cid in selected_communities:
    display_community_samples(df_meta, cid, samples_per_quadrant=5)

## Summary

In [None]:
# Recap of the artifacts produced above: graph size, number of communities,
# and the shape of the final coordinate array.
print("Embedding Space Pipeline Complete")
print(f"\nOutputs:")
print(f"  kNN graph: {knn.n_nodes} nodes, {knn.adjacency.nnz // 2} edges")
print(f"  Leiden communities: {leiden_result.n_communities}")
print(f"  UMAP coordinates: {coords.shape}")

# Headline quality numbers; the targets in parentheses are printed for
# reference only and are not checked here.
print(f"\nKey Metrics:")
print(f"  Modularity: {leiden_result.modularity:.4f} (target > 0.3)")
print(f"  Trustworthiness: {umap_result.quality.trustworthiness:.4f} (target > 0.8)")
print(f"  Continuity: {umap_result.quality.continuity:.4f}")
print(f"  Shepard rho: {umap_result.quality.shepard_rho:.4f}")