# V4 Embedding Space Analysis

Structural analysis of code-smriti embeddings:
1. Document type separation
2. Language clustering
3. Hierarchy validation (symbol → file → module)
4. Cross-repo similarity

In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict
from pathlib import Path
import sys

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Try UMAP if available
try:
    import umap
    HAS_UMAP = True
except ImportError:
    HAS_UMAP = False
    print("UMAP not installed. Run: pip install umap-learn")

# Couchbase
# Put the current working directory at the front of sys.path so the
# project-local `storage` package can be imported below (presumably the
# notebook is launched from the project root — confirm).
sys.path.insert(0, str(Path.cwd()))
from storage.couchbase_client import CouchbaseClient

# Global matplotlib style; applies to every figure created after this call.
plt.style.use('seaborn-v0_8-whitegrid')
print("Ready")

## 1. Load Embeddings from Couchbase

In [None]:
# Shared client instance; fetch_embeddings() runs its queries via cb.cluster.
cb = CouchbaseClient()

def fetch_embeddings(doc_type: str, limit: int = 5000) -> pd.DataFrame:
    """Fetch documents of one type that carry an embedding from `code_kosha`.

    Args:
        doc_type: Value compared against the document ``type`` field, e.g.
            ``'file_index'``. It is interpolated into the query text, so it
            must come from trusted code, never from user input.
        limit: Maximum number of rows requested from the server.

    Returns:
        DataFrame with the columns ``doc_id``, ``repo_id``, ``path``,
        ``language``, ``type`` and ``embedding`` (one ``np.ndarray`` per
        row). The columns are present even when nothing matches, so callers
        can index them without special-casing an empty result.
    """
    columns = ['doc_id', 'repo_id', 'path', 'language', 'type', 'embedding']

    # int() guarantees that only a number is spliced into the LIMIT clause.
    query = f"""
        SELECT
            META().id as doc_id,
            repo_id,
            CASE
                WHEN type = 'file_index' THEN file_path
                WHEN type = 'symbol_index' THEN file_path
                WHEN type = 'module_summary' THEN module_path
                ELSE repo_id
            END as path,
            CASE
                WHEN type = 'file_index' THEN metadata.language
                WHEN type = 'symbol_index' THEN metadata.language
                ELSE 'summary'
            END as language,
            embedding
        FROM `code_kosha`
        WHERE type = '{doc_type}'
          AND embedding IS NOT NULL
        LIMIT {int(limit)}
    """

    rows = []
    for r in cb.cluster.query(query):
        vector = r.get('embedding')
        if not vector:
            # Skip rows whose embedding is absent or empty.
            continue
        rows.append({
            'doc_id': r['doc_id'],
            # A projected field can be absent from the row; record None
            # instead of aborting the whole fetch with a KeyError.
            'repo_id': r.get('repo_id'),
            'path': r.get('path'),
            # `or` also covers an explicit null, which .get()'s default
            # would let through as None.
            'language': r.get('language') or 'unknown',
            'type': doc_type,
            'embedding': np.array(vector),
        })

    return pd.DataFrame(rows, columns=columns)

# Pull every document type, each with its own row cap
print("Fetching embeddings...")
df_file = fetch_embeddings('file_index', limit=3000)
df_symbol = fetch_embeddings('symbol_index', limit=3000)
df_module = fetch_embeddings('module_summary', limit=1000)
df_repo = fetch_embeddings('repo_summary', limit=500)

# Report how many rows came back for each type
frames_by_type = {
    'file_index': df_file,
    'symbol_index': df_symbol,
    'module_summary': df_module,
    'repo_summary': df_repo,
}
for type_name, frame in frames_by_type.items():
    print(f"{type_name}: {len(frame)}")

# Stack all types into one frame with a fresh 0..n-1 index
df_all = pd.concat(list(frames_by_type.values()), ignore_index=True)
print(f"\nTotal: {len(df_all)} documents with embeddings")

In [None]:
# Stack the per-document vectors row-wise into a single matrix
embeddings = np.vstack(df_all['embedding'].tolist())
embed_dim = embeddings.shape[1]
print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding dim: {embed_dim}")

## 2. PCA Analysis - Variance Explained

In [None]:
# PCA to understand dimensionality
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the request on both axes. A fixed random_state keeps the result repeatable
# when scikit-learn picks its randomized solver.
n_components = min(100, *embeddings.shape)
pca_full = PCA(n_components=n_components, random_state=42)
pca_full.fit(embeddings)

variance_ratio = pca_full.explained_variance_ratio_
cumvar = np.cumsum(variance_ratio)
# 1-based counts so the x axis really is "number of components".
component_counts = np.arange(1, len(cumvar) + 1)


def _components_for(cumulative, threshold):
    """Return the smallest component count reaching `threshold`, else None."""
    reached = cumulative >= threshold
    # argmax of an all-False mask is 0, which would wrongly report 1 component.
    if not reached.any():
        return None
    return int(np.argmax(reached)) + 1


# Plot variance explained
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Cumulative variance
axes[0].plot(component_counts, cumvar, 'b-', linewidth=2)
axes[0].axhline(y=0.9, color='r', linestyle='--', label='90% variance')
axes[0].axhline(y=0.95, color='orange', linestyle='--', label='95% variance')
axes[0].set_xlabel('Number of Components')
axes[0].set_ylabel('Cumulative Explained Variance')
axes[0].set_title('PCA: Cumulative Variance Explained')
axes[0].legend()
axes[0].grid(True)

# Find 90% and 95% thresholds (None when the fitted components fall short)
n_90 = _components_for(cumvar, 0.9)
n_95 = _components_for(cumvar, 0.95)
not_reached = f">{len(cumvar)}"
print(f"Components for 90% variance: {n_90 if n_90 is not None else not_reached}")
print(f"Components for 95% variance: {n_95 if n_95 is not None else not_reached}")

# Individual variance (first 30, or fewer when fewer components were fitted)
n_show = min(30, len(variance_ratio))
axes[1].bar(range(n_show), variance_ratio[:n_show])
axes[1].set_xlabel('Component')
axes[1].set_ylabel('Explained Variance Ratio')
axes[1].set_title('PCA: Variance per Component (first 30)')

plt.tight_layout()
plt.show()

## 3. Document Type Separation (2D Visualization)

In [None]:
# PCA to 2D for visualization
# Fixed random_state so the projection is the same on every run, matching the
# seed the UMAP cell already uses.
pca_2d = PCA(n_components=2, random_state=42)
embeddings_2d = pca_2d.fit_transform(embeddings)

# Store the coordinates on df_all so later cells can plot subsets of it.
df_all['pca_x'] = embeddings_2d[:, 0]
df_all['pca_y'] = embeddings_2d[:, 1]

# Plot by document type
fig, ax = plt.subplots(figsize=(12, 8))

# Colour per document type; the UMAP cell reuses this mapping.
type_colors = {
    'symbol_index': 'blue',
    'file_index': 'green',
    'module_summary': 'orange',
    'repo_summary': 'red'
}

for doc_type, color in type_colors.items():
    mask = df_all['type'] == doc_type
    ax.scatter(
        df_all.loc[mask, 'pca_x'],
        df_all.loc[mask, 'pca_y'],
        c=color, label=doc_type, alpha=0.5, s=20
    )

pc1_var, pc2_var = pca_2d.explained_variance_ratio_
ax.set_xlabel(f'PC1 ({pc1_var:.1%} var)')
ax.set_ylabel(f'PC2 ({pc2_var:.1%} var)')
ax.set_title('Document Type Separation (PCA)')
ax.legend()
plt.show()

print(f"PC1 + PC2 explain {sum(pca_2d.explained_variance_ratio_):.1%} of variance")

In [None]:
# UMAP projection (usually shows cluster structure more clearly than PCA)
if not HAS_UMAP:
    print("Skipping UMAP (not installed)")
else:
    print("Running UMAP (this may take a minute)...")
    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=30,
        min_dist=0.1,
        random_state=42,
    )
    embeddings_umap = reducer.fit_transform(embeddings)

    # Keep the 2D coordinates on df_all for the later language plot
    df_all['umap_x'] = embeddings_umap[:, 0]
    df_all['umap_y'] = embeddings_umap[:, 1]

    fig, ax = plt.subplots(figsize=(12, 8))

    # One scatter layer per document type, coloured via type_colors
    for doc_type, color in type_colors.items():
        of_type = df_all[df_all['type'] == doc_type]
        ax.scatter(
            of_type['umap_x'],
            of_type['umap_y'],
            c=color, label=doc_type, alpha=0.5, s=20
        )

    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('Document Type Separation (UMAP)')
    ax.legend()
    plt.show()

## 4. Language Clustering

In [None]:
# Only file_index and symbol_index rows carry a real language value
code_types = ['file_index', 'symbol_index']
df_code = df_all.loc[df_all['type'].isin(code_types)].copy()

# Keep the eight most frequent languages
top_langs = df_code['language'].value_counts().index[:8].tolist()
df_code_top = df_code.loc[df_code['language'].isin(top_langs)]

print(f"Top languages: {top_langs}")
print(f"Documents: {len(df_code_top)}")

# Scatter the PCA coordinates, one colour per language
fig, ax = plt.subplots(figsize=(12, 8))

# Evenly spaced samples from the tab10 colormap; reused by the UMAP language plot
lang_colors = plt.cm.tab10(np.linspace(0, 1, len(top_langs)))

for lang, color in zip(top_langs, lang_colors):
    lang_rows = df_code_top[df_code_top['language'] == lang]
    ax.scatter(
        lang_rows['pca_x'],
        lang_rows['pca_y'],
        c=[color], label=lang, alpha=0.5, s=20
    )

ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_title('Language Clustering (PCA)')
ax.legend()
plt.show()

In [None]:
# UMAP by language
# The guard checks df_all, so read the coordinates from df_all as well.
# df_code_top is a filtered copy of df_all and only has the umap_* columns if
# it was created after the UMAP cell ran; its index labels still match df_all.
if HAS_UMAP and 'umap_x' in df_all.columns:
    fig, ax = plt.subplots(figsize=(12, 8))

    for lang, color in zip(top_langs, lang_colors):
        lang_idx = df_code_top.index[df_code_top['language'] == lang]
        ax.scatter(
            df_all.loc[lang_idx, 'umap_x'],
            df_all.loc[lang_idx, 'umap_y'],
            c=[color], label=lang, alpha=0.5, s=20
        )

    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')
    ax.set_title('Language Clustering (UMAP)')
    ax.legend()
    plt.show()
else:
    print("Skipping UMAP language plot (no UMAP coordinates available)")

## 5. Intra-Repo vs Inter-Repo Similarity

In [None]:
from itertools import combinations

from sklearn.metrics.pairwise import cosine_similarity

# Seeded generator so the sampled files, and therefore the reported
# similarity statistics, are the same on every run.
rng = np.random.default_rng(42)

# Sample repos with enough documents
repo_counts = df_file['repo_id'].value_counts()
sample_repos = repo_counts[repo_counts >= 20].head(10).index.tolist()

print(f"Analyzing {len(sample_repos)} repos with 20+ files each")

# Stack each sampled repo's file embeddings once; both loops below reuse them
# instead of re-filtering df_file for every repo pair.
repo_matrices = {
    repo: np.vstack(df_file.loc[df_file['repo_id'] == repo, 'embedding'].values)
    for repo in sample_repos
}

intra_sims = []  # Similarities within same repo
inter_sims = []  # Similarities across repos

for repo in sample_repos:
    repo_embeds = repo_matrices[repo]

    # Intra-repo: pairwise within repo (at most 50 files to limit computation)
    if len(repo_embeds) > 50:
        idx = rng.choice(len(repo_embeds), 50, replace=False)
        repo_embeds = repo_embeds[idx]

    sim_matrix = cosine_similarity(repo_embeds)
    # Upper triangle without the diagonal: each unordered pair exactly once,
    # and no self-similarity entries.
    triu_idx = np.triu_indices(len(sim_matrix), k=1)
    intra_sims.extend(sim_matrix[triu_idx].tolist())

# Inter-repo: every unordered pair among the first six sampled repos,
# comparing the first 20 files of each.
for repo1, repo2 in combinations(sample_repos[:6], 2):
    cross_sim = cosine_similarity(repo_matrices[repo1][:20], repo_matrices[repo2][:20])
    inter_sims.extend(cross_sim.ravel().tolist())

print(f"Intra-repo pairs: {len(intra_sims)}")
print(f"Inter-repo pairs: {len(inter_sims)}")

In [None]:
# Plot distributions
fig, ax = plt.subplots(figsize=(10, 6))

# Compute each statistic once and reuse it in the legend and the printout.
intra_mean, intra_std = np.mean(intra_sims), np.std(intra_sims)
inter_mean, inter_std = np.mean(inter_sims), np.std(inter_sims)

# density=True normalises both histograms, so the differing pair counts do
# not distort the comparison.
ax.hist(intra_sims, bins=50, alpha=0.7, label=f'Intra-repo (mean={intra_mean:.3f})', density=True)
ax.hist(inter_sims, bins=50, alpha=0.7, label=f'Inter-repo (mean={inter_mean:.3f})', density=True)

ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('Density')
ax.set_title('File Similarity: Within Repo vs Across Repos')
ax.legend()
plt.show()

# The "±" sign was mis-encoded in these messages; print the real character.
print(f"\nIntra-repo similarity: {intra_mean:.3f} ± {intra_std:.3f}")
print(f"Inter-repo similarity: {inter_mean:.3f} ± {inter_std:.3f}")
print(f"Separation: {intra_mean - inter_mean:.3f}")

## 6. Hierarchy Validation (Symbol → File → Module)

In [None]:
# Check if symbols are closer to their parent file than to random files

# Seeded so the sampled symbols and the random baseline are repeatable.
rng = np.random.default_rng(42)

# Build file embedding lookup.
# Keyed by (repo_id, path): nothing here guarantees a path value is unique
# across repos, and keying by path alone would let a symbol match a file
# with the same path in a different repo.
file_embeds = {
    (f.repo_id, f.path): f.embedding
    for f in df_file.itertuples(index=False)
}
file_keys = list(file_embeds)
random_files = list(file_embeds.values())

# For each symbol, compute similarity to its file vs random files
symbol_to_own_file = []
symbol_to_random_file = []

sample_symbols = df_symbol.sample(min(500, len(df_symbol)), random_state=42)

for sym in sample_symbols.itertuples(index=False):
    own_key = (sym.repo_id, sym.path)
    own_file_embed = file_embeds.get(own_key)
    if own_file_embed is None:
        # The parent file is not among the fetched file_index rows.
        continue

    sym_embed = sym.embedding.reshape(1, -1)

    # Similarity to own file
    own_sim = cosine_similarity(sym_embed, own_file_embed.reshape(1, -1))[0, 0]
    symbol_to_own_file.append(own_sim)

    # Similarity to random file: redraw if the pick is the symbol's own file
    # (possible only when another file exists), so the baseline is not
    # inflated by self-matches.
    rand_idx = int(rng.integers(len(random_files)))
    if len(random_files) > 1:
        while file_keys[rand_idx] == own_key:
            rand_idx = int(rng.integers(len(random_files)))
    rand_embed = random_files[rand_idx].reshape(1, -1)
    rand_sim = cosine_similarity(sym_embed, rand_embed)[0, 0]
    symbol_to_random_file.append(rand_sim)

print(f"Computed {len(symbol_to_own_file)} symbol-file pairs")

In [None]:
# Plot
fig, ax = plt.subplots(figsize=(10, 6))

# Compute each statistic once and reuse it in the legend and the printout.
own_mean, own_std = np.mean(symbol_to_own_file), np.std(symbol_to_own_file)
rand_mean, rand_std = np.mean(symbol_to_random_file), np.std(symbol_to_random_file)

# The "→" and "±" characters were mis-encoded in these labels and messages;
# use the real characters.
ax.hist(symbol_to_own_file, bins=50, alpha=0.7,
        label=f'Symbol → Own File (mean={own_mean:.3f})', density=True)
ax.hist(symbol_to_random_file, bins=50, alpha=0.7,
        label=f'Symbol → Random File (mean={rand_mean:.3f})', density=True)

ax.set_xlabel('Cosine Similarity')
ax.set_ylabel('Density')
ax.set_title('Hierarchy Validation: Symbol to File Similarity')
ax.legend()
plt.show()

print(f"\nSymbol → Own file: {own_mean:.3f} ± {own_std:.3f}")
print(f"Symbol → Random file: {rand_mean:.3f} ± {rand_std:.3f}")
print(f"Hierarchy coherence: {own_mean - rand_mean:.3f}")

## 7. Cross-Repo Similarity Matrix

In [None]:
# One centroid per repo: the mean of that repo's file embeddings
repo_centroids = {
    repo: np.vstack(
        df_file.loc[df_file['repo_id'] == repo, 'embedding'].values
    ).mean(axis=0)
    for repo in df_file['repo_id'].unique()
}

# Visualise only the 15 repos with the most files
top_repos = repo_counts.index[:15].tolist()
centroid_matrix = np.vstack([repo_centroids[name] for name in top_repos])

# Pairwise cosine similarity between the centroids
repo_sim_matrix = cosine_similarity(centroid_matrix)

# Tick labels: last path segment of the repo id, cut to 15 characters
short_names = [name.rsplit('/', 1)[-1][:15] for name in top_repos]

# Annotated heatmap
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    repo_sim_matrix,
    xticklabels=short_names,
    yticklabels=short_names,
    cmap='RdYlBu_r',
    center=0.5,
    annot=True,
    fmt='.2f',
    ax=ax,
)
ax.set_title('Repository Similarity Matrix (File Centroids)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 8. Summary Statistics

In [None]:
# Final report built from the values computed in the earlier cells.
# The emoji, "→" and "±" characters in these messages were mis-encoded;
# they are written here as the real characters.
banner = "=" * 60
print(banner)
print("EMBEDDING ANALYSIS SUMMARY")
print(banner)

print("\n📊 Data Overview:")
print(f"   Total documents: {len(df_all):,}")
print(f"   Embedding dimension: {embeddings.shape[1]}")
print(f"   Unique repos: {df_all['repo_id'].nunique()}")

print("\n📈 PCA Analysis:")
print(f"   Components for 90% variance: {n_90}")
print(f"   Components for 95% variance: {n_95}")
print(f"   First 2 PCs explain: {sum(pca_2d.explained_variance_ratio_):.1%}")

# Compute each mean once; the difference lines reuse them.
intra_mean = np.mean(intra_sims)
inter_mean = np.mean(inter_sims)
print("\n🔗 Similarity Analysis:")
print(f"   Intra-repo file similarity: {intra_mean:.3f}")
print(f"   Inter-repo file similarity: {inter_mean:.3f}")
print(f"   Repo coherence (diff): {intra_mean - inter_mean:.3f}")

own_mean = np.mean(symbol_to_own_file)
rand_mean = np.mean(symbol_to_random_file)
print("\n🏗️ Hierarchy Validation:")
print(f"   Symbol → Own file: {own_mean:.3f}")
print(f"   Symbol → Random file: {rand_mean:.3f}")
print(f"   Hierarchy coherence: {own_mean - rand_mean:.3f}")

print("\n" + banner)