# The Voice Codex — Essence Analysis

This notebook analyses the 82-dim **Voice Essence** vectors extracted from `voice_codex_dataset`.

### Vector layout (ESSENCE_DIM = 82)

| Slice | Segment | Dims | Description |
|-------|---------|------|--------------|
| [0:5] | F0 stats | 5 | Pitch mean, std, min, max (Hz); voiced fraction |
| [5:8] | F0 trajectory | 3 | Slope (Hz/s); vibrato rate (Hz); vibrato depth (cents RMS) |
| [8:21] | MFCC mean | 13 | Vocal tract shape — temporal means |
| [21:34] | MFCC std | 13 | Vocal tract shape — temporal stds |
| [34:47] | Delta-MFCC mean | 13 | Rate of vocal tract change — means |
| [47:60] | Delta-MFCC std | 13 | Rate of vocal tract change — stds |
| [60:66] | Formants F1–F3 | 6 | Resonance frequencies — interleaved (mean, std) pairs in Hz: F1 mean, F1 std, F2 mean, F2 std, F3 mean, F3 std |
| [66:68] | HNR | 2 | Harmonics-to-noise ratio — mean & std (dB) |
| [68:72] | Amplitude | 4 | RMS envelope — mean, std, skewness, excess kurtosis |
| [72:78] | Spectral | 6 | Centroid mean/std, bandwidth mean, rolloff mean, ZCR mean/std |
| [78:82] | Onset | 4 | Onset rate, strength mean/std, mean inter-onset interval (s) |

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

from extract import ESSENCE_LAYOUT, ESSENCE_DIM

%matplotlib inline
sns.set_theme(style='whitegrid', palette='tab10')
plt.rcParams['figure.dpi'] = 120

## 1. Load essences

In [None]:
# Location of the pre-extracted essence archive (relative to the notebook).
ESSENCE_PATH = Path('essences.npz')

# allow_pickle=False makes np.load refuse object arrays instead of unpickling them.
data   = np.load(ESSENCE_PATH, allow_pickle=False)
X      = data['X']        # (n_clips, ESSENCE_DIM)
labels = data['labels']   # speaker IDs
files  = data['files']

# Every later cell addresses features by hard-coded column number and selects
# rows with `labels == speaker`, so fail loudly here if the archive does not
# match the layout this notebook was written for.
assert X.ndim == 2 and X.shape[1] == ESSENCE_DIM, (
    f'Expected X with shape (n_clips, {ESSENCE_DIM}), got {X.shape}'
)
assert len(labels) == len(X), (
    f'Need one speaker label per row of X: {len(labels)} labels vs {len(X)} rows'
)

# Stable speaker ordering shared by every per-speaker plot below.
speakers     = sorted(set(labels))
n_speakers   = len(speakers)
spk_idx      = {s: i for i, s in enumerate(speakers)}
color_ids    = np.array([spk_idx[l] for l in labels])
# One RGBA colour per speaker, sampled evenly across the tab20 colormap.
palette      = plt.cm.tab20(np.linspace(0, 1, n_speakers))

print(f'Shape: {X.shape}  |  Speakers: {n_speakers}  |  All finite: {np.all(np.isfinite(X))}')

## 2. Sanity check: intra- vs inter-speaker distance

In [None]:
# Z-score every dimension so that no single feature's scale dominates the distance.
scaler = StandardScaler()
Xs     = scaler.fit_transform(X)

# Visit each unordered clip pair (i < j) exactly once and file its Euclidean
# distance under "same speaker" or "different speaker".
n_clips = len(Xs)
same_speaker = labels[:, None] == labels[None, :]
intra, inter = [], []
for i in range(n_clips):
    for j in range(i + 1, n_clips):
        d = np.linalg.norm(Xs[i] - Xs[j])
        if same_speaker[i, j]:
            intra.append(d)
        else:
            inter.append(d)

# Summary statistics are computed once and reused by the plot and the printout.
intra_mu, inter_mu = np.mean(intra), np.mean(inter)
intra_sd, inter_sd = np.std(intra), np.std(inter)

fig, ax = plt.subplots(figsize=(8, 4))
for dists, n_bins, caption in (
    (intra, 20, f'Intra-speaker  μ={intra_mu:.2f}'),
    (inter, 40, f'Inter-speaker  μ={inter_mu:.2f}'),
):
    ax.hist(dists, bins=n_bins, alpha=0.6, label=caption)
ax.set_xlabel('Euclidean distance (standardised)')
ax.set_title(f'Speaker separation  |  inter/intra = {inter_mu/intra_mu:.2f}×')
ax.legend()
plt.tight_layout()
plt.show()

print(f'Intra: mean={intra_mu:.3f}  std={intra_sd:.3f}')
print(f'Inter: mean={inter_mu:.3f}  std={inter_sd:.3f}')

## 3. F0 per speaker — pitch identity signature

In [None]:
# Column 0 of the essence vector is the per-clip F0 mean; group it by speaker.
f0_means = {}
for spk in speakers:
    f0_means[spk] = X[labels == spk, 0]

short_names = [s.replace('speaker_', 'S') for s in speakers]

fig, ax = plt.subplots(figsize=(12, 4))
for i, spk in enumerate(speakers):
    vals = f0_means[spk]
    colour = palette[i]
    # One dot per clip, stacked vertically at the speaker's x position ...
    ax.scatter(np.full(len(vals), i), vals, color=colour, s=60, zorder=3)
    # ... plus a short horizontal bar marking that speaker's mean.
    speaker_mean = vals.mean()
    ax.plot([i - 0.3, i + 0.3], [speaker_mean, speaker_mean], color=colour, lw=2)

ax.set_xticks(range(n_speakers))
ax.set_xticklabels(short_names, rotation=45)
ax.set_ylabel('F0 mean (Hz)')
ax.set_title('Fundamental Frequency per Speaker — each dot is one clip')
plt.tight_layout()
plt.show()

## 4. HNR and Formant F1 comparison

In [None]:
# One panel per feature: (column in X, y-axis label, panel title).
panels = [
    (66, 'HNR mean (dB)', 'Harmonics-to-Noise Ratio per Speaker'),
    (60, 'F1 mean (Hz)',  'Formant F1 per Speaker'),
]
tick_labels = [s.replace('speaker_', 'S') for s in speakers]

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

for ax, (feat_idx, ylabel, title) in zip(axes, panels):
    for i, spk in enumerate(speakers):
        vals = X[labels == spk, feat_idx]
        colour = palette[i]
        # Dots are individual clips; the short bar is the speaker's mean.
        ax.scatter(np.full(len(vals), i), vals, color=colour, s=60, zorder=3)
        speaker_mean = vals.mean()
        ax.plot([i - 0.3, i + 0.3], [speaker_mean, speaker_mean], color=colour, lw=2)
    ax.set_xticks(range(n_speakers))
    ax.set_xticklabels(tick_labels, rotation=45)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 5. MFCC mean profile (vocal tract fingerprint)

In [None]:
# Columns 8..20 of the essence vector hold the 13 MFCC temporal means.
mfcc_slice = slice(8, 21)
n_mfcc = 13
coef_positions = range(n_mfcc)

fig, ax = plt.subplots(figsize=(12, 5))
for i, spk in enumerate(speakers):
    # Average the MFCC-mean segment over all clips of this speaker.
    speaker_clips = X[labels == spk]
    m = speaker_clips[:, mfcc_slice].mean(axis=0)
    ax.plot(
        coef_positions, m,
        color=palette[i], marker='o', lw=1.5, alpha=0.8,
        label=spk.replace('speaker_', 'S'),
    )

ax.set_xticks(coef_positions)
ax.set_xticklabels([f'C{k}' for k in coef_positions])
ax.set_xlabel('MFCC coefficient')
ax.set_ylabel('Mean value')
ax.set_title('MFCC Mean Profile per Speaker  — vocal tract fingerprint')
ax.legend(ncol=5, fontsize=7)
plt.tight_layout()
plt.show()

## 6. PCA — 2-D speaker map

In [None]:
# Number of principal components to keep: up to 10, but never more than the
# data supports (PCA rejects n_components > min(n_samples, n_features)).
# Everything below derives its ranges from n_pc, so the scree plot can never
# disagree with the fitted model.
n_pc = min(10, *Xs.shape)
assert n_pc >= 2, 'Need at least 2 clips and 2 features for a 2-D PCA map'

pca  = PCA(n_components=n_pc)
X2   = pca.fit_transform(Xs)[:, :2]   # first two PCs for the scatter plot

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Scree plot: per-component and cumulative explained variance, in percent.
pc_numbers = range(1, n_pc + 1)
cum = np.cumsum(pca.explained_variance_ratio_) * 100
axes[0].bar(pc_numbers, pca.explained_variance_ratio_ * 100, alpha=0.7)
axes[0].plot(pc_numbers, cum, 'ro-', label='Cumulative')
axes[0].axhline(90, color='gray', linestyle='--', alpha=0.5, label='90 %')
axes[0].set_xlabel('PC'); axes[0].set_ylabel('Variance explained (%)')
axes[0].set_title('PCA Scree Plot'); axes[0].legend()

# 2-D scatter: one colour per speaker, one marker per clip.
for i, spk in enumerate(speakers):
    mask = labels == spk
    axes[1].scatter(X2[mask, 0], X2[mask, 1], color=palette[i],
                    s=80, label=spk.replace('speaker_', 'S'), edgecolors='k', lw=0.4)
axes[1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f} %)')
axes[1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f} %)')
axes[1].set_title('PCA 2-D: each speaker forms a tight cluster')
axes[1].legend(ncol=3, fontsize=7)

plt.tight_layout(); plt.show()

## 7. t-SNE — non-linear speaker map

In [None]:
# t-SNE requires perplexity < n_samples; cap it so a small corpus still runs.
tsne_perplexity = min(10, len(Xs) - 1)

# The iteration count is left at the library default of 1000. The keyword was
# renamed from `n_iter` to `max_iter` in scikit-learn 1.5 and the old name was
# later removed, so passing either name explicitly breaks on some versions.
tsne   = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
X_tsne = tsne.fit_transform(Xs)

fig, ax = plt.subplots(figsize=(9, 7))
for i, spk in enumerate(speakers):
    mask = labels == spk
    ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1], color=palette[i],
               s=100, label=spk.replace('speaker_', 'S'), edgecolors='k', lw=0.4)
    # Write the speaker tag at the centroid of that speaker's clips.
    cx, cy = X_tsne[mask, 0].mean(), X_tsne[mask, 1].mean()
    ax.text(cx, cy, spk.replace('speaker_', 'S'), fontsize=7,
            ha='center', va='center', weight='bold')

ax.set_title('t-SNE projection of 82-dim Voice Essence')
ax.legend(ncol=3, fontsize=7, loc='lower right')
plt.tight_layout(); plt.show()

## 8. Feature-segment importance (variance contribution)

In [None]:
# Xs is z-scored, so its raw per-dimension variance is 1 for every non-constant
# feature (0 for constant ones). Averaging it per segment therefore cannot rank
# segments; it is kept only for reference.
per_dim_var = Xs.var(axis=0)

# Discriminative measure: the fraction of each dimension's variance that is
# explained by speaker identity (between-speaker sum of squares / total sum of
# squares, i.e. eta squared, in [0, 1]).
grand_mean = Xs.mean(axis=0)
total_ss   = ((Xs - grand_mean) ** 2).sum(axis=0)
between_ss = np.zeros(Xs.shape[1])
for spk in speakers:
    spk_rows = Xs[labels == spk]
    between_ss += len(spk_rows) * (spk_rows.mean(axis=0) - grand_mean) ** 2

# Constant dimensions have total_ss == 0; report 0 for them instead of NaN.
per_dim_between = np.divide(between_ss, total_ss,
                            out=np.zeros_like(between_ss), where=total_ss > 0)

# Average the per-dimension fraction within each named segment of the layout.
seg_var = {}
for name, (sl, _) in ESSENCE_LAYOUT.items():
    seg_var[name] = per_dim_between[sl].mean()

names = list(seg_var.keys())
vals  = list(seg_var.values())

fig, ax = plt.subplots(figsize=(10, 4))
bars = ax.bar(names, vals, color=plt.cm.viridis(np.linspace(0.2, 0.85, len(names))))
if len(Xs) > 1:
    # With few clips per speaker the fraction is biased upwards: for i.i.d.
    # noise its expected value is (n_speakers - 1) / (n_clips - 1).
    ax.axhline((n_speakers - 1) / (len(Xs) - 1), color='gray', linestyle='--',
               alpha=0.6, label='Chance level (no speaker effect)')
    ax.legend(fontsize=8)
ax.set_ylabel('Mean between-speaker variance fraction (η²)')
ax.set_title('Feature Segment Discriminative Variance')
ax.tick_params(axis='x', rotation=35)
for bar, v in zip(bars, vals):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
            f'{v:.2f}', ha='center', fontsize=8)
plt.tight_layout(); plt.show()

## 9. Correlation heatmap (scalar segments)

In [None]:
# Hand-picked summary features: (column in X, display name). Covers the leading
# dimensions of the F0, MFCC-mean, formant, HNR, amplitude, spectral and onset
# segments; the MFCC-std and delta-MFCC segments are left out.
selected_features = [
    (0,  'F0 mean'),    (1,  'F0 std'),
    (5,  'F0 slope'),   (6,  'Vibrato rate'),
    (8,  'MFCC0 μ'),    (9,  'MFCC1 μ'),
    (60, 'F1 mean'),    (61, 'F1 std'),
    (62, 'F2 mean'),    (63, 'F2 std'),
    (66, 'HNR mean'),   (67, 'HNR std'),
    (68, 'RMS mean'),   (69, 'RMS std'),
    (72, 'Centroid μ'), (73, 'Centroid σ'),
    (78, 'Onset rate'), (79, 'Onset str μ'),
]
sel_idx   = [col for col, _name in selected_features]
sel_names = [label for _col, label in selected_features]

# Pearson correlation across clips, computed on the raw (unscaled) features.
df_sel = pd.DataFrame(X[:, sel_idx], columns=sel_names)
corr   = df_sel.corr()
# Hide the upper triangle and the diagonal; the matrix is symmetric.
mask   = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(11, 9))
sns.heatmap(
    corr,
    mask=mask,
    ax=ax,
    cmap='RdBu_r', center=0, vmin=-1, vmax=1,
    annot=True, fmt='.2f', annot_kws={'size': 7},
    linewidths=0.4,
)
ax.set_title('Cross-segment Correlation (selected features)')
plt.tight_layout()
plt.show()

## 10. Summary table

In [None]:
def _mean_pm_std(column, decimals):
    """Format a 1-D array as 'mean ± std' with the given number of decimals."""
    return f"{column.mean():.{decimals}f} ± {column.std():.{decimals}f}"


# One record per speaker, aggregated over that speaker's clips.
rows = []
for spk in speakers:
    mask = labels == spk
    v    = X[mask]
    record = {'Speaker': spk}
    record['F0 mean (Hz)']  = _mean_pm_std(v[:, 0], 1)
    record['HNR (dB)']      = _mean_pm_std(v[:, 66], 1)
    record['F1 (Hz)']       = _mean_pm_std(v[:, 60], 0)
    record['F2 (Hz)']       = _mean_pm_std(v[:, 62], 0)
    record['Onset/s']       = f"{v[:, 78].mean():.2f}"
    record['Vibrato depth'] = f"{v[:, 7].mean():.1f} cts"
    rows.append(record)

# Last expression of the cell: rendered as a rich table indexed by speaker.
summary = pd.DataFrame(rows)
summary.set_index('Speaker')