# The Voice Codex — Exploratory Analysis

This notebook walks through:
1. Loading a pre-built feature archive (`features.npz`)
2. Sanity-checking it with `verify.py`
3. Building a labelled DataFrame
4. Descriptive statistics
5. Correlation heatmap (scalar features)
6. F0 distribution plots
7. MFCC mean profiles
8. PCA projection
9. Exporting the labelled DataFrame to CSV

Run `aggregate.py` first to generate `features.npz`.

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from aggregate import load_npz
from verify import verify
from extract import N_MFCC

%matplotlib inline
# Global plot styling: seaborn's white-grid look with the muted colour cycle,
# followed by a higher figure resolution for the rendered plots.
sns.set_theme(palette='muted', style='whitegrid')
plt.rc('figure', dpi=120)

## 1. Load features

In [None]:
NPZ_PATH = Path('features.npz')  # adjust if needed

# Load the archive when it is present; otherwise stop immediately and tell the
# reader which command produces it.
if NPZ_PATH.exists():
    features, files = load_npz(NPZ_PATH)
else:
    raise FileNotFoundError(
        f'{NPZ_PATH} not found.\n'
        'Run:  python aggregate.py <audio_dir> --output features.npz'
    )

print(f'Loaded {features.shape[0]} samples × {features.shape[1]} features')

## 2. Verification report

In [None]:
# Run the project's verify() over the loaded matrix (with the matching file
# list) and show its textual summary.
report = verify(features, files=files)
report_text = report.summary()
print(report_text)

## 3. Build a labelled DataFrame

In [None]:
# Column schema for the feature matrix: 13 scalar descriptors followed by four
# blocks of N_MFCC columns (MFCC mean / std, then delta-MFCC mean / std).
# NOTE(review): the order is assumed to mirror the layout written by the
# extraction pipeline — confirm whenever the extractor changes.
SCALAR_NAMES = [
    'f0_mean', 'f0_std', 'f0_min', 'f0_max', 'f0_voiced_frac',
    'hnr_mean', 'rms_mean', 'rms_std',
    'centroid_mean', 'centroid_std', 'bandwidth_mean', 'rolloff_mean', 'zcr_mean',
]
mfcc_mean_names  = [f'mfcc_mean_{i}'  for i in range(N_MFCC)]
mfcc_std_names   = [f'mfcc_std_{i}'   for i in range(N_MFCC)]
delta_mean_names = [f'dmfcc_mean_{i}' for i in range(N_MFCC)]
delta_std_names  = [f'dmfcc_std_{i}'  for i in range(N_MFCC)]

all_names = SCALAR_NAMES + mfcc_mean_names + mfcc_std_names + delta_mean_names + delta_std_names

# Fail early, with an actionable message, when the archive does not match the
# schema above (e.g. a stale features.npz or a changed N_MFCC) instead of
# relying on the generic shape error pandas would raise.
if len(features.shape) != 2 or features.shape[1] != len(all_names):
    raise ValueError(
        f'Feature matrix has shape {features.shape}, but the column schema '
        f'defines {len(all_names)} names (N_MFCC={N_MFCC}). '
        'Regenerate features.npz with aggregate.py or update the schema.'
    )
if len(files) != features.shape[0]:
    raise ValueError(
        f'Got {len(files)} file names for {features.shape[0]} feature rows; '
        'the archive is inconsistent.'
    )

df = pd.DataFrame(features, columns=all_names)
# Keep only the basename so the table stays readable.
df.insert(0, 'file', [Path(f).name for f in files])
df.head()

## 4. Descriptive statistics

In [None]:
# One row per scalar feature, one column per summary statistic; the colour
# gradient is applied within each row.
scalar_summary = df[SCALAR_NAMES].describe().T
scalar_summary.style.background_gradient(cmap='YlOrRd', axis=1)

## 5. Correlation heatmap (scalar features)

In [None]:
corr = df[SCALAR_NAMES].corr()

# The matrix is symmetric, so mask the upper triangle (diagonal included) and
# draw only the cells below it.
mask = np.triu(np.ones(corr.shape, dtype=bool))

heatmap_kwargs = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, ax=ax, **heatmap_kwargs)
ax.set_title('Feature Correlation (scalar features)', fontsize=13)
fig.tight_layout()
plt.show()

## 6. F0 distribution

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_f0, ax_voiced = axes

# Left panel: per-file mean F0, keeping strictly positive values only
# (presumably 0 marks files without voiced frames — confirm against the extractor).
f0_nonzero = df.loc[df['f0_mean'] > 0, 'f0_mean']
sns.histplot(f0_nonzero, bins=30, kde=True, ax=ax_f0)
ax_f0.set(xlabel='F0 mean (Hz)', title='F0 Mean Distribution')

# Right panel: voiced-frame fraction for every file.
sns.histplot(df['f0_voiced_frac'], bins=20, kde=True, ax=ax_voiced)
ax_voiced.set(xlabel='Voiced fraction', title='Voiced Frame Fraction')

fig.tight_layout()
plt.show()

## 7. MFCC mean profiles

In [None]:
mfcc_data = df[mfcc_mean_names].to_numpy()   # (N, N_MFCC)

# Corpus-level profile: average of the per-file MFCC means and its spread,
# computed once per coefficient.
coef_idx = range(N_MFCC)
profile_mean = mfcc_data.mean(axis=0)
profile_std = mfcc_data.std(axis=0)

fig, ax = plt.subplots(figsize=(10, 4))
ax.fill_between(
    coef_idx,
    profile_mean - profile_std,
    profile_mean + profile_std,
    alpha=0.3, label='±1 std'
)
ax.plot(coef_idx, profile_mean, marker='o', label='mean')
ax.set_xticks(coef_idx)
ax.set_xticklabels([f'C{i}' for i in coef_idx])
ax.set(
    xlabel='MFCC coefficient',
    ylabel='Value',
    title='MFCC Mean Profile (corpus average ± std)',
)
ax.legend()
fig.tight_layout()
plt.show()

## 8. PCA — 2D projection of all features

In [None]:
# Every column except the filename label feeds the projection.
numeric_cols = [c for c in df.columns if c != 'file']
X = df[numeric_cols].values

# Standardise so that features on large numeric scales do not dominate the
# principal components.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA cannot extract more components than min(n_samples, n_features); bounding
# by the feature count alone fails on corpora with fewer than 10 files.
n_components = min(10, *X_scaled.shape)
if n_components < 2:
    # The scatter plot and its PC2 axis label need two components.
    raise ValueError(
        'PCA projection needs at least 2 samples and 2 features; '
        f'got a matrix of shape {X_scaled.shape}.'
    )

pca = PCA(n_components=n_components)
pca.fit(X_scaled)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Explained variance
axes[0].bar(range(1, len(pca.explained_variance_ratio_) + 1),
            np.cumsum(pca.explained_variance_ratio_) * 100)
axes[0].axhline(90, color='red', linestyle='--', label='90 %')
axes[0].set_xlabel('Components')
axes[0].set_ylabel('Cumulative variance explained (%)')
axes[0].set_title('PCA — Cumulative Variance')
axes[0].legend()

# 2-D scatter of the first two principal components
X2 = pca.transform(X_scaled)[:, :2]
axes[1].scatter(X2[:, 0], X2[:, 1], alpha=0.6, edgecolors='k', linewidths=0.3, s=40)
axes[1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]*100:.1f} %)')
axes[1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]*100:.1f} %)')
axes[1].set_title('PCA 2-D Projection')

plt.tight_layout()
plt.show()

## 9. Export labelled DataFrame

In [None]:
# Write the labelled table (file name + named feature columns) next to the
# notebook, without the pandas row index.
out_csv = 'features_labelled.csv'
df.to_csv(path_or_buf=out_csv, index=False)
print(f'Saved {out_csv}  ({df.shape[0]} rows × {df.shape[1]} cols)')