# MATLAB Spectral Clustering Validation

This notebook validates the `fitkit.community.SpectralCluster` implementation against the original MATLAB reference code from Sanguinetti, Laidler & Lawrence (2005).

## Reference

MATLAB demos: `~/lawrennd/spectral/matlab/`
- `demoCircles.m` - Three concentric circles
- `demoShapes.m` - Image segmentation

## Key Implementation Details

The `SpectralCluster` class uses:
- Gaussian affinity: $A_{ij} = \exp(-\|x_i - x_j\|^2 / (2\sigma^2))$
- Normalized affinity matrix (often loosely called the normalized Laplacian): $L = D^{-1/2} A D^{-1/2}$, with $D$ the diagonal degree matrix of $A$
- Iterative eigenvector selection with elongated k-means
- Origin detector to automatically determine cluster count

**Note on the sigma parameter**: Per CIP-0007, sigma must scale with dataset density. For the 300-point circles demo, we use sigma=0.158, the equivalent of MATLAB's sigma=0.05 under the Gaussian affinity defined above.

In [None]:
# Numerical / plotting stack used throughout the notebook.
# NOTE(review): cdist and eigh are not referenced by the cells visible here.
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from scipy.linalg import eigh
import sys
# Put the parent directory first on the module search path before the fitkit
# import below — presumably so the in-repo package is picked up; confirm layout.
sys.path.insert(0, '..')

# Implementation under validation.
from fitkit.community import SpectralCluster

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default (width, height) in inches for figures that do not pass their own figsize.
plt.rcParams['figure.figsize'] = (15, 5)

## Experiment 1: Three Concentric Circles (demoCircles.m)

Generate three concentric circles with noise and automatically detect the number of clusters.

In [None]:
# Synthetic benchmark: three noisy concentric circles, 100 points apiece
# (matching the MATLAB demo).
np.random.seed(42)
npts = 100
angle = np.linspace(0, 2*np.pi, npts, endpoint=False)
radius_noise = 0.1 * np.random.randn(npts)

# A single noise vector perturbs all three radii (1, 2 and 3), like MATLAB.
r1, r2, r3 = (base_radius + radius_noise for base_radius in (1, 2, 3))

# Polar -> Cartesian: scale the unit-circle directions by each noisy radius.
unit_circle = np.column_stack([np.cos(angle), np.sin(angle)])
circle1, circle2, circle3 = (
    noisy_radius[:, np.newaxis] * unit_circle for noisy_radius in (r1, r2, r3)
)

# Stack the rings into one array; labels follow the stacking order.
x_circles = np.vstack([circle1, circle2, circle3])
true_labels = np.repeat([0, 1, 2], npts)

print(f"Dataset shape: {x_circles.shape}")
print("Expected clusters: 3")

In [None]:
# Scatter plot of the raw points, coloured by the circle each one was drawn from.
fig_truth, ax_truth = plt.subplots(figsize=(6, 6))
truth_points = ax_truth.scatter(
    x_circles[:, 0], x_circles[:, 1], c=true_labels, cmap='tab10', s=20
)
ax_truth.set_title('Three Concentric Circles (Ground Truth)')
ax_truth.axis('equal')
fig_truth.colorbar(truth_points, ax=ax_truth, label='True Cluster')
plt.show()

### Run Spectral Clustering

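Use `SpectralCluster` with sigma=0.158. This is equivalent to MATLAB's sigma=0.05 in their exponential formula: with the affinity $\exp(-\|x_i - x_j\|^2 / (2\sigma^2))$ used here, $2\sigma^2 = 2 \times 0.158^2 \approx 0.05$.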

In [None]:
# Fit the clusterer on the three-circles data, then report what it found.
print("Running SpectralCluster...\n")
clf_circles = SpectralCluster(
    sigma=0.158,        # converted from MATLAB sigma=0.05
    lambda_=0.2,        # elongation parameter
    max_clusters=10,    # upper bound on the number of clusters considered
    random_state=1,
    verbose=True,
)
clf_circles.fit(x_circles)

banner = '=' * 60
n_found = clf_circles.n_clusters_

print("\n" + banner)
print("RESULT: Three Circles")
print(banner)
print(f"Detected clusters: {n_found}")
print("Expected clusters: 3")
# The check compares only the cluster count against the expected 3.
print("Status: " + ('✓ PASS' if n_found == 3 else '✗ FAIL'))
print(f"\nCluster sizes: {np.bincount(clf_circles.labels_)}")
print(f"Eigenvectors used: {clf_circles.eigenvectors_.shape[1]}")
print(banner + "\n")

In [None]:
# Three panels side by side: ground truth, predicted labels, spectral embedding.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
panel_truth, panel_pred, panel_eig = axes
xs, ys = x_circles[:, 0], x_circles[:, 1]

# Panel 1: points coloured by the circle they were generated from.
panel_truth.scatter(xs, ys, c=true_labels, cmap='tab10', s=20)
panel_truth.set_title('Ground Truth (3 circles)', fontweight='bold')

# Panel 2: points coloured by predicted label; the title turns green only
# when exactly three clusters were detected.
panel_pred.scatter(xs, ys, c=clf_circles.labels_, cmap='tab10', s=20)
panel_pred.set_title(
    f'Predicted ({clf_circles.n_clusters_} clusters)',
    fontweight='bold',
    color='green' if clf_circles.n_clusters_ == 3 else 'red',
)

# Both data-space panels get equal aspect scaling and a light grid.
for panel in (panel_truth, panel_pred):
    panel.axis('equal')
    panel.grid(True, alpha=0.3)

# Panel 3: first two columns of the fitted eigenvectors, with cluster centers overlaid.
embedding = clf_circles.eigenvectors_
centers = clf_circles.cluster_centers_
panel_eig.scatter(embedding[:, 0], embedding[:, 1],
                  c=clf_circles.labels_, cmap='tab10', s=20)
panel_eig.scatter(centers[:, 0], centers[:, 1],
                  c='black', marker='D', s=100, edgecolors='white', linewidths=2,
                  label='Centers')
panel_eig.set_xlabel('Eigenvector 1')
panel_eig.set_ylabel('Eigenvector 2')
panel_eig.set_title('Eigenspace Projection', fontweight='bold')
panel_eig.legend()
panel_eig.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Cluster Purity Analysis

In [None]:
from collections import Counter

# For every predicted cluster, report how many points it holds and what share
# of them come from its dominant ground-truth circle (its purity).
print("Cluster Purity Analysis:")
print("="*60)
for pred_label in range(clf_circles.n_clusters_):
    mask = clf_circles.labels_ == pred_label
    true_in_cluster = true_labels[mask]
    if len(true_in_cluster) == 0:
        # A label with no assigned points would make most_common(1) return []
        # (IndexError on [0]) and the purity ratio divide by zero; report it
        # explicitly instead of aborting the analysis.
        print(f"Predicted cluster {pred_label}:   0 points (empty cluster)")
        continue
    # (true label, count) of the ground-truth circle that dominates this cluster.
    most_common = Counter(true_in_cluster).most_common(1)[0]
    purity = most_common[1] / len(true_in_cluster) * 100
    print(f"Predicted cluster {pred_label}: {len(true_in_cluster):3d} points, "
          f"{purity:5.1f}% from true cluster {most_common[0]}")
print("="*60)

## Experiment 2: Smaller Dataset (Sigma Scaling)

Test with a smaller dataset (50 points per circle) to verify sigma parameter scaling per CIP-0007.

In [None]:
# Sparser variant of the benchmark: the same three rings with 50 points each.
np.random.seed(42)
npts_small = 50
angle_small = np.linspace(0, 2*np.pi, npts_small, endpoint=False)
radius_noise_small = 0.1 * np.random.randn(npts_small)

# The same noise vector is added to each nominal radius (1, 2 and 3).
r1_small, r2_small, r3_small = (
    base_radius + radius_noise_small for base_radius in (1, 2, 3)
)

# Polar -> Cartesian via the shared unit-circle directions.
unit_circle_small = np.column_stack([np.cos(angle_small), np.sin(angle_small)])
circle1_small, circle2_small, circle3_small = (
    noisy_radius[:, np.newaxis] * unit_circle_small
    for noisy_radius in (r1_small, r2_small, r3_small)
)

# Stack the rings into one array; labels follow the stacking order.
x_small = np.vstack([circle1_small, circle2_small, circle3_small])
true_labels_small = np.repeat([0, 1, 2], npts_small)

print(f"Small dataset shape: {x_small.shape}")
print("Expected clusters: 3")

In [None]:
# Fit on the 150-point dataset with the larger sigma that CIP-0007 reports as
# working at this size (sigma=0.2).
print("Running SpectralCluster on smaller dataset...\n")
clf_small = SpectralCluster(
    sigma=0.2,          # raised to compensate for the lower point density
    lambda_=0.2,
    max_clusters=10,
    random_state=1,
    verbose=True,
)
clf_small.fit(x_small)

banner_small = '=' * 60
n_found_small = clf_small.n_clusters_

print("\n" + banner_small)
print("RESULT: Smaller Dataset (sigma scaling test)")
print(banner_small)
print(f"Detected clusters: {n_found_small}")
print("Expected clusters: 3")
# The check compares only the cluster count against the expected 3.
print("Status: " + ('✓ PASS' if n_found_small == 3 else '✗ FAIL'))
print(f"\nCluster sizes: {np.bincount(clf_small.labels_)}")
print(banner_small + "\n")

## Summary

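Validation against the MATLAB reference implementation. Each test passes when the detected number of clusters equals the expected 3; the quality of the cluster assignments is examined separately in the purity analysis for Experiment 1.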

In [None]:
# Final report: one entry per experiment as (heading, fitted model, optional note).
summary_rule = "=" * 60
summary_cases = [
    ("1. THREE CIRCLES (300 points, sigma=0.158)", clf_circles, None),
    ("2. SMALLER DATASET (150 points, sigma=0.2)", clf_small,
     "Demonstrates sigma scaling from CIP-0007"),
]

print("\n" + summary_rule)
print("VALIDATION SUMMARY")
print(summary_rule)

for case_title, fitted, case_note in summary_cases:
    print("\n" + case_title)
    print("   Expected: 3 clusters")
    print(f"   Detected: {fitted.n_clusters_} clusters")
    print("   Status: " + ('✓ PASS' if fitted.n_clusters_ == 3 else '✗ FAIL'))
    if case_note is not None:
        print(f"   Note: {case_note}")

# Overall verdict: every experiment must have detected exactly three clusters.
summary_ok = all(fitted.n_clusters_ == 3 for _, fitted, _ in summary_cases)
print("\n" + summary_rule)
print("✓ All tests passed!" if summary_ok else "✗ Some tests failed")
print(summary_rule + "\n")