# Clustering Visualization with UMAP

This notebook visualizes article clusters in 2D using UMAP dimensionality reduction.

## Setup

First, install required packages:
```bash
pip install umap-learn matplotlib numpy pandas scikit-learn
```

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from umap import UMAP
from sklearn.manifold import TSNE

# Set plot style
# Applies a named matplotlib style sheet to every figure created below.
# NOTE(review): this style name is matplotlib-version dependent — confirm it
# exists in the installed version.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## Load Data

In [None]:
# Load the exported data.
# The encoding is passed explicitly: JSON text is UTF-8, and relying on the
# platform default can corrupt or reject non-ASCII titles on some systems.
with open('cluster_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

articles = data['articles']
if not articles:
    # Fail early with a clear message; every later cell needs at least one article.
    raise ValueError("cluster_data.json contains no articles; nothing to visualize")

# Extract vectors and labels (parallel sequences, one entry per article)
vectors = np.array([article['vector'] for article in articles])
labels = np.array([article['cluster'] for article in articles])
titles = [article['title'] for article in articles]
feeds = [article['feedTitle'] for article in articles]

print(f"Loaded {len(vectors)} articles")
print(f"Vector dimensions: {vectors.shape[1]}")
print(f"Number of clusters: {len(np.unique(labels))}")
print(f"\nConfiguration: {data['config']}")

## UMAP 2D Visualization

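UMAP (Uniform Manifold Approximation and Projection) is a non-linear dimensionality-reduction method that is well suited to visualizing high-dimensional data, such as the article vectors loaded above, in 2D.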

In [None]:
# Project the article vectors down to two dimensions with UMAP.
print("Applying UMAP dimensionality reduction...")

# Reducer settings gathered in one place; random_state is fixed so that
# re-running the cell uses the same seed.
umap_params = dict(
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
umap_model = UMAP(**umap_params)

# One 2D coordinate pair per input vector.
embeddings_2d = umap_model.fit_transform(vectors)
print("✓ UMAP complete")

In [None]:
# Create visualization: one scatter series per cluster on the UMAP embedding.
fig, ax = plt.subplots(figsize=(14, 10))

# Define colors for clusters.
# tab10 has only 10 distinct entries, so sampling it for more than 10 clusters
# gives several clusters the same color. Pick a palette large enough for the
# number of clusters instead. `colors` stays an (n_clusters, 4) RGBA array
# indexed by position in `unique_labels`, as later cells expect.
unique_labels = np.unique(labels)
n_clusters = len(unique_labels)
if n_clusters <= 10:
    colors = plt.cm.tab10(np.linspace(0, 1, n_clusters))
elif n_clusters <= 20:
    # Integer inputs index the colormap's lookup table directly.
    colors = plt.cm.tab20(np.arange(n_clusters))
else:
    # hsv is cyclic (0 and 1 are the same hue), so exclude the endpoint.
    colors = plt.cm.hsv(np.linspace(0, 1, n_clusters, endpoint=False))

# Plot each cluster
for idx, cluster_id in enumerate(unique_labels):
    mask = labels == cluster_id
    # Negative cluster ids are shown as "Noise" in the legend.
    cluster_label = f"Cluster {cluster_id}" if cluster_id >= 0 else "Noise"

    ax.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=[colors[idx]],
        label=cluster_label,
        alpha=0.7,
        s=100,
        edgecolors='black',
        linewidth=0.5
    )

ax.set_xlabel('UMAP Dimension 1', fontsize=12)
ax.set_ylabel('UMAP Dimension 2', fontsize=12)
ax.set_title('Article Clusters - UMAP 2D Projection', fontsize=16, fontweight='bold')
# Place the legend outside the axes so it does not cover any points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('cluster_visualization_umap.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualization saved as 'cluster_visualization_umap.png'")

## Annotated Plot with Article Titles

In [None]:
# Redraw the clusters on a larger canvas, then label a couple of points per cluster.
fig, ax = plt.subplots(figsize=(16, 12))

# Marker appearance shared by every cluster series.
marker_style = dict(alpha=0.6, s=150, edgecolors='black', linewidth=0.5)

# Plot points colored by cluster; colors and unique_labels are paired by position.
for cluster_color, cluster_id in zip(colors, unique_labels):
    members = labels == cluster_id
    if cluster_id >= 0:
        legend_name = f"Cluster {cluster_id}"
    else:
        legend_name = "Noise"

    ax.scatter(
        embeddings_2d[members, 0],
        embeddings_2d[members, 1],
        c=[cluster_color],
        label=legend_name,
        **marker_style,
    )

# Annotate a few points from each cluster
for cluster_color, cluster_id in zip(colors, unique_labels):
    member_rows = np.flatnonzero(labels == cluster_id)

    # Annotate up to 2 articles per cluster (the first two in data order)
    for row in member_rows[:2]:
        # Truncate title if too long
        full_title = titles[row]
        if len(full_title) > 40:
            shown_title = full_title[:40] + '...'
        else:
            shown_title = full_title

        # Tint the label box with the cluster's own color.
        label_box = dict(boxstyle='round,pad=0.3', facecolor=cluster_color, alpha=0.3)
        ax.annotate(
            shown_title,
            xy=(embeddings_2d[row, 0], embeddings_2d[row, 1]),
            xytext=(5, 5),
            textcoords='offset points',
            fontsize=8,
            alpha=0.7,
            bbox=label_box,
        )

ax.set_title('Article Clusters with Sample Titles - UMAP 2D Projection', fontsize=16, fontweight='bold')
ax.set_xlabel('UMAP Dimension 1', fontsize=12)
ax.set_ylabel('UMAP Dimension 2', fontsize=12)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('cluster_visualization_annotated.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Annotated visualization saved as 'cluster_visualization_annotated.png'")

## Cluster Analysis

In [None]:
# Analyze each cluster: size, which feeds contribute to it, and a few sample titles.
print("=" * 80)
print("CLUSTER ANALYSIS")
print("=" * 80)

# unique_labels comes from np.unique, which already returns sorted values.
for cluster_id in unique_labels:
    # Positions of this cluster's articles, computed once and reused below.
    member_idx = np.flatnonzero(labels == cluster_id)
    cluster_articles = [titles[i] for i in member_idx]
    cluster_feeds = [feeds[i] for i in member_idx]

    # Use the same naming as the plot legends: negative ids are "Noise".
    cluster_name = f"Cluster {cluster_id}" if cluster_id >= 0 else "Noise"
    print(f"\n{cluster_name} ({len(cluster_articles)} articles):")
    print("-" * 80)

    # Show feed distribution (value_counts orders feeds by descending count)
    feed_counts = pd.Series(cluster_feeds).value_counts()
    print("Feed distribution:")
    for feed, count in feed_counts.items():
        print(f"  {feed}: {count} articles")

    print("\nSample articles:")
    for i, title in enumerate(cluster_articles[:5], 1):
        print(f"  {i}. {title}")

    if len(cluster_articles) > 5:
        print(f"  ... and {len(cluster_articles) - 5} more")

## Alternative: t-SNE Visualization

For comparison, you can also try t-SNE, which focuses on preserving local neighborhoods and therefore tends to lay out the overall structure differently than UMAP.

In [None]:
# Apply t-SNE for comparison
print("Applying t-SNE dimensionality reduction...")

# scikit-learn requires perplexity to be smaller than the number of samples;
# cap it so that small exports (30 articles or fewer) still run.
tsne_perplexity = min(30, len(vectors) - 1)

# The iteration count is deliberately left at the library default (1000, the
# value previously passed explicitly). The keyword was renamed from `n_iter`
# to `max_iter` in scikit-learn 1.5, so neither spelling works on every version.
# NOTE(review): t-SNE uses its default distance metric here, whereas the UMAP
# cell uses metric='cosine' — confirm whether the comparison should match.
tsne_model = TSNE(
    n_components=2,
    perplexity=tsne_perplexity,
    random_state=42
)
embeddings_tsne = tsne_model.fit_transform(vectors)
print("✓ t-SNE complete")

In [None]:
# Scatter plot of the t-SNE embedding, one series per cluster.
fig, ax = plt.subplots(figsize=(14, 10))

# Marker appearance shared by every cluster series.
marker_style = dict(alpha=0.7, s=100, edgecolors='black', linewidth=0.5)

# colors and unique_labels are paired by position.
for cluster_color, cluster_id in zip(colors, unique_labels):
    members = labels == cluster_id
    if cluster_id >= 0:
        legend_name = f"Cluster {cluster_id}"
    else:
        legend_name = "Noise"

    ax.scatter(
        embeddings_tsne[members, 0],
        embeddings_tsne[members, 1],
        c=[cluster_color],
        label=legend_name,
        **marker_style,
    )

ax.set_title('Article Clusters - t-SNE 2D Projection', fontsize=16, fontweight='bold')
ax.set_xlabel('t-SNE Dimension 1', fontsize=12)
ax.set_ylabel('t-SNE Dimension 2', fontsize=12)
# Legend sits outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('cluster_visualization_tsne.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ t-SNE visualization saved as 'cluster_visualization_tsne.png'")

## Comparison: UMAP vs t-SNE

Side-by-side comparison of both methods.

In [None]:
# Side-by-side comparison: UMAP on the left, t-SNE on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Each panel is (axes, 2D embedding, method name used in title and axis labels).
panels = (
    (ax1, embeddings_2d, 'UMAP'),
    (ax2, embeddings_tsne, 't-SNE'),
)

for panel_ax, embedding, method in panels:
    for idx, cluster_id in enumerate(unique_labels):
        mask = labels == cluster_id
        # Same legend naming as the single-method plots: negative ids are "Noise".
        cluster_label = f"Cluster {cluster_id}" if cluster_id >= 0 else "Noise"
        panel_ax.scatter(
            embedding[mask, 0],
            embedding[mask, 1],
            c=[colors[idx]],
            label=cluster_label,
            alpha=0.7,
            s=100
        )

    panel_ax.set_title(f'{method} 2D Projection', fontsize=14, fontweight='bold')
    panel_ax.set_xlabel(f'{method} Dim 1')
    panel_ax.set_ylabel(f'{method} Dim 2')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('cluster_comparison_umap_tsne.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Comparison saved as 'cluster_comparison_umap_tsne.png'")