# Semantic News Clustering - Quick Start Notebook

This notebook demonstrates how to use the semantic news clustering system to group news articles by meaning, not just keywords.

## 1. Setup and Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sample_data_generator import generate_sample_news_data
from step2_preprocessor import TextPreprocessor
from step3_embeddings import SemanticEmbedder
from step4_similarity_graph import SimilarityGraph
from step5_graph_clustering import GraphClusterer
from step6_traditional_clustering import TraditionalClusterer, ClusteringEvaluator
from step7_visualization import ClusteringVisualizer

print("✓ All modules imported successfully")

## 2. Load Sample Data

In [None]:
# Generate sample news articles.
# The generator returns three values, unpacked here as the article texts,
# one label per article, and the collection of category names.
texts, labels, categories = generate_sample_news_data(
    n_samples=100,
    n_categories=5
)

# Summarize what was generated: corpus size, category names, one example.
print(f"Loaded {len(texts)} documents from {len(categories)} categories")
print(f"Categories: {', '.join(categories)}")
# labels[0] is used as an index into categories, so labels are presumably
# integer category ids — confirm against sample_data_generator.
print(f"\nSample article (category: {categories[labels[0]]}):\n{texts[0]}")

## 3. Preprocess Text

In [None]:
# Configure the text cleaner. Going by the option names this removes stop
# words, lowercases, and drops tokens shorter than 2 characters — see
# step2_preprocessor for the exact behavior.
preprocessor = TextPreprocessor(
    remove_stopwords=True,
    lowercase=True,
    min_token_length=2
)

# Clean every article in one call. The result is compared index-by-index with
# `texts` below, so it is presumably in the same order — TODO confirm.
cleaned_texts = preprocessor.preprocess_batch(texts, verbose=True)

# Show the first 100 characters of the first article before and after cleaning.
print(f"\nOriginal: {texts[0][:100]}...")
print(f"\nCleaned:  {cleaned_texts[0][:100]}...")

## 4. Generate Semantic Embeddings

In [None]:
# Embed the cleaned articles with the 'all-MiniLM-L6-v2' model, requesting
# batches of 32 texts and a progress display.
embedder = SemanticEmbedder(model_name='all-MiniLM-L6-v2')
embeddings = embedder.embed_texts(cleaned_texts, batch_size=32, show_progress=True)
# Similarity scores derived from the embeddings. Later cells index this as
# similarity_matrix[article][other_article]; the metric itself is defined in
# step3_embeddings (presumably cosine similarity — confirm).
similarity_matrix = embedder.compute_similarity(embeddings)

# Sanity check: embedding array shape and the spread of similarity scores.
print(f"\nEmbedding shape: {embeddings.shape}")
print(f"Similarity range: [{similarity_matrix.min():.3f}, {similarity_matrix.max():.3f}]")

## 5. Build Similarity Graph

In [None]:
# Configure the graph builder. Presumably an edge requires a similarity of at
# least 0.3 and each node keeps at most 10 neighbours — confirm the exact
# rules in step4_similarity_graph.
graph_builder = SimilarityGraph(
    similarity_threshold=0.3,
    top_k_neighbors=10
)

# Build the graph from the similarity matrix; the generator's labels are
# passed along as well (how they are used is up to SimilarityGraph).
graph = graph_builder.build_graph(similarity_matrix, labels=labels)
stats = graph_builder.get_graph_stats()

# Print every statistic the builder reports, one "name: value" pair per line.
print("\nGraph Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

## 6. Apply Clustering Algorithms

In [None]:
# Request one cluster per known category from every method.
n_clusters = len(categories)

# Graph-based clustering
graph_clusterer = GraphClusterer(n_clusters=n_clusters)
# Spectral clustering receives both the graph and the raw similarity matrix.
spectral_labels = graph_clusterer.spectral_clustering(graph, similarity_matrix)
# NOTE(review): Louvain community detection normally chooses its own number of
# communities — check whether louvain_clustering honours n_clusters.
louvain_labels = graph_clusterer.louvain_clustering(graph)

# Traditional clustering
# K-Means works on the embedding vectors directly rather than on the graph.
trad_clusterer = TraditionalClusterer(n_clusters=n_clusters)
kmeans_labels = trad_clusterer.kmeans_clustering(embeddings)

print("\n✓ All clustering methods applied")

## 7. Evaluate and Compare Methods

In [None]:
# Register each method's cluster assignments under its display name.
# Insertion order is spectral, Louvain, then K-Means.
all_results = {}
all_results['Spectral (Graph)'] = spectral_labels
all_results['Louvain (Graph)'] = louvain_labels
all_results['K-Means (Traditional)'] = kmeans_labels

# Compare all methods in one call, passing the generator's labels and the
# embeddings alongside the predicted assignments.
evaluator = ClusteringEvaluator()
metrics = evaluator.compare_methods(all_results, labels, embeddings)

## 8. Visualize Results

In [None]:
# Create the plotting helper; figsize=(16, 10) is handed to
# ClusteringVisualizer (presumably the figure size in inches — confirm).
visualizer = ClusteringVisualizer(figsize=(16, 10))

# t-SNE visualization
# Passes the embeddings with both the spectral cluster assignments and the
# generator's labels, presumably so predicted and reference groupings can be
# compared in the plot.
visualizer.plot_embeddings_2d(
    embeddings,
    spectral_labels,
    labels,
    method='tsne',
    title='Spectral Clustering (Graph-based)'
)

In [None]:
# Network graph visualization
# Draws the similarity graph using a 'spring' layout, passing the spectral
# cluster assignments and the generator's labels.
# max_nodes=100 equals the n_samples used when generating the data, so
# presumably no nodes are dropped from the plot — TODO confirm.
visualizer.plot_graph_network(
    graph,
    spectral_labels,
    labels,
    layout='spring',
    max_nodes=100
)

In [None]:
# Cluster size distribution
# Passes each method's cluster assignments keyed by a short display name.
# These keys are shorter than the ones used in `all_results` in the
# evaluation cell.
visualizer.plot_cluster_sizes({
    'Spectral': spectral_labels,
    'Louvain': louvain_labels,
    'K-Means': kmeans_labels
})

## 9. Analyze Clusters

In [None]:
# Show sample articles from each cluster
import numpy as np

print("Sample articles from each cluster (Spectral Clustering):\n")

# np.unique returns the distinct cluster ids in sorted order, so the slice
# below keeps the five smallest ids.
for cluster_id in np.unique(spectral_labels)[:5]:  # Show first 5 clusters
    # np.asarray keeps the comparison element-wise even when the labels are a
    # plain Python list; `list == scalar` is a single bool, which would match
    # no articles (or be rejected outright by newer NumPy versions).
    cluster_indices = np.where(np.asarray(spectral_labels) == cluster_id)[0]
    print(f"\nCluster {cluster_id} ({len(cluster_indices)} articles):")

    # Show first 2 articles from this cluster
    for idx in cluster_indices[:2]:
        print(f"  - {texts[idx][:80]}...")

## 10. Find Similar Articles

In [None]:
# Function to find similar articles
def find_similar_articles(article_idx, top_n=3):
    """Print the articles most similar to the one at ``article_idx``.

    Reads the notebook-level ``similarity_matrix`` and ``texts``; both must
    already be defined when this is called. The matrix is assumed to be
    square, with row and column ``i`` both referring to ``texts[i]``.

    Args:
        article_idx: Position of the query article in ``texts`` and its row
            in ``similarity_matrix``. Negative positions count from the end.
        top_n: Maximum number of similar articles to print. Zero or a
            negative value prints only the header.

    Returns:
        None. The header and the matches are written to stdout.
    """
    similarities = similarity_matrix[article_idx]

    # Resolve a negative position to its real row number so the query can be
    # recognized in the ranking below.
    query_idx = range(len(similarities))[article_idx]

    # Rank all articles from most to least similar, then remove the query by
    # index. Dropping whatever ranks first is not reliable: an exact duplicate
    # or floating-point rounding can place another article ahead of the query,
    # which would then be reported as similar to itself.
    ranked = np.argsort(similarities)[::-1]
    similar_indices = ranked[ranked != query_idx][:max(top_n, 0)]

    print(f"Articles similar to: {texts[article_idx][:100]}...\n")

    for i, idx in enumerate(similar_indices, 1):
        print(f"{i}. Similarity: {similarities[idx]:.3f}")
        print(f"   {texts[idx][:100]}...\n")

# Example: Find articles similar to the first article (index 0).
# The function prints its results to stdout and returns nothing.
find_similar_articles(0, top_n=3)

## Summary

This notebook demonstrated:
1. Loading and preprocessing news articles
2. Generating semantic embeddings with transformers
3. Building a similarity graph
4. Applying graph-based and traditional clustering
5. Evaluating and comparing methods
6. Visualizing results
7. Analyzing clusters and finding similar articles

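**Key Takeaway**: Graph-based methods (Spectral, Louvain) cluster on the neighborhood structure of the similarity graph, whereas K-Means works directly on embedding distances. Graph-based methods often outperform K-Means on this kind of data, but check the metrics from Section 7 to confirm which method performs best on your dataset.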