# 04: Topic Discovery

Use NLP to discover research topics and clusters.

## Goals
1. Extract topics from paper abstracts
2. Cluster papers by topic similarity
3. Identify emerging research areas

In [None]:
# Optional: install scikit-learn (required only for the TF-IDF clustering section below)
# %pip install scikit-learn

import pandas as pd
import numpy as np

## Using Semantic Scholar Embeddings

The Semantic Scholar MCP provides SPECTER embeddings for semantic similarity.

In [None]:
# Claude Code prompt:
# "Get paper embeddings for 'procedural content generation' papers
#  from 2020-2024 using Semantic Scholar SPECTER embeddings"

## Topic Clustering with TF-IDF

Cluster abstracts with k-means over their TF-IDF vectors, then label each cluster with its highest-weighted terms.

In [None]:
def extract_topics_tfidf(abstracts, n_topics=10, n_words=10):
    """Extract topics by k-means clustering of TF-IDF vectors.

    Each abstract is vectorized with TF-IDF (English stop words removed,
    unigrams and bigrams), the vectors are clustered with k-means, and every
    cluster is described by the terms with the largest weights in its centroid.

    Args:
        abstracts: Iterable of abstract texts. ``None`` and NaN entries are
            treated as empty documents so the returned cluster labels stay
            positionally aligned with the input.
        n_topics: Requested number of clusters. Capped at the number of
            abstracts, because k-means cannot form more clusters than samples.
        n_words: Maximum number of top terms reported per topic.

    Returns:
        ``(topics, clusters)``. ``topics`` is a list of dicts with the keys
        ``'topic_id'``, ``'words'`` (terms in descending centroid weight) and
        ``'n_papers'``; ``clusters`` holds the cluster label of each abstract
        in input order. Returns ``None`` (not a tuple) when scikit-learn
        cannot be imported, so check the result before unpacking it.

    Raises:
        ValueError: If ``abstracts`` is a single string, or propagated from
            scikit-learn (e.g. when the corpus is too small for the
            ``min_df``/``max_df`` pruning to leave any terms).
    """
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.cluster import KMeans
    except ImportError:
        print("Install scikit-learn: pip install scikit-learn")
        return None

    # A bare string would otherwise be iterated character by character below.
    if isinstance(abstracts, str):
        raise ValueError(
            "abstracts must be an iterable of documents, not a single string"
        )

    # Blank out missing abstracts instead of dropping them: the vectorizer
    # cannot tokenize None/NaN, and dropping rows would shift cluster labels
    # relative to the caller's data. (NaN is the only float with a != a.)
    docs = [
        "" if a is None or (isinstance(a, float) and a != a) else a
        for a in abstracts
    ]

    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer(
        max_df=0.95,
        min_df=2,
        stop_words='english',
        ngram_range=(1, 2)
    )
    tfidf_matrix = vectorizer.fit_transform(docs)

    # Cluster. k-means needs at least as many samples as clusters, so cap the
    # request; n_init is pinned because its default differs across
    # scikit-learn releases.
    n_clusters = min(n_topics, len(docs))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(tfidf_matrix)

    # Get top words per cluster
    feature_names = vectorizer.get_feature_names_out()
    word_limit = max(n_words, 0)
    topics = []

    for topic_id, center in enumerate(kmeans.cluster_centers_):
        # Reverse first, then truncate: slicing with [-n_words:] would return
        # *every* term when n_words is 0.
        top_indices = center.argsort()[::-1][:word_limit]
        top_words = [feature_names[idx] for idx in top_indices]
        topics.append({
            'topic_id': topic_id,
            'words': top_words,
            # Plain int keeps the record JSON-serializable.
            'n_papers': int((clusters == topic_id).sum())
        })

    return topics, clusters

## Using OpenAlex Topics

OpenAlex provides pre-computed topic classifications.

In [None]:
# Claude Code prompt:
# "Use OpenAlex to analyze topic trends for 'procedural generation' from 2015-2025"
# "Show me which OpenAlex topics are trending in game AI research"

## Visualize Topic Distribution

In [None]:
import matplotlib.pyplot as plt

def plot_topic_distribution(topics):
    """Draw a horizontal bar chart of paper counts per topic.

    Every bar is labelled with the topic id followed by that topic's first
    three words, and its length is the topic's ``n_papers`` value.

    Returns the matplotlib figure that holds the chart.
    """
    bar_labels = []
    paper_counts = []
    for topic in topics:
        leading_words = ', '.join(topic['words'][:3])
        bar_labels.append(f"Topic {topic['topic_id']}: {leading_words}")
        paper_counts.append(topic['n_papers'])

    # Explicit Figure/Axes handles instead of the implicit pyplot "current" state.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(bar_labels, paper_counts, color='steelblue')
    ax.set_xlabel('Number of Papers')
    ax.set_title('Topic Distribution in Research Corpus')
    fig.tight_layout()
    return fig

# topics, clusters = extract_topics_tfidf(abstracts)
# plot_topic_distribution(topics)

## Export Results

In [None]:
# Save topic analysis results
# pd.DataFrame(topics).to_csv('../data/searches/topic_analysis.csv', index=False)