# Latent Semantic Analysis: How Many Topics? A Practical Guide

## Today's Focus
1. Calculate explained variance
2. Create a scree plot
3. Apply the elbow method
4. Test our choice with real data

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the presidential speeches workbook into a DataFrame
SPEECHES_FILE = "presidential_speeches_updated.xlsx"
df = pd.read_excel(SPEECHES_FILE)

# Quick sanity check: corpus size and number of distinct speakers
print(f"Loaded {len(df)} speeches from {df['speaker'].nunique()} speakers")

Loaded 20 speeches from 4 speakers


In [None]:
import spacy

# Load spaCy's small English pipeline into `nlp`, the object later cells use to process text.
# NOTE(review): presumably the model has to be installed in the kernel's environment first
# (e.g. `python -m spacy download en_core_web_sm`) — confirm for your setup.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define a function to apply spaCy to a single text
def process_lemma_spacy(text):
    """Lemmatize a single text with spaCy and return its tokens.

    Parameters
    ----------
    text : str
        Raw text of one speech. Non-string values (e.g. missing cells) yield
        an empty list instead of raising.

    Returns
    -------
    list of str
        Lower-cased lemmas of the alphabetic tokens, in document order.
        Punctuation, numbers and whitespace tokens are dropped.
    """
    if not isinstance(text, str):
        return []

    doc = nlp(text)

    # Example: return a list of tokens
    return [token.lemma_.lower() for token in doc if token.is_alpha]

# Apply the function to the 'speech'
# TfidfVectorizer expects one string per document, so each lemma list is joined back with spaces.
# NOTE(review): assumes the raw text column is named 'speech' — confirm against the workbook.
df['speech_lemma'] = df['speech'].apply(lambda speech: " ".join(process_lemma_spacy(speech)))


## Step 1: Create TF-IDF Matrix

In [None]:
# Build the TF-IDF representation of the lemmatized speeches
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop common English function words
    max_features=500,      # cap the vocabulary at 500 terms
    min_df=0.25,           # ignore terms found in fewer than 25% of the speeches
    max_df=0.75,           # ignore terms found in more than 75% of the speeches
)

# Learn the vocabulary and encode every speech in one pass
X_tfidf = vectorizer.fit_transform(df['speech_lemma'])
print(f"Matrix shape: {X_tfidf.shape}")

Matrix shape: (20, 500)


## Step 2: Test Different Numbers of Topics

We'll test 1-10 topics and see how much information each captures.

In [None]:
# Test different numbers of topics
max_topics = 10
variances = []

# Fit and calculate the total variance explained by each number of topics
for n_topics in range(1, max_topics + 1):
    # Fit LSA with n topics (fixed seed so re-running the cell gives the same numbers)
    svd_n = TruncatedSVD(n_components=n_topics, random_state=42)
    svd_n.fit(X_tfidf)

    # Store total variance explained by all n topics together
    variances.append(svd_n.explained_variance_ratio_.sum())

# Total variance explained, indexed by the number of topics used
pd.Series(variances, index=range(1, max_topics + 1), name="total_explained_variance")

## Step 3: Variance Explained Plot

Look for the "elbow": the number of topics at which the cumulative explained variance starts to plateau, so that adding more topics contributes little additional information.

In [None]:
# Instead of re-fitting for each n, fit once with max_topics
svd_full = TruncatedSVD(n_components=max_topics, random_state=42)
svd_full.fit(X_tfidf)

# Individual explained variance ratios (one value per component/topic)
indiv_var = svd_full.explained_variance_ratio_

# Cumulative explained variance (running total over the first 1..max_topics components)
cum_var = np.cumsum(indiv_var)


In [None]:
# ---- Plot ----
# One x position per candidate number of topics (1 .. max_topics)
x = np.arange(1, max_topics + 1)

fig, ax = plt.subplots(figsize=(9, 5))

# Bars show the variance captured by each component; the line shows the running total
ax.bar(x, indiv_var, color="seagreen", alpha=0.65, label="Individual Explained Variance")
ax.plot(x, cum_var, marker="o", color="crimson", linewidth=2, label="Cumulative Explained Variance")

# Percentage label just above every bar
for pos, share in zip(x, indiv_var):
    ax.text(pos, share + 0.01, f"{share*100:.1f}%",
            ha="center", va="bottom", fontsize=9, color="seagreen")

# Percentage label 8 points above every cumulative marker
for pos, total in zip(x, cum_var):
    ax.annotate(f"{total*100:.1f}%", xy=(pos, total), xytext=(0, 8),
                textcoords="offset points", ha="center", fontsize=9, color="crimson")

# Titles, axis labels, ticks and y-range in a single call
ax.set(
    title="Explained Variance by Different Principal Components (Topics)",
    xlabel="Principal Components (Topics)",
    ylabel="Explained Variance",
    xticks=x,
    ylim=(0, 1.05),
)
ax.grid(True, axis="y", alpha=0.3)
ax.legend(loc="upper left")
fig.tight_layout()
plt.show()

## Step 4: Comparing Topic Quality Across Different K Values

Let's examine how well documents cluster with different numbers of topics and identify the most representative words for each topic.


### 4.1: Extract and Compare Top Words for Different K Values

In [None]:
# Initialize empty list to store our results
topics_with_words = []

# How many top words to extract per topic
n_words = 10

# Get the vocabulary (feature names) from the vectorizer
# This gives us the actual words corresponding to each column in our matrix
feature_names = vectorizer.get_feature_names_out()

# Fit the LSA model whose topics we inspect here (one component per candidate topic)
svd_topics = TruncatedSVD(n_components=max_topics, random_state=42).fit(X_tfidf)

# Iterate through each topic (component) from the SVD
# topic_idx: 0, 1, 2, ... (topic number)
# topic: 1D array of weights for all words in this topic (e.g. how much a word contributes to a topic)
for topic_idx, topic in enumerate(svd_topics.components_):
    # Sort and extract indices of top n-words for this topic
    # (argsort is ascending, so reverse it before taking the first n_words)
    top_word_indices = topic.argsort()[::-1][:n_words]

    # Convert indices to actual words using feature_names
    top_words = [feature_names[i] for i in top_word_indices]

    # Get the weight values for these top words
    weights = topic[top_word_indices]

    # Store everything in a dictionary for this topic
    topics_with_words.append({
        'topic_num': topic_idx + 1,
        'words': top_words,
        'weights': weights
    })

# Peek at the first topic to check the structure
topics_with_words[0]

In [None]:
# Convert above code into a function
def get_top_words(svd_model, feature_names, n_words=10):
    """Summarize each SVD component by its highest-weighted vocabulary terms.

    Parameters
    ----------
    svd_model : object
        Fitted model exposing ``components_``; each row is treated as one topic.
    feature_names : sequence
        Vocabulary terms, indexable by the column positions of ``components_``.
    n_words : int, default 10
        Maximum number of terms to keep per topic.

    Returns
    -------
    list of dict
        One dict per topic, in component order, with keys ``'topic_num'``
        (1-based), ``'words'`` (terms ordered from highest to lowest weight)
        and ``'weights'`` (the matching component values).
    """
    def describe(number, component):
        # Column positions ordered from largest to smallest weight, cut to n_words
        ranked = component.argsort()[::-1][:n_words]
        return {
            'topic_num': number,
            'words': [feature_names[col] for col in ranked],
            'weights': component[ranked],
        }

    return [
        describe(number, component)
        for number, component in enumerate(svd_model.components_, start=1)
    ]

In [None]:
# Compare top words for 3 to 7 topics
k_values_to_compare = [3, 4, 5, 6, 7]

# Loop through multiple values of k and compare their outputs in the get_top_words() function
for k in k_values_to_compare:
    # Fit a separate LSA model for this k (fixed seed keeps the output reproducible)
    svd_k = TruncatedSVD(n_components=k, random_state=42).fit(X_tfidf)

    print(f"\n=== k = {k} topics ===")
    for topic in get_top_words(svd_k, feature_names, n_words=n_words):
        print(f"Topic {topic['topic_num']}: {', '.join(topic['words'])}")


### 4.2: Visualize Topic Coherence


In [None]:
# Pick the number of topics to study in more detail
# CHANGE BASED ON THE VARIABILITY EXPLAINED PLOT AND TOP WORDS ANALYSIS
selected_k = 4

# Fit a fresh LSA/SVD model with that many components;
# fit_transform also returns each document's scores on the topics
svd_selected = TruncatedSVD(random_state=42, n_components=selected_k)
doc_topics = svd_selected.fit_transform(X_tfidf)

# Display the document-by-topic scores as a table
pd.DataFrame(data=doc_topics)

In [None]:
# Horizontal bar charts of the top words for each topic (one panel per topic)
n_top_words = 10  # single source of truth for both the extraction and the figure title

# squeeze=False always returns a 2-D array of Axes, so the loop below also
# works when selected_k == 1 (plt.subplots would otherwise return a bare Axes)
fig, axes = plt.subplots(1, selected_k, figsize=(15, 4), squeeze=False)
fig.suptitle(f'Top {n_top_words} Words per Topic (k={selected_k})', fontsize=14)

topics = get_top_words(svd_selected, feature_names, n_words=n_top_words)

for idx, (ax, topic) in enumerate(zip(axes[0], topics)):
    # Reverse so the highest-weighted word is drawn at the top of the panel
    words = topic['words'][::-1]
    weights = topic['weights'][::-1]
    positions = range(len(words))

    # Spread panel colours evenly over 0.3-0.9 of the colormap so models with
    # many topics do not run past the end of it and repeat the same colour
    shade = 0.3 + 0.6 * idx / max(selected_k - 1, 1)

    ax.barh(positions, weights, color=plt.cm.viridis(shade))
    ax.set_yticks(positions)
    ax.set_yticklabels(words, fontsize=9)
    ax.set_xlabel('Weight', fontsize=10)
    ax.set_title(f"Topic {topic['topic_num']}", fontsize=11)
    ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

### 4.3: Document Distribution Across Topics

In [None]:
# Show how documents are distributed across topics
topic_labels = [f"Topic {i + 1}" for i in range(doc_topics.shape[1])]
doc_topic_df = pd.DataFrame(doc_topics, columns=topic_labels, index=df.index)

# Add dominant topic for each document
# LSA scores are signed, so "dominant" here means the topic with the largest absolute score
doc_topic_df["dominant_topic"] = doc_topic_df[topic_labels].abs().idxmax(axis=1)

# Count documents per dominant topic (topics that never dominate are shown as 0)
print(doc_topic_df["dominant_topic"].value_counts().reindex(topic_labels, fill_value=0))

# Include speaker from original dataframe
doc_topic_df["speaker"] = df["speaker"]

# Speakers (rows) against dominant topic (columns)
pd.crosstab(doc_topic_df["speaker"], doc_topic_df["dominant_topic"])
