In [12]:
import re
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import altair as alt

# Load text data
dataset = fetch_20newsgroups(shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Preprocess the text data
def preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    text = text.strip()
    return text

# Create TF-IDF matrix
vectorizer = TfidfVectorizer(preprocessor=preprocess, stop_words='english', max_features=10000)
X = vectorizer.fit_transform(documents)

# Matrix factorization using Non-negative Matrix Factorization (NMF)
num_topics = 10
nmf_model = NMF(n_components=num_topics, random_state=42)
W = nmf_model.fit_transform(X)
H = nmf_model.components_

# Print the topic-word matrix
print("Topic-Word Matrix:")
for topic_idx, topic in enumerate(H):
    print(f"Topic {topic_idx}:")
    word_indices = np.argsort(topic)[::-1][:10]
    words = [vectorizer.get_feature_names_out()[idx] for idx in word_indices]
    print(f"Words: {', '.join(words)}")
    print()

# Visualize the topic-word matrix using Altair
source = pd.DataFrame({
    'topic': np.repeat(np.arange(num_topics), 10),
    'word': np.concatenate([vectorizer.get_feature_names_out()[np.argsort(topic)[::-1]][:10] for topic in H]),
    'value': np.concatenate([np.sort(topic)[::-1][:10] for topic in H])
})

Topic-Word Matrix:
Topic 0:
Words: just, dont, like, im, think, know, good, car, ive, really

Topic 1:
Words: card, video, monitor, mb, vga, cards, drivers, color, ram, bus

Topic 2:
Words: god, jesus, bible, believe, faith, christian, christians, christ, does, gods

Topic 3:
Words: drive, scsi, drives, disk, hard, ide, controller, mb, floppy, mac

Topic 4:
Words: thanks, email, does, know, advance, hi, info, looking, address, anybody

Topic 5:
Words: gebcadredslpittedu, njxp, chastity, shameful, intellect, skepticism, surrender, gordon, banks, soon

Topic 6:
Words: windows, file, files, window, dos, program, use, using, problem, version

Topic 7:
Words: game, team, games, year, players, season, hockey, play, teams, win

Topic 8:
Words: key, chip, encryption, clipper, keys, use, escrow, government, algorithm, bit

Topic 9:
Words: people, government, israel, state, did, said, right, armenian, jews, israeli



In [9]:
X.shape, W.shape, H.shape

((11314, 10000), (11314, 2), (2, 10000))

In [10]:
chart = alt.Chart(source).mark_rect().encode(
    x='word',
    y='topic:O',
    color='value:Q',
    tooltip=['word', 'value']
).properties(
    width=800,
    height=400,
    title='Topic-Word Matrix'
)

chart.interactive()

AttributeError: module 'altair' has no attribute 'visualize'