In [1]:
# Imports
import json
import re
from gensim.models import LdaModel
from gensim import corpora
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
# Preprocessing Function
def preprocess(text):
    """Convert raw text into the token list fed to the dictionary.

    The text is lower-cased and split into runs of ASCII letters. Words that
    are shorter than three characters or that appear in gensim's STOPWORDS
    set are discarded.

    Args:
        text: The document to tokenize, as a string.

    Returns:
        A list of the surviving tokens, in document order (duplicates kept).
    """
    kept = []
    for match in re.finditer(r'\b[a-zA-Z]+\b', text.lower()):
        word = match.group()
        # Guard clause: skip anything too short or too common to be informative.
        if len(word) < 3 or word in STOPWORDS:
            continue
        kept.append(word)
    return kept

In [3]:
# Load Model, Dictionary, Labels
# NOTE: gensim's load() is pickle-based, so only load model/dictionary files
# that come from a trusted source.
lda_model = LdaModel.load("models/lda_model")
dictionary = corpora.Dictionary.load("models/dictionary.dict")

# Topic labels are a JSON object keyed by the topic id as a string.
# The encoding is given explicitly so that labels containing non-ASCII
# characters decode the same way on every platform instead of depending on
# the locale's default encoding.
with open("models/topic_labels.json", "r", encoding="utf-8") as f:
    topic_labels = json.load(f)

In [4]:
# Display Topic Summary
# Print one "<id>: <label>" line for every topic in the loaded model. A topic
# id that has no entry in topic_labels is shown as "Topic <id>".
print("==== Loaded Topics ====\n")

for topic_id in range(lda_model.num_topics):
    key = str(topic_id)  # the label mapping is keyed by the id as a string
    if key in topic_labels:
        label = topic_labels[key]
    else:
        label = f"Topic {topic_id}"
    print(f"{topic_id}: {label}")

==== Loaded Topics ====

0: Technology
1: Science
2: Sports
3: Politics
4: Religion
5: Automobiles
6: Computers
7: Space
8: Medicine
9: General Discussion


In [5]:
# Classification Function
def classify_document(text, top_n=3, words_per_topic=5, preview_chars=200):
    """Print the most probable LDA topics for one document.

    The document is tokenized with ``preprocess``, converted to a
    bag-of-words with the module-level ``dictionary``, and scored by the
    module-level ``lda_model``. For each of the highest-probability topics
    the function prints its label, its probability, and its top words.

    Args:
        text: The document to classify, as a string.
        top_n: How many of the highest-probability topics to print.
        words_per_topic: How many top words to list for each printed topic.
        preview_chars: Maximum number of characters of ``text`` to echo in
            the preview.

    Returns:
        None. All results are written to stdout.
    """
    print("\nDocument Preview:")
    print("-" * 40)
    # Only add the ellipsis when the preview really is a truncation; short
    # documents are echoed in full without a misleading "...".
    preview = text[:preview_chars]
    ellipsis = " ..." if len(text) > preview_chars else ""
    print(f"{preview}{ellipsis}\n")

    tokens = preprocess(text)
    bow = dictionary.doc2bow(tokens)

    # An empty bag-of-words (no token is in the dictionary) gives the model
    # no evidence about this document, so report that instead of printing a
    # topic ranking that does not depend on the text.
    if not bow:
        print("No known vocabulary terms found; unable to classify this document.")
        return

    topic_dist = lda_model.get_document_topics(bow)
    top_topics = sorted(topic_dist, key=lambda x: x[1], reverse=True)[:top_n]

    print(f"Top {top_n} Topics:\n")

    for topic_id, prob in top_topics:
        # NOTE(review): the output recorded in this notebook pairs labels with
        # unrelated top words (e.g. "Space" with "good, game, excellent"), so
        # topic_labels.json may not match the loaded model -- confirm.
        label = topic_labels.get(str(topic_id), f"Topic {topic_id}")
        print(f"{label} ({prob:.4f})")

        top_words = lda_model.show_topic(topic_id, topn=words_per_topic)
        words = ", ".join([word for word, _ in top_words])
        print(f"Top words: {words}")
        print("-" * 40)

In [6]:
# Sample Documents
# Five short texts used to exercise the classifier. Each entry is a single
# string; adjacent literals inside the parentheses are concatenated, one
# sentence per source line.
samples = [
    (
        "The new graphics card delivers amazing performance for gaming. "
        "The GPU can handle 4K resolution easily with ray tracing enabled. "
        "Gamers will love the improved frame rates."
    ),
    (
        "Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. "
        "The research team published their findings in Nature journal. "
        "This discovery could provide insights into planetary formation."
    ),
    (
        "The basketball team won the championship after an incredible final game. "
        "The players celebrated with fans in the stadium. "
        "It was the team's first title in twenty years."
    ),
    (
        "Congress passed a new bill regarding healthcare reform. "
        "The president is expected to sign the legislation next week. "
        "The policy will affect millions of citizens across the country."
    ),
    (
        "I love cooking Italian food at home. "
        "Pasta carbonara and margherita pizza are my favorite dishes to make. "
        "Fresh ingredients make all the difference in authentic recipes."
    ),
]

In [7]:
# Run Example Classifications
# Classify each sample in order, preceded by a numbered banner (1-based).
for i, sample in enumerate(samples, start=1):
    banner = f"\n========== SAMPLE {i} =========="
    print(banner)
    classify_document(sample)



Document Preview:
----------------------------------------
The new graphics card delivers amazing performance for gaming. The GPU can handle 4K resolution easily with ray tracing enabled. Gamers will love the improved frame rates. ...

Top 3 Topics:

Politics (0.3835)
Top words: max, use, health, software, files
----------------------------------------
General Discussion (0.3763)
Top words: car, insurance, think, year, like
----------------------------------------
Space (0.2058)
Top words: good, game, excellent, games, missing
----------------------------------------


Document Preview:
----------------------------------------
Scientists discovered a new exoplanet orbiting a distant star in the habitable zone. The research team published their findings in Nature journal. This discovery could provide insights into planetary  ...

Top 3 Topics:

Space (0.5632)
Top words: good, game, excellent, games, missing
----------------------------------------
Computers (0.3875)
Top words: jesus, 