In [48]:
%pip install nltk scikit-learn gensim


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/Cellar/jupyterlab/4.0.10/libexec/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [49]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('stopwords')
from nltk import word_tokenize
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import numpy as np

[nltk_data] Downloading package punkt to /Users/jj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/jj/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
def pre_process(text_file, rare_count):
    """Turn raw text into a ``{stem: count}`` dictionary.

    The text is lower-cased, tokenized with ``word_tokenize``, filtered down
    to purely alphabetic tokens that are not English stop words, and each
    surviving token is Porter-stemmed. Stems are then counted and only those
    occurring more than ``rare_count`` times are kept.

    Args:
        text_file: The document text as a single string (it is lower-cased
            and tokenized directly, so this is the text itself rather than
            a path or file handle).
        rare_count: Exclusive lower bound on a stem's count; ``0`` keeps
            every stem that occurs at least once.

    Returns:
        dict mapping stem -> count, ordered from most to least frequent.
    """
    # Function-scope import so the cell does not depend on another cell
    # having imported Counter.
    from collections import Counter

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Counter replaces the DataFrame/value_counts round trip. No strip() is
    # needed: a token that passes isalpha() cannot contain whitespace.
    stem_counts = Counter(
        stemmer.stem(token)
        for token in word_tokenize(text_file.lower())
        if token.isalpha() and token not in stop_words
    )

    # most_common() already yields stems in descending count order.
    return {
        stem: count
        for stem, count in stem_counts.most_common()
        if count > rare_count
    }

def get_top_ten(topic, print_it=False, print_probs=False):
    """Return the ten highest-valued ``(word, value)`` pairs for ``topic``.

    Values are read from the module-level ``vocab_topic_probs[topic]``
    mapping and ranked in descending order.

    Args:
        topic: Key into ``vocab_topic_probs``.
        print_it: When true, also print the result.
        print_probs: With ``print_it``, print the full pairs; otherwise
            print only the words.

    Returns:
        List of at most ten ``(word, value)`` tuples, best first.
    """
    ranked = list(vocab_topic_probs[topic].items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    best = ranked[:10]

    if print_it and print_probs:
        print(best)
    elif print_it:
        print([word for word, _ in best])
    return best

def get_top_topics():
    """Rank every Reuters category by the summed value of its top-ten words.

    For each category from ``reuters.categories()`` the top-ten pairs from
    ``get_top_ten`` are summarised as ``{'probs': <sum>, 'words': [...]}``.
    The ten best-ranked categories are printed with their words.

    Returns:
        List of ``(category, summary)`` tuples sorted by ``summary['probs']``
        in descending order (all categories, not just the printed ten).
    """
    summaries = {}
    for category in reuters.categories():
        pairs = get_top_ten(category)
        summaries[category] = {
            'probs': sum(prob for _, prob in pairs),
            'words': [word for word, _ in pairs],
        }

    ranking = list(summaries.items())
    ranking.sort(key=lambda entry: entry[1]['probs'], reverse=True)

    for name, summary in ranking[:10]:
        print(f'{name}: {summary["words"]}')
    return ranking

In [51]:
from nltk.corpus import reuters

nltk.download('reuters')
from collections import defaultdict

# Never filled in this cell; the name is kept in case another cell expects it.
topic_docs = defaultdict(list)

# To make it easy let's limit it to single category docs
files = [f for f in reuters.fileids() if len(reuters.categories(f)) == 1]

# docs: running integer index -> {'text': raw text, 'topics': category list}.
# A 'token_dict' entry is added to each record below.
docs = {
    idx: {'text': reuters.raw(fileid), 'topics': reuters.categories(fileid)}
    for idx, fileid in enumerate(files)
}

rare_count = 0  # 0 keeps every stem that occurs at least once in a document
tokenized_docs = []
for record in docs.values():
    record['token_dict'] = pre_process(record['text'], rare_count)
    tokenized_docs.append(record['token_dict'])

# One entry per (document, distinct stem) pair, so stems repeat across
# documents and this is longer than the vocabulary.
raw_vocab = [stem for token_dict in tokenized_docs for stem in token_dict]

# Sort the unique stems: plain set iteration order for strings changes
# between interpreter runs (hash randomization), and this order later fixes
# the gensim token ids, so an unsorted vocab makes the seeded LDA run
# unreproducible.
vocab = sorted(set(raw_vocab))
print(len(raw_vocab))

[nltk_data] Downloading package reuters to /Users/jj/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


396412


In [52]:
# Raw stem counts aggregated per Reuters category: a document contributes its
# token counts to every category listed in its 'topics'.
topic_vocab = {}
for category in reuters.categories():
    category_counts = {}
    for doc_info in docs.values():
        if category not in doc_info['topics']:
            continue
        for stem, freq in doc_info['token_dict'].items():
            category_counts[stem] = category_counts.get(stem, 0) + freq
    topic_vocab[category] = category_counts

# Add-one smoothed P(word | topic) over the shared vocabulary:
# (count + 1) / (total tokens in topic + vocabulary size).
vocab_size = len(vocab)
vocab_topic_probs = {}
for category in reuters.categories():
    category_counts = topic_vocab[category]
    denominator = sum(category_counts.values()) + vocab_size
    vocab_topic_probs[category] = {
        stem: (category_counts.get(stem, 0) + 1) / denominator
        for stem in vocab
    }



In [53]:
from gensim.models import LdaModel
from gensim import corpora
# Register every vocabulary stem as its own one-token document so that each
# stem receives a token id in the gensim dictionary.
dictionary = corpora.Dictionary()
dictionary.add_documents([[stem] for stem in vocab])

# Bag-of-words per document as (token id, count) pairs; stems absent from the
# dictionary are skipped.
corpus = [
    [
        (dictionary.token2id[stem], freq)
        for stem, freq in doc_info['token_dict'].items()
        if stem in dictionary.token2id
    ]
    for doc_info in docs.values()
]

num_topics = 10
lda = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,
    random_state=42
)

In [54]:
# Fetch the top ten (word, probability) pairs of every LDA topic once, keep
# the pair lists in lda_topics, and print just the words per topic.
shown_topics = lda.show_topics(num_topics=num_topics, num_words=10, formatted=False)
lda_topics = [word_probs for _, word_probs in shown_topics]
for topic_id, word_probs in shown_topics:
    print(f"Topic {topic_id}: {[word for word, prob in word_probs]}")

Topic 0: ['pct', 'year', 'januari', 'februari', 'said', 'rose', 'rise', 'billion', 'decemb', 'increas']
Topic 1: ['said', 'compani', 'would', 'offer', 'lt', 'share', 'analyst', 'bid', 'merger', 'dlr']
Topic 2: ['mln', 'billion', 'stg', 'year', 'profit', 'tonn', 'sugar', 'trade', 'china', 'dlr']
Topic 3: ['said', 'oil', 'price', 'dlr', 'tonn', 'product', 'crude', 'mln', 'barrel', 'ga']
Topic 4: ['bank', 'rate', 'billion', 'said', 'pct', 'market', 'dollar', 'mark', 'currenc', 'dlr']
Topic 5: ['said', 'trade', 'would', 'countri', 'export', 'japan', 'market', 'meet', 'govern', 'offici']
Topic 6: ['dlr', 'said', 'mln', 'compani', 'lt', 'sale', 'earn', 'inc', 'corp', 'year']
Topic 7: ['lt', 'said', 'ltd', 'pct', 'compani', 'stake', 'unit', 'corp', 'firm', 'group']
Topic 8: ['vs', 'mln', 'ct', 'net', 'loss', 'shr', 'dlr', 'lt', 'profit', 'qtr']
Topic 9: ['share', 'said', 'stock', 'lt', 'pct', 'dlr', 'offer', 'compani', 'common', 'inc']


In [58]:
# Rank the labelled Reuters categories by the summed value of their top-ten
# words; get_top_topics() prints the ten best and returns the full ranking.
tt= get_top_topics()

earn: ['vs', 'mln', 'ct', 'net', 'dlr', 'shr', 'loss', 'lt', 'profit', 'said']
acq: ['said', 'lt', 'share', 'dlr', 'compani', 'mln', 'inc', 'pct', 'offer', 'corp']
interest: ['rate', 'pct', 'bank', 'said', 'cut', 'interest', 'market', 'prime', 'day', 'billion']
crude: ['oil', 'said', 'price', 'crude', 'dlr', 'barrel', 'mln', 'opec', 'bpd', 'product']
trade: ['trade', 'said', 'japan', 'billion', 'would', 'dlr', 'year', 'export', 'import', 'japanes']
money-supply: ['billion', 'dlr', 'pct', 'bank', 'mln', 'said', 'money', 'week', 'rose', 'januari']
money-fx: ['said', 'bank', 'market', 'currenc', 'rate', 'dollar', 'exchang', 'mln', 'stg', 'pct']
coffee: ['coffe', 'said', 'export', 'quota', 'produc', 'ico', 'price', 'brazil', 'meet', 'market']
sugar: ['sugar', 'said', 'tonn', 'mln', 'year', 'price', 'trader', 'ec', 'export', 'white']
cpi: ['pct', 'price', 'februari', 'said', 'inflat', 'januari', 'rise', 'year', 'consum', 'rose']


In [63]:
matching_topics = []  # never appended to in this cell

# ChatGPT wrote this and the next section

# Build the LDA top-word lists here instead of relying on a later cell:
# on a fresh kernel run top-to-bottom the name would otherwise be undefined
# at this point.
lda_top_words = {
    lda_id: [word for word, _ in word_probs]
    for lda_id, word_probs in lda.show_topics(
        num_topics=num_topics, num_words=10, formatted=False
    )
}

# For each of the ten best-ranked labelled topics, report what share of its
# top words also appears among each LDA topic's top words.
for topic_name, info in tt[:10]:
    s_words = set(info['words'])
    print(f"\nClunky Method topic '{topic_name}':")
    for lda_id, lda_words in lda_top_words.items():
        lda_set = set(lda_words)
        overlap_pct = len(s_words & lda_set) / len(s_words) * 100
        print(f"  LDA topic {lda_id}: {overlap_pct:.1f}% common words")


Clunky Method topic 'earn':
  LDA topic 0: 10.0% common words
  LDA topic 1: 30.0% common words
  LDA topic 2: 30.0% common words
  LDA topic 3: 30.0% common words
  LDA topic 4: 20.0% common words
  LDA topic 5: 10.0% common words
  LDA topic 6: 40.0% common words
  LDA topic 7: 20.0% common words
  LDA topic 8: 90.0% common words
  LDA topic 9: 30.0% common words

Clunky Method topic 'acq':
  LDA topic 0: 20.0% common words
  LDA topic 1: 60.0% common words
  LDA topic 2: 20.0% common words
  LDA topic 3: 30.0% common words
  LDA topic 4: 30.0% common words
  LDA topic 5: 10.0% common words
  LDA topic 6: 70.0% common words
  LDA topic 7: 50.0% common words
  LDA topic 8: 30.0% common words
  LDA topic 9: 80.0% common words

Clunky Method topic 'interest':
  LDA topic 0: 30.0% common words
  LDA topic 1: 10.0% common words
  LDA topic 2: 10.0% common words
  LDA topic 3: 10.0% common words
  LDA topic 4: 60.0% common words
  LDA topic 5: 20.0% common words
  LDA topic 6: 10.0% commo

In [65]:
# Step 1: prepare top words

# LDA top words per topic. The topic count comes from num_topics (the value
# the model was trained with) rather than a repeated literal.
lda_top_words = {
    topic_id: [w for w, _ in words]
    for topic_id, words in lda.show_topics(
        num_topics=num_topics, num_words=10, formatted=False
    )
}

# Supervised / summed-topic top words
supervised_top_words = {
    topic_name: set(info['words'])
    for topic_name, info in tt[:10]  # top 10 supervised topics
}

# Step 2: find the best matching supervised topic for each LDA topic
best_matches = {}

for lda_id, lda_words in lda_top_words.items():
    lda_set = set(lda_words)
    best_topic = None  # stays None when no supervised topic shares a word
    best_pct = 0.0

    for s_name, s_set in supervised_top_words.items():
        overlap_pct = len(lda_set & s_set) / len(s_set) * 100
        # Strict '>' keeps the first topic seen among equal overlaps.
        if overlap_pct > best_pct:
            best_pct = overlap_pct
            best_topic = s_name
    best_matches[lda_id] = (best_topic, best_pct)

print("LDA topic -> Best matching Clunky Method topic (% overlap):\n")
for lda_id, (topic_name, pct) in best_matches.items():
    print(f"LDA topic {lda_id}: {topic_name} ({pct:.1f}% match)")

# Average over the number of LDA topics actually matched, so the figure
# stays correct if num_topics is changed.
total_match = sum(pct for _, pct in best_matches.values())
overall_matching = total_match / len(best_matches)
print(f'Overall match: {overall_matching:.1f}%')

LDA topic -> Best matching Clunky Method topic (% overlap):

LDA topic 0: cpi (70.0% match)
LDA topic 1: acq (60.0% match)
LDA topic 2: trade (40.0% match)
LDA topic 3: crude (80.0% match)
LDA topic 4: money-fx (70.0% match)
LDA topic 5: trade (50.0% match)
LDA topic 6: acq (70.0% match)
LDA topic 7: acq (50.0% match)
LDA topic 8: earn (90.0% match)
LDA topic 9: acq (80.0% match)
Overall match: 66.0%


In [66]:
# Number of categories defined by the Reuters corpus reader as a whole
# (reuters.categories() is called without restricting it to any file).
print(len(reuters.categories()))

90


In [72]:
# Total retained token count per document (sum of its stem counts), then the
# mean over all documents.
counts = [sum(token_dict.values()) for token_dict in tokenized_docs]
print(sum(counts)/len(counts))

71.02707423580786


In [73]:
# Number of token dictionaries collected, i.e. one per processed document.
print(len(tokenized_docs))

9160


In [77]:
# Flatten the 'topics' lists of all loaded documents, then count the distinct
# labels that actually occur among them.
topic_list = [label for doc_info in docs.values() for label in doc_info['topics']]
unique_topics = list(set(topic_list))
print(len(unique_topics))

65
