# Discover Topics Notebook

This notebook mirrors `scripts/discover_topics.py` for LDA topic modeling.

## 1.a Import libraries

In [None]:
import os
import sys

sys.path.append('scripts')
from discover_topics import gather_files, build_corpus, discover_topics, read_extra_stopwords
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk

## 1.b Configure paths

In [None]:
# Directory handed to gather_files() when the corpus is built.
policydocument_path = 'Policy-documents'
# Second source directory; not referenced by any other visible cell of this notebook.
slaverydocument_path = 'sources'

## 1.c Choose parameters

In [None]:
# Language name used for both the NLTK stop-word list and the Snowball stemmer.
language = 'dutch'
# File passed to read_extra_stopwords(); its entries are merged into the stop-word set.
extra_stopwords_path = 'stopwords_extra.txt'
# When False, no stemmer is created and None is passed to build_corpus() instead.
use_stemming = True
# Forwarded to discover_topics() and to lda.show_topics(num_topics=...).
num_topics = 5
# Forwarded to discover_topics(); presumably the number of LDA training
# passes -- confirm in scripts/discover_topics.py.
passes = 5
# Words shown per topic (show_topics num_words) and passed to extract_topic_words().
words_per_topic = 10

## 2.a Prepare stop words and stemmer

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords', quiet=True)

# Combine the built-in list for `language` with the project-specific extras.
stop_words = set(stopwords.words(language)) | set(read_extra_stopwords(extra_stopwords_path))

# Stemming is optional; None is passed downstream when it is switched off.
if use_stemming:
    stemmer = SnowballStemmer(language)
else:
    stemmer = None

## 2.b Build the corpus

In [None]:
# Collect the input documents found under the policy-document directory.
files = gather_files(policydocument_path)
# Build the corpus and its dictionary from those files, using the stop words
# and the (possibly None) stemmer prepared earlier in the notebook.
corpus, dictionary = build_corpus(files, stop_words, stemmer)

## 2.c Train the LDA model

In [None]:
# Train the model via the project's discover_topics() helper (imported from
# scripts/discover_topics.py); the returned object is queried with
# show_topics() when the topics are displayed.
lda = discover_topics(corpus, dictionary, num_topics, passes)

## 2.d Display topics

In [None]:
# Print the top words of every topic, one line per topic.
# Each item from show_topics(formatted=False) is unpacked as
# (topic_id, [(word, weight), ...]).
for i, topic in lda.show_topics(num_topics=num_topics, num_words=words_per_topic, formatted=False):
    # Join outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError on Python versions before 3.12.
    top_words = ', '.join(w for w, _ in topic)
    print(f'Topic {i}: {top_words}')

## 2.e Filter dominating terms

Use `filter_common_terms` to drop words shared across many topics so unique themes are easier to spot.

In [None]:
# NOTE(review): extract_topic_words and filter_common_terms are not imported
# by any visible cell of this notebook; add them to the matching import
# (presumably from scripts/) before running this cell -- confirm the module.

# NOTE(review): the unit of this threshold is defined by filter_common_terms,
# which is not visible here; if it counts topics, a value above num_topics
# would never drop anything -- confirm.
drop_common_threshold = 10

# Pull the top words per topic out of the model, then remove the terms that
# filter_common_terms considers too widespread.
topic_words = extract_topic_words(lda, words_per_topic)
filtered_topics = filter_common_terms(topic_words, drop_common_threshold)

for i, words in filtered_topics.items():
    # Join outside the f-string: reusing the enclosing quote character inside
    # a replacement field is a SyntaxError on Python versions before 3.12.
    joined_words = ', '.join(words)
    print(f'Filtered Topic {i}: {joined_words}')

## 2.f Save topic dictionaries

Label topics and save their word lists for later use.

In [None]:
from topic_dictionary import save_topic_words

# One label per topic; edit these to give the topics meaningful names.
policy_topic_labels = [
    'policy_topic1',
    'policy_topic2',
    'policy_topic3',
    'policy_topic4',
    'policy_topic5',
]

# Directory name handed to save_topic_words() as the output location.
dictionary_output_dir = 'Policy_topic_dictionaries'

save_topic_words(
    filtered_topics,
    policy_topic_labels,
    dictionary_output_dir,
)

## 3.a Evaluate coherence

Use Gensim's `CoherenceModel` to measure how well the topics hold together. Higher scores typically indicate clearer themes.

In [None]:
from gensim.models import CoherenceModel

# NOTE(review): tokenize, clean_text and extract_text_from_file are not
# imported by any visible cell of this notebook; add them to the matching
# import (presumably from scripts/discover_topics.py) -- confirm the module.

# Re-tokenize every input file with the stop words and stemmer used for the
# corpus; these token lists are passed to the coherence model as `texts`.
texts = [
    tokenize(clean_text(extract_text_from_file(f)), stop_words, stemmer)
    for f in files
]

# Walk the topics in key order instead of indexing 0..len-1, so the cell does
# not raise KeyError when the topic ids are not a contiguous range from zero.
coherence_topics = [filtered_topics[topic_id] for topic_id in sorted(filtered_topics)]

coherence_model = CoherenceModel(
    topics=coherence_topics,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
)
coherence = coherence_model.get_coherence()
print(f'Coherence: {coherence:.4f}')