# Discover Topics Notebook

This notebook mirrors `scripts/discover_topics.py` for LDA topic modeling.

## 1.a Import libraries

In [None]:
import os
import sys

sys.path.append('scripts')
from discover_topics import gather_files, build_corpus, discover_topics, read_extra_stopwords
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk

## 1.b Configure paths

In [None]:
# Input locations, given as relative paths.
# `slaverydocument_path` is defined here but not read by the cells shown in
# this notebook; `policydocument_path` is the one handed to `gather_files`.
slaverydocument_path = "sources"
policydocument_path = "Policy-documents"

## 1.c Choose parameters

In [None]:
# --- Text preprocessing ---
# `language` selects both the NLTK stop-word list and the Snowball stemmer.
language = "dutch"
# File passed to `read_extra_stopwords`; its contents are added to the stop words.
extra_stopwords_path = "stopwords_extra.txt"
# When False, no stemmer is created and `None` is passed on instead.
use_stemming = True

# --- Topic model ---
# `num_topics` and `passes` are forwarded to `discover_topics`.
num_topics = 5
passes = 5
# Number of words printed per topic when the topics are displayed.
words_per_topic = 10

## 2.a Prepare stop words and stemmer

In [None]:
# Make sure the NLTK stop-word corpus is available locally (no console output).
nltk.download('stopwords', quiet=True)

# Merge the built-in stop words for `language` with the project-specific extras
# into a single set.
stop_words = {
    *stopwords.words(language),
    *read_extra_stopwords(extra_stopwords_path),
}

# Stemming is optional: downstream code receives None when it is switched off.
if use_stemming:
    stemmer = SnowballStemmer(language)
else:
    stemmer = None

## 2.b Build the corpus

In [None]:
# Collect the input documents from `policydocument_path` using the helper
# imported from scripts/discover_topics.py.
# NOTE(review): presumably returns a collection of file paths — confirm
# against `gather_files`.
files = gather_files(policydocument_path)
# Build the corpus and its dictionary from those files, passing along the
# stop-word set and the stemmer (which is None when stemming is disabled).
corpus, dictionary = build_corpus(files, stop_words, stemmer)

## 2.c Train the LDA model

In [None]:
# Train the LDA model on the corpus/dictionary pair with the configured number
# of topics and training passes; `lda` is used below to display the topics.
lda = discover_topics(corpus, dictionary, num_topics, passes)

## 2.d Display topics

In [None]:
# Print one line per topic: the topic index followed by its top words.
# With formatted=False each topic is unpacked as (index, pairs), where every
# pair's first element is the word; the second element is ignored here.
for i, topic in lda.show_topics(num_topics=num_topics, num_words=words_per_topic, formatted=False):
    # Join outside the f-string: reusing the f-string's own quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    top_words = ', '.join(w for w, _ in topic)
    print(f'Topic {i}: {top_words}')