# Build Dictionary Notebook

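This notebook replicates the functionality of `scripts/build_dictionary.py`. Run it from the directory that contains the `scripts/` folder: the import path and the input/output paths used below are all relative to the current working directory.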

## 1.a Import libraries

In [None]:
import os
import sys

# allow importing from scripts directory
sys.path.append('scripts')
from build_dictionary import gather_files, build_vocabulary, read_extra_stopwords
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import nltk

## 1.b Configure input and output paths

In [None]:
# Policy corpus: (input directory, dictionary file to write).
policy_documents_path, policy_dictionary_output = (
    'Policy-documents',
    'policy_dictionary.txt',
)

# Example values for the theory corpus, as the same (input, output) pair.
theory_documents_path, theory_dictionary_output = (
    'sources',
    'theory_dictionary.txt',
)

## 1.c Set processing options

In [None]:
# Language name handed to both the NLTK stop-word list and the Snowball stemmer.
language = 'dutch'
# When True a Snowball stemmer is created; otherwise the stemmer is None.
use_stemming = True
# Words with a count below this threshold are left out of the saved dictionary.
min_frequency = 1
# File passed to read_extra_stopwords() to extend the NLTK stop-word set.
extra_stopwords_path = 'stopwords_extra.txt'

## 2.a Prepare stop words and stemmer

In [None]:
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords', quiet=True)

# NLTK's list for the configured language, extended with the extra stop words
# loaded from extra_stopwords_path.
stop_words = set(stopwords.words(language))
stop_words |= set(read_extra_stopwords(extra_stopwords_path))

# Stemming is optional: downstream code receives None when it is disabled.
if use_stemming:
    stemmer = SnowballStemmer(language)
else:
    stemmer = None

## 2.b Build vocabulary from the documents

In [None]:
# Collect the input files for the policy corpus. gather_files is imported from
# scripts/build_dictionary.py, so its file-matching rules are defined there.
files = gather_files(policy_documents_path)
# Build the vocabulary from those files using the stop words and the (optional)
# stemmer prepared above. Presumably a word -> count mapping, since the save
# step filters it via .items() against min_frequency — confirm in build_vocabulary.
policy_vocab = build_vocabulary(files, stop_words, stemmer)

## 2.c Save the dictionary

In [None]:
# Keep only the words whose count reaches min_frequency, in sorted order.
words = sorted(w for w, c in policy_vocab.items() if c >= min_frequency)

# Write the dictionary as UTF-8 text, one word per line. The newline must be
# the "\n" escape inside the literal: a physical line break inside a
# single-quoted f-string is a SyntaxError.
with open(policy_dictionary_output, 'w', encoding='utf-8') as fh:
    fh.writelines(f'{w}\n' for w in words)