In [84]:
from glob import glob
from datasets import Dataset
import pickle
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from nltk.corpus import stopwords
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import  ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [85]:
import plotly.io as pio
pio.renderers.default = "colab"

In [86]:
stoplist = list(set(stopwords.words('dutch'))) + list(set(stopwords.words('english'))) + list(set(stopwords.words('french')))
stoplist.extend(['pct', 'idem', 'wy', 'zyn' ,'zy','hy', 'pcts', 'dito', 'zoo', 'alhier', 'den', 'eene', 'ten', 'ter', 'zeer', 'eenen'])

In [87]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1", model_kwargs={"torch_dtype": "float16"})
# TODO: truncate texts?

In [88]:
with open("../data/raw/placaatboek/all_vers_preprocessed", 'rb') as b:
    dfs = pickle.load(b)

In [89]:
texts = []
for df in dfs:
    texts.extend(df['legislation'])

In [90]:
print(len(texts))

28487


In [91]:
embeddings = embedding_model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/891 [00:00<?, ?it/s]

In [92]:
# with open("plakaatboeken_embeddings.pkl", 'wb') as p:
#     pickle.dump(embeddings, p)

In [93]:
with open("plakaatboeken_embeddings.pkl", 'rb') as p:
    embeddings = pickle.load(p)

In [94]:
params = {  
            # TFIDF
            "reduce_frequent_words": True, "bm25_weighting": True,   
            "seed_words": [],
            "seed_multiplier": 4,
            # UMAP
            "n_neighbors": 10, "n_components": 5, "min_dist": 0.0, "metric_umap": "cosine", "random_state": 42,
            # HDBSCAN (change min_cluster_size for more/less topics?, default is 10, recommended to only increase above 10)
            "min_cluster_size": 14, "metric_hbd": "euclidean", "cluster_selection_method": "eom", "prediction_data": True,
            # Vectorizer model
            "stop_words": stoplist, "min_df": 2, "ngram_range": (1,3),
            # Representation models
            "diversity": 0.3
         }

In [95]:
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=params["reduce_frequent_words"], bm25_weighting=params["bm25_weighting"], 
                                     seed_words=params['seed_words'], seed_multiplier=params["seed_multiplier"])

In [96]:
umap_model = UMAP(n_neighbors=params["n_neighbors"], 
                  n_components=params["n_components"], 
                  min_dist=params["min_dist"], 
                  metric=params["metric_umap"], 
                  random_state=params["random_state"])

In [97]:
hdbscan_model = HDBSCAN(min_cluster_size=params["min_cluster_size"],
                        metric=params["metric_hbd"], 
                        cluster_selection_method=params["cluster_selection_method"], 
                        prediction_data=params["prediction_data"])

In [98]:
vectorizer_model = CountVectorizer(stop_words=params["stop_words"], 
                                   min_df=params["min_df"], 
                                   ngram_range=params["ngram_range"])

In [99]:
representation_models = [MaximalMarginalRelevance(diversity=params["diversity"])]

In [100]:
topic_model = BERTopic(

    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_models,
    top_n_words=5,
    verbose=True,
    ctfidf_model=ctfidf_model,
    # nr_topics="auto",
    calculate_probabilities=True,
)

# Train model
topics, probs = topic_model.fit_transform(texts, embeddings)

2025-09-18 11:09:32,378 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-18 11:09:44,612 - BERTopic - Dimensionality - Completed ✓
2025-09-18 11:09:44,612 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-18 11:10:17,657 - BERTopic - Cluster - Completed ✓
2025-09-18 11:10:17,689 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-18 11:10:26,557 - BERTopic - Representation - Completed ✓


In [101]:
set(topics)

{-1,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155}

In [102]:
topic_model.visualize_topics()

In [103]:
print(topic_model.get_topic_freq(-1), len(texts))

12777 28487


In [104]:
topic_model.visualize_hierarchy(orientation='bottom')

In [105]:
[x for x in topic_model.topic_labels_.values() if 'napo' in x or 'slav' in x]

['69_slaven_compagnie slaven_overige slaven_slaven slavinnen',
 '83_balische slaven_aanvoer slaven_slaven batavia_oostersche slaven',
 '89_napoleon_louis napoleon_majesteit louis napoleon_louis napoleon koning',
 '146_ieder zieke_zes slaven_24 maand_maand gespecificeerde',
 '150_volgende voorstel_mansslaven_580 toegen gaan_zwaardere opgelegde straf']

In [106]:
topic_model.visualize_barchart()

In [107]:
topic_model.save("topic_model/full_model_14", serialization="safetensors", save_ctfidf=True, 
                 save_embedding_model="sentence-transformers/distiluse-base-multilingual-cased-v1")

## Reducing outliers (-1 labels)

In [108]:
topic_model = BERTopic.load("topic_model/full_model_14")
print(topic_model.get_topic_freq(-1), len(texts))

12777 28487


In [109]:
print(topic_model.topic_sizes_[89], topic_model.topic_sizes_[92], topic_model.topic_sizes_[135])

32 31 18


In [110]:
new_topics = topic_model.reduce_outliers(texts, topics, strategy="embeddings")

In [111]:
topic_model.update_topics(texts, topics=new_topics)



In [112]:
[x for x in topic_model.topic_labels_.values() if 'napo' in x or 'fransc' in x]

['89_majesteit_koning_holland_napoleon',
 '92_oorlog_engeland_republiek_fransche',
 '135_fransche_franschen_engelschen_fransch']

In [113]:
topic_model.get_topic_freq(-1)

0

In [114]:
topic_model.visualize_topics()

In [115]:
topic_model.visualize_hierarchy(orientation='bottom')

In [116]:
topic_model.visualize_barchart()

In [117]:
topic_model.save("topic_model/reduced_model_14", serialization="safetensors", save_ctfidf=True, 
                 save_embedding_model="sentence-transformers/distiluse-base-multilingual-cased-v1")

In [118]:
print(topic_model.topic_sizes_[89], topic_model.topic_sizes_[92], topic_model.topic_sizes_[135])

54 78 44


In [119]:
topic_model.topic_labels_

{0: '0_ende_sal_ofte_syn',
 1: '1_chinezen_chineesche_chineesen_chin',
 2: '2_houtwerken_hout_bosschen_boomen',
 3: '3_hospitaal_hospitalen_zieken_chirurgyns',
 4: '4_java_en_te_op',
 5: '5_maand_maanden_zal_van',
 6: '6_wagt_officier_officieren_art',
 7: '7_te_dat_en_zyn',
 8: '8_schepen_schip_zal_of',
 9: '9_suiker_molens_suikermolens_molenaars',
 10: '10_zilver_munt_goud_geld',
 11: '11_raad_justitie_leden_den',
 12: '12_het_van_is_den',
 13: '13_ende_schepen_aen_sullen',
 14: '14_kerk_kerken_gemeente_gereformeerde',
 15: '15_corporaals_lieutenant_20_24',
 16: '16_batavia_stad_van_ommelanden',
 17: '17_paarden_paard_wagens_wagen',
 18: '18_zout_12_kannen_dm',
 19: '19_brand_spuiten_no_spuyten',
 20: '20_militairen_officieren_inlandsche_militaire',
 21: '21_secretaris_secretarissen_van_off',
 22: '22_bat_iv_bepaling_comp',
 23: '23_christenen_christen_doop_religie',
 24: '24_kinderen_art_moeder_kind',
 25: '25_ende_india_generael_syn',
 26: '26_malacca_vaart_naar_java',
 27: '27_jaar