In [18]:
import pickle
import pandas as pd

from datasets import load_from_disk
from bertopic import BERTopic
from collections import Counter
import matplotlib.pyplot as plt

In [19]:
from sentence_transformers import SentenceTransformer

# Sentence encoder that is passed to BERTopic.load in the next cell.
# model_kwargs forwards torch_dtype="float16" to the underlying model loader
# (presumably to load the weights in half precision - confirm).
embedding_model = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1", model_kwargs={"torch_dtype": "float16"})

In [20]:
# Restore the topic model saved under "topic_model/reduced_model_14" and attach
# the sentence encoder created above as its embedding model.
loaded_model = BERTopic.load("topic_model/reduced_model_14", embedding_model=embedding_model)

In [21]:
# Location of the input dataset on disk; the same path plus a '-with-topics'
# suffix is used further down as the output location.
ds_path = "../data/datasets_all/all-texts"

In [22]:
# Read the dataset stored at ds_path.
ds = load_from_disk(ds_path)

In [23]:
# Show the dataset summary (feature names and number of rows).
print(ds)

Dataset({
    features: ['Source', 'ID', 'CleanedText'],
    num_rows: 607687
})


In [24]:
# embeddings = embedding_model.encode(ds["CleanedText"], show_progress_bar=True)

In [25]:
# with open("all-texts-embeds.pkl", 'wb') as p:
#     pickle.dump(embeddings, p)

In [26]:
# Load the document embeddings cached in "all-texts-embeds.pkl".
# NOTE: pickle.load can execute arbitrary code while unpickling; only load this
# file if it was produced locally by a trusted run.
with open("all-texts-embeds.pkl", 'rb') as p:
    embeddings = pickle.load(p)

# The cache is paired with the dataset purely by row position, so it must have
# been computed from this exact dataset. A differing row count is the one
# mismatch that can be detected cheaply here, so fail early on it.
if len(embeddings) != len(ds):
    raise ValueError(
        f"all-texts-embeds.pkl holds {len(embeddings)} embeddings "
        f"but the dataset has {len(ds)} rows"
    )

In [27]:
# Assign a topic to every text. The cached embeddings are passed in alongside
# the texts (presumably so the texts are not re-encoded here - confirm); the
# two are matched by row position.
topics, probs = loaded_model.transform(documents=ds['CleanedText'], embeddings=embeddings)

2025-09-18 11:14:23,026 - BERTopic - Predicting topic assignments through cosine similarity of topic and document embeddings.


In [28]:
# Make 'notebook' the default plotly renderer for figures displayed below.
import plotly.io as pio
pio.renderers.default = 'notebook'

In [29]:
# Display the mapping from topic id to its label string
# (labels look like "<id>_<word>_<word>_...").
loaded_model.topic_labels_

{0: '0_ende_sal_ofte_syn',
 1: '1_chinezen_chineesche_chineesen_chin',
 2: '2_houtwerken_hout_bosschen_boomen',
 3: '3_hospitaal_hospitalen_zieken_chirurgyns',
 4: '4_java_en_te_op',
 5: '5_maand_maanden_zal_van',
 6: '6_wagt_officier_officieren_art',
 7: '7_te_dat_en_zyn',
 8: '8_schepen_schip_zal_of',
 9: '9_suiker_molens_suikermolens_molenaars',
 10: '10_zilver_munt_goud_geld',
 11: '11_raad_justitie_leden_den',
 12: '12_het_van_is_den',
 13: '13_ende_schepen_aen_sullen',
 14: '14_kerk_kerken_gemeente_gereformeerde',
 15: '15_corporaals_lieutenant_20_24',
 16: '16_batavia_stad_van_ommelanden',
 17: '17_paarden_paard_wagens_wagen',
 18: '18_zout_12_kannen_dm',
 19: '19_brand_spuiten_no_spuyten',
 20: '20_militairen_officieren_inlandsche_militaire',
 21: '21_secretaris_secretarissen_van_off',
 22: '22_bat_iv_bepaling_comp',
 23: '23_christenen_christen_doop_religie',
 24: '24_kinderen_art_moeder_kind',
 25: '25_ende_india_generael_syn',
 26: '26_malacca_vaart_naar_java',
 27: '27_jaar

In [33]:
# List the topic labels that contain one of the French / Napoleon related
# substrings.
keywords = ('frans', 'napo')
[
    label
    for label in loaded_model.topic_labels_.values()
    if any(keyword in label for keyword in keywords)
]

['89_majesteit_koning_holland_napoleon',
 '92_oorlog_engeland_republiek_fransche',
 '135_fransche_franschen_engelschen_fransch']

In [37]:
# Check which Python type the stored topic ids have.
# NOTE(review): this cell was executed as In [37], i.e. after the cell that
# adds the "Topic" column (In [31]), but it is placed before that cell in the
# notebook. Run top to bottom on a fresh kernel, `ds` has no "Topic" column
# yet at this point and this lookup fails - move it below the add_column cell.
type(ds['Topic'][0])

int

In [31]:
# Attach the predicted topic of every document as a "Topic" column and write
# the result to disk next to the input dataset.
# add_column rejects a column name that already exists, and this cell rebinds
# `ds`, so re-running it would fail; drop a "Topic" column left over from a
# previous run first to keep the cell re-runnable.
if "Topic" in ds.column_names:
    ds = ds.remove_columns("Topic")
ds = ds.add_column("Topic", topics)
ds.save_to_disk(ds_path+'-with-topics')

Saving the dataset (0/1 shards):   0%|          | 0/607687 [00:00<?, ? examples/s]

In [32]:
# Display the model's topic visualisation.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the run was stopped before a figure was produced.
loaded_model.visualize_topics()


KeyboardInterrupt



In [None]:
# Display the model's hierarchy visualisation of the topics (no output was
# recorded for this cell).
loaded_model.visualize_hierarchy()

In [None]:
# interesting topics: 12, 27, 65
# the delpher data still seems a little bad in quality
# could be interesting to do further topic modelling on topic 0

In [None]:
# Print the 'Representation' entry of topic 12 (one of the topics singled out
# as interesting above); [0] selects the row labelled 0 of the returned info.
print(loaded_model.get_topic_info(12)['Representation'][0])

In [None]:
# Display the full info returned for topic 27 (one of the topics singled out
# as interesting above).
loaded_model.get_topic_info(27)

In [None]:
# Print the 'Representation' entry of topic 65 (one of the topics singled out
# as interesting above); [0] selects the row labelled 0 of the returned info.
print(loaded_model.get_topic_info(65)['Representation'][0])

In [None]:
def plot_yearly_topic(topic, df_topics, show_all=True):
    """Draw a bar chart of how many rows per year are assigned to ``topic``.

    The bars are drawn through ``plt`` on the current matplotlib axes; the
    function neither creates nor shows a figure.

    Parameters
    ----------
    topic
        Value compared with ``df_topics["topic"]`` using ``==``.
    df_topics : pandas.DataFrame
        Frame with a ``"year"`` and a ``"topic"`` column. ``"year"`` has to be
        numeric when ``show_all`` is true, because the bars are shifted by
        0.2 to either side of each year.
    show_all : bool, default True
        If true, draw the per-year count of all rows next to the per-year
        count of ``topic``. Otherwise draw only the count of ``topic``.

    Returns
    -------
    None
    """

    def _counts_per_year(frame):
        # Name both columns explicitly instead of relying on the names that
        # value_counts() gives its result: before pandas 2.0 the counts were
        # named after the column ("year") and the index had no name, so
        # to_frame().reset_index() produced no "count" column at all.
        return (
            frame["year"]
            .value_counts()
            .sort_index()
            .rename_axis("year")
            .reset_index(name="count")
        )

    topic_counts = _counts_per_year(df_topics[df_topics["topic"] == topic])
    if show_all:
        all_counts = _counts_per_year(df_topics)
        # Two bars of width 0.4 per year, centred 0.2 left and right of it.
        plt.bar(all_counts["year"] - 0.2, all_counts["count"], 0.4)
        plt.bar(topic_counts["year"] + 0.2, topic_counts["count"], 0.4)
        plt.legend(["All", f"Topic {topic} mentions"])
    else:
        plt.bar(topic_counts["year"], topic_counts["count"])
        plt.legend([f"{topic} mentions"])
    plt.grid()

In [None]:
def plot_topics(topics, df_topics, title):
    """Show a horizontal bar chart of yearly row counts for selected topics.

    Parameters
    ----------
    topics
        Collection of topic values to keep; passed to ``Series.isin``.
    df_topics : pandas.DataFrame
        Frame with a ``"year"`` and a ``"topic"`` column; other columns are
        ignored.
    title : str
        Title placed on the chart.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``.
    """
    selected = df_topics.loc[df_topics["topic"].isin(topics), ["year", "topic"]]
    # Both remaining columns are grouping keys, so what is wanted is simply the
    # number of rows per (year, topic) pair: size() states that directly.
    # Pairs that never occur get a count of 0 rather than NaN.
    counts = selected.groupby(["year", "topic"]).size().unstack(fill_value=0)
    # One row per year, one column (bar series) per topic. Set the title on
    # the axes that was just drawn rather than on pyplot's current axes.
    ax = counts.plot.barh()
    ax.set_title(title)
    plt.show()