In [1]:
import pickle
import pandas as pd

from data.data import Delpher, DBNL, Wikipedia, Plakaatboeken
from bertopic import BERTopic
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
from sentence_transformers import SentenceTransformer

# Sentence encoder handed to the topic model below; weights are requested as float16.
EMBEDDING_MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v1"
embedding_model = SentenceTransformer(
    EMBEDDING_MODEL_NAME,
    model_kwargs={"torch_dtype": "float16"},
)

In [3]:
# Restore the saved topic model from disk and attach the sentence encoder to it.
loaded_model = BERTopic.load(
    "models/full_model_15",
    embedding_model=embedding_model,
)

In [4]:
# Calendar years covered by the analysis: 1805 through 1819 inclusive.
years = list(range(1805, 1820))

In [6]:
MIN_TEXT_LENGTH = 100  # a document is kept only when len(CleanedText) is strictly greater


def _is_long_enough(example):
    """Return True when the example's cleaned text is longer than MIN_TEXT_LENGTH."""
    return len(example["CleanedText"]) > MIN_TEXT_LENGTH


def _in_period_and_long_enough(example):
    """Return True when the example's year is in `years` and its text is long enough."""
    return example["Year"] in years and _is_long_enough(example)


ds_delpher = Delpher().dataset.filter(_in_period_and_long_enough)
ds_dbnl = DBNL().dataset.filter(_in_period_and_long_enough)
# Wikipedia is filtered on text length only, not on year.
ds_wiki = Wikipedia(language='nl').dataset.filter(_is_long_enough)
ds_plakaatboek = Plakaatboeken().dataset.filter(_in_period_and_long_enough)

Cleaning data since cleaned version not found


Saving the dataset (0/1 shards):   0%|          | 0/30244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30244 [00:00<?, ? examples/s]

In [None]:
# Number of documents kept per corpus after filtering. All four corpora are
# reported (Delpher was previously missing) and each count is labelled.
{
    "delpher": len(ds_delpher),
    "dbnl": len(ds_dbnl),
    "wiki": len(ds_wiki),
    "plakaatboek": len(ds_plakaatboek),
}

In [None]:
# delpher_embeddings = embedding_model.encode(ds_delpher["CleanedText"], show_progress_bar=True)
# with open("dataset_embeds_distiluse-base-multilingual-cased-v1/delpher_embeds.pkl", 'wb') as p:
#     pickle.dump(delpher_embeddings, p)

# dbnl_embeddings = embedding_model.encode(ds_dbnl["CleanedText"], show_progress_bar=True)
# with open("dataset_embeds_distiluse-base-multilingual-cased-v1/dbnl_embeds.pkl", 'wb') as p:
#     pickle.dump(dbnl_embeddings, p)
# 
# wiki_embeddings = embedding_model.encode(ds_wiki["CleanedText"], show_progress_bar=True)
# with open("dataset_embeds_distiluse-base-multilingual-cased-v1/wiki_embeds.pkl", 'wb') as p:
#     pickle.dump(wiki_embeddings, p)
# 
# plakaatboek_embeddings = embedding_model.encode(ds_plakaatboek["CleanedText"], show_progress_bar=True)
# with open("dataset_embeds_distiluse-base-multilingual-cased-v1/plakaatboeken_embeds.pkl", 'wb') as p:
#     pickle.dump(plakaatboek_embeddings, p)

In [None]:
EMBEDS_DIR = "dataset_embeds_distiluse-base-multilingual-cased-v1"


def _load_cached_embeddings(corpus_name, dataset):
    """Load the pickled embeddings of one corpus and check them against its dataset.

    Parameters
    ----------
    corpus_name : str
        Prefix of the cache file, read from ``EMBEDS_DIR/<corpus_name>_embeds.pkl``.
    dataset
        The filtered dataset the embeddings are meant to describe; only its
        length is used here.

    Returns
    -------
    The unpickled embeddings object.

    Raises
    ------
    ValueError
        If the number of cached embeddings differs from the number of documents,
        which means the cache was built from another corpus or another filter.
    """
    # pickle.load can execute arbitrary code: only load cache files produced by this project.
    with open(f"{EMBEDS_DIR}/{corpus_name}_embeds.pkl", 'rb') as p:
        embeddings = pickle.load(p)
    if len(embeddings) != len(dataset):
        raise ValueError(
            f"{corpus_name}: {len(embeddings)} cached embeddings but "
            f"{len(dataset)} documents; regenerate the embedding cache."
        )
    return embeddings


delpher_embeddings = _load_cached_embeddings("delpher", ds_delpher)
dbnl_embeddings = _load_cached_embeddings("dbnl", ds_dbnl)
wiki_embeddings = _load_cached_embeddings("wiki", ds_wiki)
plakaatboek_embeddings = _load_cached_embeddings("plakaatboeken", ds_plakaatboek)

In [None]:
def _assign_topics(dataset, embeddings):
    """Run the loaded topic model on one corpus, passing its precomputed embeddings."""
    return loaded_model.transform(
        documents=dataset['CleanedText'],
        embeddings=embeddings,
    )


delpher_topics, delpher_probs = _assign_topics(ds_delpher, delpher_embeddings)
dbnl_topics, dbnl_probs = _assign_topics(ds_dbnl, dbnl_embeddings)
wiki_topics, wiki_probs = _assign_topics(ds_wiki, wiki_embeddings)
plakaatboeken_topics, plakaatboeken_probs = _assign_topics(ds_plakaatboek, plakaatboek_embeddings)

In [None]:
def _topic_frequencies(topics):
    """Map each assigned topic to its number of occurrences, most frequent first."""
    ranked = Counter(topics).most_common()
    return dict(ranked)


delpher_topic_freq = _topic_frequencies(delpher_topics)
dbnl_topic_freq = _topic_frequencies(dbnl_topics)
wiki_topic_freq = _topic_frequencies(wiki_topics)
plakaatboeken_topic_freq = _topic_frequencies(plakaatboeken_topics)

In [None]:
import plotly.io as pio
# Make 'notebook' the default plotly renderer for the figures displayed below.
pio.renderers.default = 'notebook'

In [None]:
# Display the loaded model's topic_labels_ mapping.
loaded_model.topic_labels_

In [None]:
# Topic labels containing either substring (case-sensitive match).
label_substrings = ('slav', 'napo')
[
    label
    for label in loaded_model.topic_labels_.values()
    if any(fragment in label for fragment in label_substrings)
]

In [None]:
# Render the model's built-in topic visualisation as the cell output.
loaded_model.visualize_topics()

In [None]:
# Render the model's built-in topic hierarchy visualisation as the cell output.
loaded_model.visualize_hierarchy()

In [None]:
# Topic -> number of Delpher documents, ordered from most to least frequent.
delpher_topic_freq

In [None]:
# Topic -> number of DBNL documents, ordered from most to least frequent.
dbnl_topic_freq

In [None]:
# Topic -> number of Wikipedia documents, ordered from most to least frequent.
wiki_topic_freq

In [None]:
# Topic -> number of Plakaatboeken documents, ordered from most to least frequent.
plakaatboeken_topic_freq

In [None]:
# Interesting topics: 12, 27, 65.
# Delpher still seems to be of rather poor quality.
# It could be interesting to run further topic modelling on topic 0.

In [None]:
# Print the 'Representation' entry of topic 12. Positional .iloc[0] is used so the
# lookup does not depend on the index labels of the frame returned by get_topic_info.
print(loaded_model.get_topic_info(12)['Representation'].iloc[0])

In [None]:
# Show the model's topic info for topic 27.
loaded_model.get_topic_info(27)

In [None]:
# Print the 'Representation' entry of topic 65. Positional .iloc[0] is used so the
# lookup does not depend on the index labels of the frame returned by get_topic_info.
print(loaded_model.get_topic_info(65)['Representation'].iloc[0])

In [None]:
def _topics_frame(topics, dataset, source):
    """Build one row per document: assigned topic, cleaned text, year and corpus tag."""
    return pd.DataFrame(
        {
            "topic": topics,
            "text": dataset["CleanedText"],
            "year": dataset["Year"],
            "ds": source,
        }
    )


df_topics_delpher = _topics_frame(delpher_topics, ds_delpher, 'delpher')
df_topics_dbnl = _topics_frame(dbnl_topics, ds_dbnl, 'dbnl')
# NOTE(review): ds_wiki is not filtered on "Year" when it is loaded; confirm that
# its "Year" column exists and is meaningful before relying on per-year plots.
df_topics_wiki = _topics_frame(wiki_topics, ds_wiki, 'wiki')
df_topics_plakaatboeken = _topics_frame(plakaatboeken_topics, ds_plakaatboek, 'plakaatboek')

In [None]:
# n = 12
# df_topics_delpher_n = df_topics_delpher[(df_topics_delpher["topic"]==n)]
# df_topics_dbnl_n = df_topics_dbnl[(df_topics_dbnl["topic"]==n)]
# df_topics_plakaatboeken_n = df_topics_plakaatboeken[(df_topics_plakaatboeken["topic"]==n)]

In [None]:
# df_topics_delpher_n.to_csv(f"example_csvs/delpher{n}.csv")
# df_topics_dbnl_n.to_csv(f"example_csvs/dbnl{n}.csv")
# df_topics_plakaatboeken_n.to_csv(f"example_csvs/placaatboek{n}.csv")

In [None]:
def plot_yearly_topic(topic, df_topics, show_all=True, title=None):
    """Draw a bar chart of the number of rows per year assigned to one topic.

    Parameters
    ----------
    topic
        Topic id to select; rows are kept where ``df_topics["topic"] == topic``.
    df_topics : pandas.DataFrame
        Frame with at least a numeric ``year`` column and a ``topic`` column.
    show_all : bool, default True
        If True, the per-year count of all rows is drawn next to the topic bars.
    title : str, optional
        Figure title; no title is set when omitted.

    Returns
    -------
    None
        The bars are drawn through the pyplot interface (current axes);
        ``plt.show()`` is not called here.
    """
    def _counts_per_year(year_values):
        # Name both columns explicitly: the labels produced by a bare
        # value_counts().to_frame().reset_index() differ between pandas 1.x and 2.x.
        return (
            year_values.value_counts()
            .sort_index()
            .rename_axis("year")
            .reset_index(name="count")
        )

    all_counts = _counts_per_year(df_topics["year"])
    topic_counts = _counts_per_year(df_topics.loc[df_topics["topic"] == topic, "year"])

    if show_all:
        # Shift each series by half a bar width so the two bars sit side by side.
        plt.bar(all_counts["year"] - 0.2, all_counts["count"], 0.4)
        plt.bar(topic_counts["year"] + 0.2, topic_counts["count"], 0.4)
        plt.legend(["All", f"Topic {topic} mentions"])
    else:
        plt.bar(topic_counts["year"], topic_counts["count"])
        plt.legend([f"{topic} mentions"])

    plt.xlabel("Year")
    plt.ylabel("Number of documents")
    if title is not None:
        plt.title(title)
    plt.grid()

In [None]:
def plot_topics(topics, df_topics, title):
    """Plot, per year, how many rows were assigned to each of the given topics.

    Parameters
    ----------
    topics : list
        Topic ids to include; rows are kept where ``df_topics["topic"]`` is in it.
    df_topics : pandas.DataFrame
        Frame with at least ``year`` and ``topic`` columns.
    title : str
        Title of the figure.

    Returns
    -------
    None
        Shows a horizontal grouped bar chart (one group per year, one bar per
        topic). When no row matches ``topics`` a message is printed and nothing
        is drawn, instead of failing inside the pandas plotting code.
    """
    selected = df_topics.loc[df_topics["topic"].isin(topics), ["year", "topic"]]
    if selected.empty:
        print(f"No documents assigned to topics {list(topics)} in {title}; nothing to plot.")
        return

    # size() counts rows per (year, topic) pair; combinations that do not occur are
    # filled with 0 so every year shows a slot for every selected topic.
    counts = selected.groupby(["year", "topic"]).size().unstack(fill_value=0)
    ax = counts.plot.barh()
    ax.set_xlabel("Number of documents")
    ax.set_ylabel("Year")
    ax.set_title(title)
    plt.show()

In [None]:
# all_topics_df = pd.concat([df_topics_delpher, df_topics_wiki, df_topics_dbnl, df_topics_plakaatboeken])

In [None]:
# Per-year counts of topics 12, 27 and 65 in the DBNL frame.
plot_topics(topics=[12, 27, 65], df_topics=df_topics_dbnl, title='DBNL')

In [None]:
# Per-year counts of topics 12, 27 and 65 in the Delpher frame.
plot_topics(topics=[12, 27, 65], df_topics=df_topics_delpher, title='Delpher')

In [None]:
# Per-year counts of topics 12, 27 and 65 in the Wikipedia frame.
plot_topics(topics=[12, 27, 65], df_topics=df_topics_wiki, title='Wiki')

In [None]:
# Per-year counts of topics 12, 27 and 65 in the Plakaatboeken frame.
plot_topics(topics=[12, 27, 65], df_topics=df_topics_plakaatboeken, title='Plakaatboeken')

In [None]:
# Inspect the DBNL frame (columns: topic, text, year, ds).
df_topics_dbnl

In [None]:
# Topic 12 per year in Delpher, without the all-documents bars.
plot_yearly_topic(topic=12, df_topics=df_topics_delpher, show_all=False)

In [None]:
# Topic 12 per year in DBNL, next to the per-year count of all documents.
plot_yearly_topic(topic=12, df_topics=df_topics_dbnl)

In [None]:
# Topic 12 per year in Wikipedia, without the all-documents bars.
plot_yearly_topic(topic=12, df_topics=df_topics_wiki, show_all=False)

In [None]:
# Topic 12 per year in Plakaatboeken, next to the per-year count of all documents.
plot_yearly_topic(topic=12, df_topics=df_topics_plakaatboeken)