In [2]:
%%capture
!pip install bertopic

In [3]:
import json

In [4]:
def load_data(filepath):
    with open(filepath, 'r', encoding="utf8") as fp:
        data = [json.loads(line) for line in fp]
        return [item['text'] for item in data if 'text' in item]

In [5]:
def load_data_v2(filepath):
    with open(filepath, 'r', encoding="utf8") as fp:
        return [json.loads(line) for line in fp]

In [6]:
docs = load_data("speeches_20.jsonl")

In [7]:
from bertopic import BERTopic

topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(docs)

Downloading (…)0fe39/.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)83e900fe39/README.md:   0%|          | 0.00/3.79k [00:00<?, ?B/s]

Downloading (…)e900fe39/config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading (…)900fe39/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Batches:   0%|          | 0/338 [00:00<?, ?it/s]

2023-09-25 22:44:23,031 - BERTopic - Transformed documents to Embeddings
2023-09-25 22:44:59,615 - BERTopic - Reduced dimensionality
2023-09-25 22:45:09,551 - BERTopic - Clustered reduced embeddings


In [8]:
freq = topic_model.get_topic_info(); freq.head(10)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4031,-1_die_der_und_das,"[die, der, und, das, wir, in, sie, ist, es, auch]",[Sehr geehrte Frau Präsidentin! Liebe Kollegin...
1,0,379,0_bildung_kinder_und_die,"[bildung, kinder, und, die, das, für, in, bafö...",[Frau Präsidentin! Sehr geehrte Kolleginnen un...
2,1,376,1_ukraine_und_die_der,"[ukraine, und, die, der, in, wir, krieg, ist, ...",[Frau Präsidentin! Liebe Kolleginnen und Kolle...
3,2,346,2_menschen_und_die_sie,"[menschen, und, die, sie, in, der, das, ist, e...",[Frau Präsidentin! Liebe Kolleginnen und Kolle...
4,3,322,3_impfpflicht_die_wir_impfung,"[impfpflicht, die, wir, impfung, sie, und, pan...",[Sehr geehrte Frau Präsidentin! Meine Damen un...
5,4,319,4_euro_haushalt_die_sie,"[euro, haushalt, die, sie, der, das, wir, für,...",[Sehr geehrte Frau Präsidentin! Meine Damen un...
6,5,247,5_pflege_die_wir_der,"[pflege, die, wir, der, und, in, sie, patiente...",[Sehr geehrte Frau Präsidentin! Lieber Gesundh...
7,6,181,6_wir_die_und_in,"[wir, die, und, in, der, das, ist, auch, für, ...",[Sehr geehrte Kolleginnen und Kollegen! Insbes...
8,7,153,7_die_in_wir_der,"[die, in, wir, der, das, und, deutschland, sie...",[Sehr geehrte Frau Präsidentin! Meine Damen un...
9,8,135,8_bundeswehr_soldaten_ukraine_der,"[bundeswehr, soldaten, ukraine, der, die, und,...",[Frau Präsidentin! Meine sehr geehrten Damen u...


-1 refers to all outliers and should typically be ignored. Next, let's take a look at a frequent topic that were generated:

In [9]:
topic_model.get_topic(0)  # Select the most frequent topic

[('bildung', 0.010275061742121068),
 ('kinder', 0.010111806304540706),
 ('und', 0.007864590728803848),
 ('die', 0.0075571229192822565),
 ('das', 0.007489279426862581),
 ('für', 0.007436199553693426),
 ('in', 0.006966213778802973),
 ('bafög', 0.00693495616194467),
 ('sie', 0.006811057958597566),
 ('der', 0.0067508788828815)]

In [None]:
### Attributes

## Attributes

There are a number of attributes that you can access after having trained your BERTopic model:


| Attribute | Description |
|------------------------|---------------------------------------------------------------------------------------------|
| topics_               | The topics that are generated for each document after training or updating the topic model. |
| probabilities_ | The probabilities that are generated for each document if HDBSCAN is used. |
| topic_sizes_           | The size of each topic                                                                      |
| topic_mapper_          | A class for tracking topics and their mappings anytime they are merged/reduced.             |
| topic_representations_ | The top *n* terms per topic and their respective c-TF-IDF values.                             |
| c_tf_idf_              | The topic-term matrix as calculated through c-TF-IDF.                                       |
| topic_labels_          | The default labels for each topic.                                                          |
| custom_labels_         | Custom labels for each topic as generated through `.set_topic_labels`.                                                               |
| topic_embeddings_      | The embeddings for each topic if `embedding_model` was used.                                                              |
| representative_docs_   | The representative documents for each topic if HDBSCAN is used.                                                |

For example, to access the predicted topics for the first 10 documents, we simply run the following:

In [10]:
topic_model.topics_[:10]

[2, 1, -1, 8, 8, 1, 93, 1, -1, -1]

# **Visualization**
There are several visualization options available in BERTopic, namely the visualization of topics, probabilities and topics over time. Topic modeling is, to a certain extent, quite subjective. Visualizations help understand the topics that were created.

## Visualize Topics
After having trained our `BERTopic` model, we can iteratively go through perhaps a hundred topic to get a good
understanding of the topics that were extract. However, that takes quite some time and lacks a global representation.
Instead, we can visualize the topics that were generated in a way very similar to
[LDAvis](https://github.com/cpsievert/LDAvis):

In [11]:
topic_model.visualize_topics()

## Visualize Topic Probabilities

The variable `probabilities` that is returned from `transform()` or `fit_transform()` can
be used to understand how confident BERTopic is that certain topics can be found in a document.

To visualize the distributions, we simply call:

In [13]:
topic_model.visualize_distribution(probs[200])

ValueError: ignored

## Visualize Topic Hierarchy

The topics that were created can be hierarchically reduced. In order to understand the potential hierarchical structure of the topics, we can use scipy.cluster.hierarchy to create clusters and visualize how they relate to one another. This might help selecting an appropriate nr_topics when reducing the number of topics that you have created.

In [14]:
topic_model.visualize_hierarchy(top_n_topics=50)

In [15]:
topic_model.get_document_info(docs)

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Ich schwöre, dass ich meine Kraft dem Wohle de...",2,2_menschen_und_die_sie,"[menschen, und, die, sie, in, der, das, ist, e...",[Frau Präsidentin! Liebe Kolleginnen und Kolle...,menschen - und - die - sie - in - der - das - ...,0.031167,False
1,Frau Präsidentin! Sehr geehrte Kolleginnen und...,1,1_ukraine_und_die_der,"[ukraine, und, die, der, in, wir, krieg, ist, ...",[Frau Präsidentin! Liebe Kolleginnen und Kolle...,ukraine - und - die - der - in - wir - krieg -...,0.160074,False
2,"Frau Ministerin, erst einmal herzlichen Dank f...",-1,-1_die_der_und_das,"[die, der, und, das, wir, in, sie, ist, es, auch]",[Sehr geehrte Frau Präsidentin! Liebe Kollegin...,die - der - und - das - wir - in - sie - ist -...,0.702096,False
3,Sie haben vollkommen recht mit der Zeit. Da da...,8,8_bundeswehr_soldaten_ukraine_der,"[bundeswehr, soldaten, ukraine, der, die, und,...",[Frau Präsidentin! Meine sehr geehrten Damen u...,bundeswehr - soldaten - ukraine - der - die - ...,1.000000,False
4,"In Bezug auf die Lieferung schwerer Waffen, in...",8,8_bundeswehr_soldaten_ukraine_der,"[bundeswehr, soldaten, ukraine, der, die, und,...",[Frau Präsidentin! Meine sehr geehrten Damen u...,bundeswehr - soldaten - ukraine - der - die - ...,0.302115,False
...,...,...,...,...,...,...,...,...
10786,Frau Präsidentin! Werte Kolleginnen und Kolleg...,6,6_wir_die_und_in,"[wir, die, und, in, der, das, ist, auch, für, ...",[Sehr geehrte Kolleginnen und Kollegen! Insbes...,wir - die - und - in - der - das - ist - auch ...,0.017369,False
10787,Vielen Dank. – Sehr geehrte Frau Präsidentin! ...,-1,-1_die_der_und_das,"[die, der, und, das, wir, in, sie, ist, es, auch]",[Sehr geehrte Frau Präsidentin! Liebe Kollegin...,die - der - und - das - wir - in - sie - ist -...,0.707677,False
10788,Sehr geehrte Frau Präsidentin! Liebe Kolleginn...,48,48_öffentlich_rechtlichen_medien_journalisten,"[öffentlich, rechtlichen, medien, journalisten...",[Frau Präsidentin! Meine Damen und Herren! Ich...,öffentlich - rechtlichen - medien - journalist...,1.000000,False
10789,Sehr geehrte Frau Präsidentin! Liebe Kolleginn...,-1,-1_die_der_und_das,"[die, der, und, das, wir, in, sie, ist, es, auch]",[Sehr geehrte Frau Präsidentin! Liebe Kollegin...,die - der - und - das - wir - in - sie - ist -...,0.709040,False


In [16]:
def check_type(value):
    # Using type function
    print(f"The type of the value using 'type' function is: {type(value)}")

    # Using isinstance for common types
    if isinstance(value, int):
        print("The value is an integer.")
    elif isinstance(value, float):
        print("The value is a float.")
    elif isinstance(value, str):
        print("The value is a string.")
    elif isinstance(value, list):
        print("The value is a list.")
    elif isinstance(value, dict):
        print("The value is a dictionary.")
    elif isinstance(value, tuple):
        print("The value is a tuple.")
    else:
        print("The value is of some other type.")


In [17]:
check_type(topic_model.get_document_info(docs))

The type of the value using 'type' function is: <class 'pandas.core.frame.DataFrame'>
The value is of some other type.


In [18]:
import pandas as pd


df = pd.DataFrame(topic_model.get_document_info(docs))

# Export DataFrame to CSV
#df.to_csv('topic_models.csv', index=False)

In [None]:
list_of_dicts = load_data_v2("speeches_20.jsonl")

mapping_dict = {item['text']: item['id'] for item in list_of_dicts}

# Add the corresponding 'id' to the dataframe
df['id'] = df['Document'].map(mapping_dict)

print(df.head(10))

                                            Document  Topic  \
0  Ich schwöre, dass ich meine Kraft dem Wohle de...      2   
1  Frau Präsidentin! Sehr geehrte Kolleginnen und...     38   
2  Frau Ministerin, erst einmal herzlichen Dank f...     36   
3  Sie haben vollkommen recht mit der Zeit. Da da...      7   
4  In Bezug auf die Lieferung schwerer Waffen, in...      7   
5  Ich beantworte angesichts der Zeit zunächst di...    115   
6  Danke schön. – Frau Bundesministerin, ich find...     59   
7  Mir liegt auf der Zunge, zu sagen: Zu Zitaten ...     44   
8  Frau Ministerin, ist es denn so, dass der Bund...     -1   
9                                 Nein. Herr Huber.      -1   

                                               Name  \
0                            2_menschen_und_die_sie   
1                     38_russland_ukraine_krieg_die   
2                       36_nachfrage_frage_ich_habe   
3              7_bundeswehr_ukraine_soldaten_waffen   
4              7_bundeswehr_ukr

In [None]:
aggregated_df = df.groupby('Name')['id'].apply(lambda x: ', '.join(x)).reset_index()

print(aggregated_df)

                                                  Name  \
0                                   -1_die_und_der_wir   
1                                    0_die_wir_und_der   
2    100_bundespolizei_polizei_bundespolizeigesetz_die   
3                 101_wald_waldbrand_waldes_waldbrände   
4                  102_nato_finnland_schweden_beitritt   
..                                                 ...   
119                                  96_wir_das_die_in   
120  97_frauen_gleichstellungsbericht_gleichstellun...   
121               98_kultur_identität_deutsche_sprache   
122                99_moldau_moldawien_republik_odessa   
123               9_wohnungen_bauen_wohnen_wohnungsbau   

                                                    id  
0    ID203000900, ID203001000, ID203001400, ID20300...  
1    ID203107100, ID203110500, ID203110600, ID20311...  
2    ID202002800, ID202005000, ID207512100, ID20900...  
3    ID207804600, ID207210900, ID207211000, ID20721...  
4    ID206313000, 

In [None]:
aggregated_df.to_csv('aggregated.csv', index=False)

In [None]:
df.to_json(path_or_buf='data.json', orient='records')

## Visualize Terms

We can visualize the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores for each topic representation. Insights can be gained from the relative c-TF-IDF scores between and within topics. Moreover, you can easily compare topic representations to each other.

In [22]:
topic_model.visualize_barchart(top_n_topics=100)

In [32]:
topic_model.visualize_barchart(top_n_topics=100)

## Visualize Topic Similarity
Having generated topic embeddings, through both c-TF-IDF and embeddings, we can create a similarity matrix by simply applying cosine similarities through those topic embeddings. The result will be a matrix indicating how similar certain topics are to each other.

In [23]:
topic_model.visualize_heatmap(n_clusters=100, width=1000, height=1000)

## Visualize Term Score Decline
Topics are represented by a number of words starting with the best representative word. Each word is represented by a c-TF-IDF score. The higher the score, the more representative a word to the topic is. Since the topic words are sorted by their c-TF-IDF score, the scores slowly decline with each word that is added. At some point adding words to the topic representation only marginally increases the total c-TF-IDF score and would not be beneficial for its representation.

To visualize this effect, we can plot the c-TF-IDF scores for each topic by the term rank of each word. In other words, the position of the words (term rank), where the words with the highest c-TF-IDF score will have a rank of 1, will be put on the x-axis. Whereas the y-axis will be populated by the c-TF-IDF scores. The result is a visualization that shows you the decline of c-TF-IDF score when adding words to the topic representation. It allows you, using the elbow method, the select the best number of words in a topic.


In [24]:
topic_model.visualize_term_rank()

# **Topic Representation**
After having created the topic model, you might not be satisfied with some of the parameters you have chosen. Fortunately, BERTopic allows you to update the topics after they have been created.

This allows for fine-tuning the model to your specifications and wishes.

## Update Topics
When you have trained a model and viewed the topics and the words that represent them,
you might not be satisfied with the representation. Perhaps you forgot to remove
stopwords or you want to try out a different `n_gram_range`. We can use the function `update_topics` to update
the topic representation with new parameters for `c-TF-IDF`:


In [25]:
topic_model.update_topics(docs, n_gram_range=(1, 2))

In [26]:
topic_model.get_topic(0)   # We select topic that we viewed before

[('und', 0.007082377062937601),
 ('die', 0.006891369951162737),
 ('das', 0.006492975211050283),
 ('kinder', 0.006206843477831798),
 ('bildung', 0.006107124464201174),
 ('der', 0.005987202564600468),
 ('für', 0.005984130922153124),
 ('in', 0.005964009311953617),
 ('wir', 0.005757858037738338),
 ('sie', 0.005714691913260871)]

## Topic Reduction
We can also reduce the number of topics after having trained a BERTopic model. The advantage of doing so,
is that you can decide the number of topics after knowing how many are actually created. It is difficult to
predict before training your model how many topics that are in your documents and how many will be extracted.
Instead, we can decide afterwards how many topics seems realistic:





In [27]:
topic_model.reduce_topics(docs, nr_topics=60)

2023-09-25 22:54:03,764 - BERTopic - Reduced number of topics from 118 to 60


<bertopic._bertopic.BERTopic at 0x7d83af12fa60>

In [28]:
# Access the newly updated topics with:
print(topic_model.topics_)

[0, 1, -1, 4, 4, 1, 46, 1, -1, -1, 4, 4, 4, -1, 1, 20, -1, -1, -1, -1, 13, 13, 13, 13, -1, -1, 46, 13, 21, -1, 21, -1, -1, 4, 46, -1, 0, 0, -1, 4, -1, 4, -1, 29, -1, 0, 4, -1, 4, 4, -1, 47, -1, 5, 2, 0, 5, 4, -1, 5, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1, 37, 0, 0, 1, -1, 1, -1, 5, 1, 4, 4, -1, -1, -1, -1, 1, -1, -1, -1, -1, 0, 1, 1, -1, -1, 0, -1, 6, -1, 0, -1, 24, 24, -1, 24, -1, 24, 0, 24, 0, 24, 24, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 0, 0, -1, 0, -1, -1, 0, 0, -1, 0, 0, -1, -1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 29, 0, -1, 0, 0, 0, 2, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1, -1, -1, 2, 0, -1, 0, 42, 42, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, -1, 2, 12, -1, -1, 0, 10, 12, -1, -1, 0, 0, 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 1, -1, 0, 1, -1, 0, 31, -1, -1, 31, -1, 0, 38, 0, 38, 38, -1, 38, -1, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, -1, 0, -1, 0, -1, -1, -1, 0, 0, 0, 9, -1, 0, -1, -1, 0, -1, 1, 1, -1, 

In [31]:
topic_model.visualize_heatmap(n_clusters=50, width=1000, height=1000)