In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import spacy

def load_filter():
    nlp = spacy.load("de_core_news_sm")
    filterwords = spacy.lang.de.stop_words.STOP_WORDS
    with open("../docs/filterwords.txt", encoding="utf-8", errors="ignore") as d:
        filterwords.update(d.read().split())
    with open("../docs/german_stopwords_full.txt", encoding="utf-8", errors="ignore") as d:
        filterwords.update(d.read().split()[53:])
    return list(set(filterwords))

stop_words = frozenset(load_filter())

In [2]:
df = pd.read_csv('../data/preprocessed/concat.csv', index_col=0)
df.dropna(subset=['transcript'], inplace=True)
docs = df['transcript'].to_numpy()

In [3]:
vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1,2))

In [4]:
topic_model = BERTopic(vectorizer_model = vectorizer_model, verbose=1, language='German', min_topic_size=100)
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/2460 [00:00<?, ?it/s]

2022-05-16 19:03:56,511 - BERTopic - Transformed documents to Embeddings
2022-05-16 19:04:15,645 - BERTopic - Reduced dimensionality with UMAP


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2022-05-16 19:04:21,423 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [5]:
#topic_model = BERTopic.load('bertopic_model')
#topics, probs = topic_model.transform(docs)

In [6]:
topic_info = topic_model.get_topic_info()

In [7]:
#topic_model.visualize_topics()

In [8]:
topic_model.visualize_barchart()

In [9]:
topic_model.visualize_hierarchy()

In [10]:
tmp = topic_model.find_topics('schule')
topic_model.get_topic(topic=tmp[0][0])
#topic_model.get_representative_docs(topic=tmp[0][0])

[('kinder', 0.017071077008798518),
 ('schulen', 0.014411696093810092),
 ('schule', 0.013734932244874138),
 ('schüler', 0.010440799442536248),
 ('eltern', 0.008192032285833428),
 ('unterricht', 0.007919721825034127),
 ('lehrer', 0.007689679186696009),
 ('kindern', 0.006538844850000691),
 ('glaube', 0.005084049350508146),
 ('corona', 0.00491073948588948)]

In [11]:
topic_model.get_params()

{'calculate_probabilities': False,
 'diversity': None,
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x17a3f51c0>,
 'hdbscan_model': HDBSCAN(min_cluster_size=100, prediction_data=True),
 'language': 'German',
 'low_memory': False,
 'min_topic_size': 100,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(ngram_range=(1, 2),
                 stop_words=frozenset({'Dat', 'Inf.', 'a', 'ab', 'aber',
                                       'abermaliges', 'abermals', 'abgerufen',
                                       'abgerufene', 'abgerufener',
                                       'abgerufenes', 'abgesehen', 'ach', 'acht',
 

In [12]:
#topic_model.save('bertopic_model')

In [13]:
topics_per_class = topic_model.topics_per_class(docs, topics, classes=df['medium'])
topic_model.visualize_topics_per_class(topics_per_class)

23it [08:14, 21.50s/it]


In [14]:
new_topics, new_probs = topic_model.reduce_topics(docs, topics, nr_topics='auto')
new_topic_info = topic_model.get_topic_info()

2022-05-16 19:51:24,313 - BERTopic - Reduced number of topics from 73 to 36


In [15]:
new_topic_info

Unnamed: 0,Topic,Count,Name
0,0,32979,0_ukraine_menschen_deutschland_russland
1,-1,29568,-1_menschen_deutschland_glaube_frage
2,1,2002,1_impfstoff_impfung_impfpflicht_impfen
3,2,1919,2_auto_autos_fahrer_fahren
4,3,1666,3_trump_donald_donald trump_biden
5,4,1048,4_co2_klima_klimaschutz_deutschland
6,5,948,5_flughafen_flugzeug_lufthansa_maschine
7,6,854,6_polizei_demonstranten_menschen_demonstration
8,7,665,7_taliban_afghanistan_kabul_menschen
9,8,660,8_patienten_menschen_ärzte_krankenhaus
