In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import spacy

def load_filter(
    filterwords_path="../docs/filterwords.txt",
    stopwords_path="../docs/german_stopwords_full.txt",
    stopwords_skip=53,
):
    """Build the German stop-word list used for topic-word extraction.

    The result is the union of spaCy's built-in German stop words and the
    whitespace-separated tokens of two text files.

    Args:
        filterwords_path: Text file whose tokens are all added.
        stopwords_path: Text file whose tokens are added after dropping the
            first ``stopwords_skip`` tokens.
        stopwords_skip: Number of leading tokens of ``stopwords_path`` to
            ignore (presumably a header/licence preamble -- verify against
            the file).

    Returns:
        A sorted list of unique stop words.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Import the stop-word module directly; no spaCy pipeline has to be
    # loaded just to reach this constant.
    from spacy.lang.de.stop_words import STOP_WORDS

    # Work on a copy: STOP_WORDS is a module-level set, and updating it in
    # place would change spaCy's own stop words for the whole process.
    filterwords = set(STOP_WORDS)
    with open(filterwords_path, encoding="utf-8", errors="ignore") as handle:
        filterwords.update(handle.read().split())
    with open(stopwords_path, encoding="utf-8", errors="ignore") as handle:
        filterwords.update(handle.read().split()[stopwords_skip:])
    # Sorted so the returned list is deterministic across runs.
    return sorted(filterwords)

# Immutable copy of the combined stop-word list, so later cells cannot modify it.
# NOTE(review): its only consumer in this notebook is the commented-out
# CountVectorizer(stop_words=...) call; newer scikit-learn releases reportedly
# accept only a list there -- confirm before re-enabling that cell.
stop_words = frozenset(load_filter())

In [2]:
# Load the raw frame and keep only the rows that actually have a transcript.
df = pd.read_pickle('../data/raw/raw.pkl').dropna(subset=['transcript'])
# Transcripts as a NumPy array of strings: the document input for the topic model.
docs = np.asarray(df['transcript'].astype(str))

In [3]:
# Disabled one-off training setup; the fitted model is loaded from disk
# further down instead. If re-enabled, this builds a unigram CountVectorizer
# with the custom stop words and a multilingual BERTopic model with
# min_topic_size=500.
#vectorizer_model = CountVectorizer(stop_words=stop_words, ngram_range=(1,1))
#topic_model = BERTopic(vectorizer_model = vectorizer_model, verbose=1, language='multilingual', min_topic_size=500)

In [4]:
# Disabled one-off training run: fits the model on `docs` and saves it as
# 'bertopic_model', the file that BERTopic.load() reads in this notebook.
#topics, probs = topic_model.fit_transform(docs)
#topic_model.save('bertopic_model')

In [5]:
# Reuse the previously saved model instead of refitting, then assign a topic
# (and its probability) to every transcript in `docs`.
# NOTE(review): loading a saved model presumably unpickles it -- only load
# model files from a trusted source.
topic_model = BERTopic.load('bertopic_model')
topics, probs = topic_model.transform(docs)

Batches:   0%|          | 0/27888 [00:00<?, ?it/s]

In [6]:
# Lookup table from numeric topic id to the topic name reported by the model.
topic_info = topic_model.get_topic_info()
topic_dict = dict(zip(topic_info['Topic'], topic_info['Name']))
# Store each document's topic *name* (not its id) together with the
# probability returned by transform(), then persist the annotated frame.
df['topic'] = [topic_dict[topic_id] for topic_id in topics]
df['topic_prob'] = probs
df.to_pickle('../data/topics_by_minute/topics_by_minute_bertopic.pkl')

In [7]:
# Overview plot of the topics in the loaded model.
topic_model.visualize_topics()

In [8]:
# Bar-chart view of the loaded model's topics.
topic_model.visualize_barchart()

In [9]:
# Hierarchy view of the loaded model's topics.
topic_model.visualize_hierarchy()

In [10]:
# Disabled ad-hoc lookup: search the model's topics for the term 'wetter',
# then inspect the words and representative documents of the first entry
# of the first returned value.
#tmp = topic_model.find_topics('wetter')
#topic_model.get_topic(topic=tmp[0][0])
#topic_model.get_representative_docs(topic=tmp[0][0])

In [11]:
# Display the configuration of the loaded model.
topic_model.get_params()

{'calculate_probabilities': False,
 'diversity': None,
 'embedding_model': <bertopic.backend._sentencetransformers.SentenceTransformerBackend at 0x111f5ba90>,
 'hdbscan_model': HDBSCAN(min_cluster_size=500, prediction_data=True),
 'language': 'multilingual',
 'low_memory': False,
 'min_topic_size': 500,
 'n_gram_range': (1, 1),
 'nr_topics': None,
 'seed_topic_list': None,
 'top_n_words': 10,
 'umap_model': UMAP(angular_rp_forest=True, low_memory=False, metric='cosine', min_dist=0.0, n_components=5, tqdm_kwds={'bar_format': '{desc}: {percentage:3.0f}%| {bar} {n_fmt}/{total_fmt} [{elapsed}]', 'desc': 'Epochs completed', 'disable': True}),
 'vectorizer_model': CountVectorizer(stop_words=frozenset({'Dat', 'Inf.', 'a', 'ab', 'aber',
                                       'abermaliges', 'abermals', 'abgerufen',
                                       'abgerufene', 'abgerufener',
                                       'abgerufenes', 'abgesehen', 'ach', 'acht',
                                

In [12]:
# Topic representation per value of df['medium'], followed by the model's
# built-in plot of that result.
topics_per_class = topic_model.topics_per_class(docs, topics, classes=df['medium'])
topic_model.visualize_topics_per_class(topics_per_class)

23it [00:45,  1.96s/it]


In [16]:
# Merge topics; with nr_topics='auto' the target number is chosen by BERTopic.
# The same `topic_model` object is queried and saved right after the call, so
# every later cell works with the reduced topics.
#
# `probabilities=probs` is passed explicitly. Without it the call has no
# probabilities to map, and (in the BERTopic API this call signature belongs
# to -- TODO confirm for the installed version) `new_probs` comes back as
# None, which would fill 'reduced_topic_prob' with missing values.
new_topics, new_probs = topic_model.reduce_topics(docs, topics, probabilities=probs, nr_topics='auto')
new_topic_info = topic_model.get_topic_info()
topic_model.save('reduced_topic_model')
# Persist the reduced assignment next to the original per-document columns.
df['reduced_topic'] = new_topics
df['reduced_topic_prob'] = new_probs
df.to_pickle('../data/topics_by_minute/topics_by_minute_bertopic_reduced.pkl')

In [14]:
# Disabled reset: reload the model saved as 'bertopic_model' and the frame
# pickled with the per-document 'topic' / 'topic_prob' columns.
#topic_model = BERTopic.load('bertopic_model')
#df = pd.read_pickle('../data/topics_by_minute/topics_by_minute_bertopic.pkl')

In [17]:
# Inputs for the topics-over-time analysis, all taken from `df` so that
# documents, topics and timestamps stay row-aligned.
timestamps = df['date']
topics = df['reduced_topic'].to_numpy()
# Cast to str exactly as when the documents were first extracted from `df`,
# so the model receives text even if the column holds non-string values.
docs = df['transcript'].astype(str).to_numpy()
# Shortcut: load a previously pickled result instead of recomputing it.
#topics_over_time = pd.read_pickle('topics_over_time.pkl')


In [18]:
# Compute the topic representation per timestamp. This is expensive: the
# progress output recorded for this cell shows 3731 iterations in about 1.5 hours.
topics_over_time = topic_model.topics_over_time(docs, topics, timestamps)
# Pickle the result so it can be reloaded instead of recomputed.
topics_over_time.to_pickle('topics_over_time.pkl')
# Plot the development of topic 1 only.
topic_model.visualize_topics_over_time(topics_over_time, topics=[1])

3731it [1:32:07,  1.48s/it]


In [19]:
# Same over-time plot, restricted to topic 0.
topic_model.visualize_topics_over_time(topics_over_time, topics=[0])

In [None]:
# Disabled experiment: select the rows of one topic from `topics_over_time`
# and draw its 'Frequency' against 'Timestamp' with seaborn.
# NOTE(review): the filter compares the 'Topic' column with a topic *name*
# string; if that column holds numeric ids the selection is empty -- verify.
#test = topics_over_time.loc[topics_over_time['Topic'] =='0_ukraine_russland_russischen_putin']
#sns.lineplot(x='Timestamp', y='Frequency', data=test)

In [20]:
# Topic overview plot again; in notebook order this runs after reduce_topics().
topic_model.visualize_topics()

In [21]:
# Bar-chart view again; in notebook order this runs after reduce_topics().
topic_model.visualize_barchart()

In [22]:
# Hierarchy view again; in notebook order this runs after reduce_topics().
topic_model.visualize_hierarchy()