In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import spacy

# Outlets (values of the `medium` column) included in the per-class comparison.
# The order is significant: the per-class rows are arranged in exactly this
# sequence before the topics-per-class figure is drawn.
media = [
    "junge Welt",
    "NachDenkSeiten",
    "taz",
    "Süddeutsche Zeitung",
    "stern TV",
    "DER SPIEGEL",
    "Der Tagesspiegel",
    "ARD",
    "Tagesschau",
    "ZDF",
    "ZDFheute Nachrichten",
    "Bayerischer Rundfunk",
    "ntv Nachrichten",
    "RTL",
    "FOCUS Online",
    "ZEIT ONLINE",
    "faz",
    "WELT",
    "BILD",
    "NZZ Neue Zürcher Zeitung",
    "Junge Freiheit",
    "COMPACTTV",
]

def load_filter():
    """Build the stop-word vocabulary used to filter topic terms.

    Combines spaCy's built-in German stop words with the whitespace-separated
    tokens of two project word lists.

    Returns:
        list[str]: The de-duplicated stop words, sorted so the result is
        deterministic across runs.

    Raises:
        OSError: If either word-list file cannot be opened.
    """
    # Import the German stop-word set explicitly. This replaces the earlier
    # `spacy.load("de_core_news_sm")` call, whose pipeline object was never
    # used, so the (expensive) model load is no longer required here.
    from spacy.lang.de.stop_words import STOP_WORDS

    # Work on a copy: STOP_WORDS is spaCy's module-level set, and extending it
    # in place would change spaCy's German stop words for the whole process.
    filterwords = set(STOP_WORDS)
    with open("../docs/filterwords.txt", encoding="utf-8", errors="ignore") as d:
        filterwords.update(d.read().split())
    with open("../docs/german_stopwords_full.txt", encoding="utf-8", errors="ignore") as d:
        # The first 53 whitespace-separated tokens are skipped.
        # NOTE(review): presumably a header/comment preamble in that file — confirm.
        filterwords.update(d.read().split()[53:])
    return sorted(filterwords)

stop_words = frozenset(load_filter())

In [None]:
# Read the combined corpus and discard rows without a transcript.
# NOTE: unpickling can execute arbitrary code — only load pickles from a
# trusted source.
df = pd.read_pickle('../data/combined.pkl').dropna(subset=['transcript'])

# Transcripts as a NumPy array of strings: the documents fed to the topic model.
docs = df['transcript'].astype(str).to_numpy()

In [None]:
# Bag-of-words vectoriser (unigrams only) that drops the project stop words.
# scikit-learn >= 1.2 validates estimator parameters and accepts only
# "english", a list, or None for `stop_words`; the module-level `stop_words`
# is a frozenset, so pass a list copy to keep fitting from failing validation.
vectorizer_model = CountVectorizer(stop_words=list(stop_words), ngram_range=(1, 1))

# Topic model configuration; `min_topic_size=500` sets the minimum topic size
# passed to BERTopic.
topic_model = BERTopic(
    vectorizer_model=vectorizer_model,
    verbose=1,
    language='multilingual',
    min_topic_size=500,
)

In [None]:
# Fit the topic model on all transcripts. `fit_transform` returns two values,
# unpacked here as `topics` and `probs`; both are attached to `df` further down.
topics, probs = topic_model.fit_transform(docs)
# Persist the fitted model so it can be reloaded without refitting.
topic_model.save('bertopic_model_combined')

In [None]:
# Reload the model that the fitting cell saved under 'bertopic_model_combined'.
topic_model = BERTopic.load('bertopic_model_combined')
# NOTE(review): `topics` and `probs` are only defined by the fitting cell. When
# that cell is skipped, one of the alternatives below presumably has to be
# enabled before the later cells can run — confirm.
#topics, probs = topic_model.transform(docs)
#topics = df['topic']

In [None]:
# Bar chart for topics 3, 7 and 10, showing 10 words per topic.
# NOTE(review): the topic ids are hard-coded and refer to one particular fitted
# model; they may need updating after a refit.
topic_model.visualize_barchart(topics=[3, 7, 10], n_words=10)

In [None]:
# Attach to every document the name of its assigned topic and the model's
# probability value for that assignment.
topic_info = topic_model.get_topic_info()

# Lookup table: topic id -> topic name, taken from the model's topic table.
topic_dict = dict(zip(topic_info['Topic'], topic_info['Name']))

df['topic'] = [topic_dict[topic_id] for topic_id in topics]
df['topic_prob'] = probs

# Optional persistence of the labelled frame (disabled):
#df.to_pickle('../data/topics_by_minute/topics_by_minute_bertopic.pkl')
#df.to_pickle('../data/topics_combined.pkl')

In [None]:
# Render BERTopic's overview visualization of all topics as the cell output.
topic_model.visualize_topics()

In [None]:
# Render BERTopic's hierarchy visualization of the topics as the cell output.
topic_model.visualize_hierarchy()

In [None]:
# Render BERTopic's heatmap visualization of the topics as the cell output.
topic_model.visualize_heatmap()

In [None]:
# Number of documents per topic name, most frequent first.
topic_counts = df['topic'].value_counts()

# Translate topic names back to the model's own integer topic ids.
# The ids come from the fitted model's topic table rather than from frequency
# rank: the earlier `np.arange(-1, 89)` zip hard-coded exactly 90 topics and
# assumed that ordering the names by document count yields -1, 0, 1, ...,
# which breaks when the number of topics differs, when two topics are tied in
# size, or when topic -1 is not the largest one.
topic_info = topic_model.get_topic_info()
reverse_topic_dict = dict(zip(topic_info['Name'], topic_info['Topic']))  # name -> id
topic_dict = dict(zip(topic_info['Topic'], topic_info['Name']))          # id -> name

# Direct indexing keeps the KeyError for a name the model does not know.
df['topic_number'] = [reverse_topic_dict[name] for name in df['topic']]

In [None]:
# Topic representations per outlet: one row per (topic, class) pair with a
# `Frequency` and a `Class` column, as used below.
topics_per_class = topic_model.topics_per_class(
    docs=df['transcript'].astype(str).to_numpy(),
    topics=df['topic_number'].to_numpy(),
    classes=df['medium'].to_numpy(),
)

# Turn raw frequencies into each topic's percentage share within its class, so
# outlets with different numbers of documents are comparable.
frequency_dict = topics_per_class.groupby('Class')['Frequency'].sum().to_dict()
class_totals = topics_per_class['Class'].map(frequency_dict)
topics_per_class['Frequency'] = topics_per_class['Frequency'] / class_totals * 100.0

# Arrange the rows in the order given by `media`; classes not listed there are
# dropped. The slices are concatenated once instead of growing a frame inside
# the loop, which re-copied all accumulated rows on every iteration.
topics_per_class = pd.concat(
    [topics_per_class[topics_per_class['Class'] == medium] for medium in media],
    axis=0,
)

topic_model.visualize_topics_per_class(topics_per_class)