In [None]:
import sys, os
# Treat the session as Google Colab when the `google.colab` module is already loaded.
ON_COLAB = 'google.colab' in sys.modules

# Shell command that downloads and unpacks the article database only if it is missing.
# `||` and `&&` have equal precedence in sh and associate left to right, so without
# the parentheses gunzip would also run (and fail) when the .db file already exists.
# NOTE(review): the URL contains a GitHub-style `blob/main` segment - confirm that it
# really serves the raw .gz archive.
fetch_db_cmd = (
    "test -f heise-articles-2020.db || "
    "(wget  https://datanizing.com/heiseacademy/nlp-course/blob/main/99_Common/heise-articles-2020.db.gz"
    " && gunzip heise-articles-2020.db.gz)"
)

if ON_COLAB:
    # On Colab the database is fetched into the current working directory.
    os.system(fetch_db_cmd)
    newsticker_db = 'heise-articles-2020.db'
else:
    # Otherwise the database is read from the shared course folder.
    newsticker_db = '../99_Common/heise-articles-2020.db'

In [None]:
import sqlite3 
import pandas as pd

# Open the SQLite file referenced by newsticker_db.
sql = sqlite3.connect(newsticker_db)

# Load every article published before 2021 in ascending datePublished order,
# indexed by the "id" column and with datePublished parsed as datetimes.
df = pd.read_sql(
    sql="SELECT * FROM nlp_articles WHERE datePublished<'2021-01-01' ORDER BY datePublished",
    con=sql,
    index_col="id",
    parse_dates=["datePublished"],
)

In [None]:
from spacy.lang.de.stop_words import STOP_WORDS as stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorise the "nav" column using spaCy's German stop-word list, a minimum
# document frequency of 5 and IDF re-weighting switched off (use_idf=False).
tfidf_vectorizer = TfidfVectorizer(
    stop_words=list(stop_words),
    min_df=5,
    use_idf=False,
)
tfidf_vectors = tfidf_vectorizer.fit_transform(df["nav"])

# Show the resulting document-term matrix as the cell output.
tfidf_vectors

In [None]:
from sklearn.decomposition import NMF;

# Number of topics (NMF components) to extract.
num_topics = 10

# NMF's initialisation can draw on a random number generator; pinning random_state
# keeps the factorisation, and with it the topic numbering, reproducible across runs.
nmf = NMF(n_components=num_topics, random_state=42)
nmf.fit(tfidf_vectors)

In [None]:
def topics_table(model, feature_names, n_top_words = 20):
    """Build a table of the highest-weighted words for every topic.

    Args:
        model: Object exposing ``n_components`` and ``components_``; row ``i`` of
            ``components_`` is read as the per-feature weights of topic ``i``.
        feature_names: Sequence mapping a feature position to its word.
        n_top_words: Number of words to list per topic.

    Returns:
        A pandas DataFrame with one column per topic, labelled ``Topic #NN``,
        whose rows hold that topic's words ordered from highest to lowest weight.
    """
    def top_words(topic_idx):
        # Sort ascending, flip to descending, then keep the leading entries.
        ranked_ids = model.components_[topic_idx].argsort()[::-1][:n_top_words]
        return [feature_names[feature_id] for feature_id in ranked_ids]

    # Map each topic label to its plain-text word list.
    return pd.DataFrame({
        'Topic #%02d' % topic_idx: top_words(topic_idx)
        for topic_idx in range(model.n_components)
    })

In [None]:
# Show the top words per NMF topic (default number of words per topic).
topics_table(
    model=nmf,
    feature_names=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
# Project the documents onto the fitted NMF components.
W = nmf.transform(tfidf_vectors)

# Column sums of W as a percentage of the grand total of W.
(W.sum(axis=0) / W.sum()) * 100.0

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def wordcloud_topic_model_summary(model, feature_names, no_top_words):
    """Draw one word cloud per topic of a fitted topic model.

    Args:
        model: Object exposing ``components_``; it is iterated row by row and each
            row is treated as the per-feature weights of one topic.
        feature_names: Sequence of strings indexable by feature position.
        no_top_words: Number of highest-weighted features to include per cloud.

    Returns:
        None. One matplotlib figure is created per topic.
    """
    for topic_no, topic in enumerate(model.components_):
        # Positions of the highest-weighted features, largest weight first.
        top_ids = topic.argsort()[:-no_top_words - 1:-1]
        # Spaces inside a term are replaced by underscores in the displayed word.
        freq = {feature_names[i].replace(" ", "_"): topic[i] for i in top_ids}
        wc = WordCloud(background_color="white", max_words=100, width=960, height=540)
        wc.generate_from_frequencies(freq)

        fig, ax = plt.subplots(figsize=(12, 12))
        ax.imshow(wc, interpolation='bilinear')
        # Label each cloud with the same "Topic #NN" key that topics_table uses, so a
        # figure can be matched to its topic; the pixel axes carry no information.
        ax.set_title('Topic #%02d' % topic_no)
        ax.axis("off")

In [None]:
# Render a word cloud of the 40 highest-weighted terms for each NMF topic.
wordcloud_topic_model_summary(
    model=nmf,
    feature_names=tfidf_vectorizer.get_feature_names_out(),
    no_top_words=40,
)