In [None]:
from sklearn import datasets
# Load the 20 newsgroups corpus through scikit-learn, using the loader's
# default arguments. The post texts are read from news["data"] further down.
news = datasets.fetch_20newsgroups()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Terms handed to the vectorizers below as their stop_words argument.
# The first part is a general English stop-word list; it also holds contraction
# fragments in both straight-quote and curly-quote spellings ("'ll", '’ve', 'n‘t').
# NOTE(review): the general part looks like spaCy's English stop-word list —
# confirm the source.
# NOTE(review): entries containing an apostrophe may never match a token produced
# by the vectorizer's default tokenisation — confirm whether they have any effect.
STOPWORDS = {'none', 'thereby', 'mine', 'serious', 'whereafter', 'nothing', "'ll", 
             'itself', 'first', 'whoever', '’ve', 'noone', 'moreover', 'regarding', 
             'but', 'various', 'and', 'their', 'between', 'everyone', 'us', 'other', 
             'third', 'last', 'only', 'been', 'always', 'throughout', 'over', 'anyhow', 
             'i', 'nobody', 'be', 'off', "'d", 'then', 'eleven', 'since', "'ve", 'did', 
             'ever', 'than', 'call', 'few', 'could', 'whatever', 'front', 'there', 
             'across', 'whenever', 'is', 'this', 'empty', 'indeed', 'please', 'namely', 
             'his', 'eight', 'those', 'hence', 'wherein', 'amongst', 'using', 'both', 
             '’re', 'seem', 'two', 'several', 'whether', 'about', 'due', 'behind', 'am', 
             'what', 'name', 'has', 'three', 'therefore', '‘s', 'whereas', 'the', 'until', 
             'meanwhile', 'anything', 'that', 'never', 'how', 'sometimes', 'each', 
             'toward', 'doing', 'someone', 'at', 'hereafter', 'almost', 'if', 'same', 
             'her', 'anyone', 'became', 'into', 'latter', 'by', "'s", 'four', 'wherever', 
             'besides', 'must', 'thence', 'in', 'anywhere', 'any', 'twelve', 'out', 'it', 
             'one', 'least', 'used', '‘ll', 'put', 'therein', 'a', 're', 'she', 'are', 
             'beforehand', 'my', 'through', 'ten', 'go', 'too', '’m', 'either', 'below', 
             'else', 'around', 'all', 'except', 'n‘t', 'not', 'such', '‘re', 'was', '’s', 
             'may', 'whence', 'also', 'another', 'beyond', 'without', 'perhaps', 'alone', 
             'should', 'nevertheless', 'own', 'he', 'these', 'seemed', 'give', 'made', 
             'some', 'part', 'on', 'himself', 'hereupon', 'whereupon', 'six', 'via', 'of', 
             'quite', "'m", 'however', 'onto', 'as', 'sometime', 'more', 'while', 'sixty', 
             'does', 'everywhere', 'elsewhere', 'whither', 'who', 'nor', 'seeming', 
             'formerly', 'nowhere', 'our', 'former', 'hereby', 'further', "'re", 
             'can', 'thus', 'something', 'why', 'themselves', 'were', 'amount', 'do', 
             'we', 'beside', 'mostly', 'they', 'very', 'your', 'somewhere', 'upon', 'so', 
             'them', 'latterly', 'neither', 'within', 'enough', 'hers', 'cannot', 'you', 
             'every', 'most', 'ca', 'show', 'will', 'being', 'after', 'though', 'fifteen', 
             'down', 'really', 'although', 'full', 'up', 'well', 'somehow', 'yourself', 'me', 
             'bottom', 'next', 'many', 'unless', 'or', 'anyway', 'five', 'for', 'say', 
             'twenty', 'would', 'otherwise', 'nine', 'no', 'against', 'ourselves', 'just', 
             'even', 'yet', 'above', '‘d', 'again', 'already', 'others', 'before', 'forty', 
             'here', 'move', '‘m', "n't", 'with', 'now', 'seems', 'n’t', 'among', 'which', 
             'towards', 'side', 'still', 'might', 'together', '’ll', 'from', 'everything', 
             'have', 'becoming', 'keep', 'become', 'often', 'herein', 'under', 'whereby', 
             'top', 'thru', 'becomes', 'where', 'along', 'during', 'whole', 'him', 'once', 
             'to', 'afterwards', 'back', 'its', 'get', 'rather', 'because', 'hundred', 
             'make', 'see', 'thereafter', 'done', 'thereupon', 'had', '‘ve', 'ours', 
             'yours', 'much', 'an', 'per', 'whose', 'fifty', 'myself', 'take', 'less', 
             'whom', 'yourselves', 'when', 'herself', '’d',
             # Additions beyond the general list — presumably header and domain
             # tokens that recur in the newsgroup posts; verify against the data.
             'edu', 'university', 'article', 'writes', 'posting', 'nntp', 'host', 
             'organization', 'subject', 'state', 'com', 'netcom', 'uk', 'ac', 'cs', 
             'caltech', 'gov', 'jpl' }

In [None]:
# TF-IDF vectorizer configured with the custom stop words; STOPWORDS is a set,
# so it is converted to a list before being passed as stop_words.
tfidf = TfidfVectorizer(stop_words = list(STOPWORDS))
# Learn the vocabulary from the posts and build their TF-IDF matrix in one step.
# NOTE(review): presumably a sparse (n_documents, n_terms) matrix — confirm.
vec = tfidf.fit_transform(news["data"])

In [None]:
from sklearn.decomposition import NMF
# NMF topic model with 10 components (one per topic); random_state is pinned
# to a fixed seed.
nmf = NMF(n_components = 10, random_state=42)
# Fit on the TF-IDF matrix. As the last expression of the cell, the fitted
# estimator is also what the notebook displays as the cell output.
nmf.fit(vec)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud


def wordcloud_topic_model_summary(model, feature_names, no_top_words):
    """Render one word cloud per topic of a fitted topic model.

    For every row of ``model.components_`` the ``no_top_words`` highest-weighted
    features are selected and drawn with sizes driven by their weights. Each
    figure is titled "Topic <top word> (NN)", the same label ``display_topics``
    prints, so a cloud can be matched to its text summary.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row is ``argsort``-ed and indexed, so rows are assumed to be
        NumPy arrays of per-feature weights.
    feature_names : sequence of str
        Feature name for every column of ``model.components_``.
    no_top_words : int
        Number of highest-weighted words to draw per topic.

    Returns
    -------
    None
        Every figure is shown with ``plt.show()`` and closed afterwards.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: the strongest feature sits at the end.
        order = topic.argsort()
        top_indices = order[:-no_top_words - 1:-1]

        # Word -> weight mapping for the cloud. Spaces are replaced so that a
        # multi-word feature name stays a single token.
        freq = {feature_names[i].replace(" ", "_"): topic[i] for i in top_indices}

        # Keep the usual capacity of 100 words but grow it when more words were
        # requested, so a large no_top_words is not silently truncated.
        wc = WordCloud(background_color="white", max_words=max(100, len(freq)),
                       width=960, height=540)
        wc.generate_from_frequencies(freq)

        fig, ax = plt.subplots(figsize=(12, 12))
        ax.imshow(wc, interpolation='bilinear')
        # Label the figure with the topic's strongest word and its index.
        ax.set_title("Topic %s (%02d)" % (feature_names[order[-1]], topic_idx))
        ax.axis("off")
        plt.show()
        # Close exactly this figure so figures do not pile up in memory.
        plt.close(fig)
            
def display_topics(model, feature_names, no_top_words):
    """Print a plain-text summary of every topic of a fitted model.

    Each topic produces two lines: a header naming its highest-weighted word
    and its zero-padded index, then the ``no_top_words`` highest-weighted words
    in single quotes, strongest first.
    """
    for number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it once to rank strongest-first.
        ranking = weights.argsort()[::-1]
        print("Topic %s (%02d):" % (feature_names[ranking[0]], number))
        quoted = []
        for position in ranking[:no_top_words]:
            quoted.append("'" + feature_names[position] + "'")
        print(" ".join(quoted))

In [None]:
# Text summary of the NMF topics: the ten strongest terms of each.
display_topics(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# One word cloud per NMF topic, built from its twenty strongest terms.
wordcloud_topic_model_summary(
    model=nmf,
    feature_names=tfidf.get_feature_names_out(),
    no_top_words=20,
)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# LDA models word counts, so it is fed raw term frequencies:
# use_idf=False switches off the IDF weighting, and norm=None switches off the
# vectorizer's default L2 row normalisation, which would otherwise turn the
# counts into fractional unit-length vectors.
cv = TfidfVectorizer(stop_words = list(STOPWORDS), use_idf=False, norm=None)
# Learn the vocabulary and build the document-term count matrix in one step.
cvec = cv.fit_transform(news["data"])

In [None]:
# LDA topic model with 10 components, matching the number of NMF topics above;
# random_state is pinned to a fixed seed.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
# Fit on the matrix produced by the cv vectorizer. As the last expression of the
# cell, the fitted estimator is also what the notebook displays as the cell output.
lda.fit(cvec)

In [None]:
# Text summary of the LDA topics: the ten strongest terms of each.
display_topics(
    model=lda,
    feature_names=cv.get_feature_names_out(),
    no_top_words=10,
)

In [None]:
# One word cloud per LDA topic, built from its twenty strongest terms.
wordcloud_topic_model_summary(
    model=lda,
    feature_names=cv.get_feature_names_out(),
    no_top_words=20,
)