In [None]:
!pip install sklearn

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the training split of the 20 Newsgroups corpus.
# scikit-learn downloads the data on first use and caches it locally afterwards.
newsgroups_train = fetch_20newsgroups(subset="train")

In [None]:
# Number of documents in the training split.
len(newsgroups_train.data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training documents and
# encode those same documents as TF-IDF vectors in a single pass.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent on the hinge loss.
# The seed is fixed so that repeated runs produce the same model.
clf = SGDClassifier(
    loss="hinge",
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
clf.fit(vectors, newsgroups_train.target)

In [None]:
from sklearn import metrics

# Evaluate on the held-out test split.
newsgroups_test = fetch_20newsgroups(subset="test")

# Reuse the vectorizer fitted on the training data: transform only, never refit on test data.
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)

# Macro-averaged F1: every newsgroup contributes equally, regardless of its size.
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average="macro")

In [None]:
import pandas as pd

# Confusion matrix as a labelled table: rows are the true newsgroups,
# columns are the predicted ones (scikit-learn's confusion_matrix convention).
pd.DataFrame(
    data=metrics.confusion_matrix(newsgroups_test.target, pred),
    index=newsgroups_train.target_names,
    columns=newsgroups_train.target_names,
)

In [None]:
from sklearn.decomposition import NMF

# Factorise the TF-IDF matrix into 5 non-negative components ("topics").
# random_state is fixed (same seed as the classifier) because NMF's default
# initialisation relies on a randomized SVD; without a seed the extracted
# topics can differ from one run to the next.
nmf = NMF(n_components=5, random_state=42)
nmf.fit(vectors)

In [None]:
# Show the highest-weighted vocabulary terms of every NMF topic as a table.
features = vectorizer.get_feature_names_out()
n_top_words = 5  # how many terms to list per topic

topics = []
for word_vector in nmf.components_:
    # argsort() is ascending, so reverse it to get the heaviest terms first
    largest = word_vector.argsort()[::-1]
    topics.append([f" {features[idx]}" for idx in largest[:n_top_words]])

# Row labels are derived from the number of topics actually extracted, so the
# table stays valid if NMF's n_components is changed. Column labels are kept
# as in the original ("Wort" is German for "word").
pd.DataFrame(
    topics,
    columns=[f"Wort {i}" for i in range(n_top_words)],
    index=[f"Topic {i}" for i in range(len(topics))],
)