In [None]:
from sklearn import datasets
# Load scikit-learn's digits dataset; the cells below read its feature
# matrix via digits["data"].
digits = datasets.load_digits()

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Compress the digits to two principal components, reconstruct them, and
# report the squared reconstruction error averaged over the samples.
pca2 = PCA(n_components=2)
data2 = pca2.fit_transform(digits.data)
data_inverse = pca2.inverse_transform(data2)
np.sum(np.square(digits.data - data_inverse)) / digits.data.shape[0]

In [None]:
# Sweep the number of principal components and record, for each setting,
# the per-sample squared reconstruction error as an (n, error) pair.
err = []
for n in range(2, 65):
    pca_n = PCA(n_components=n)
    data_n = pca_n.fit_transform(digits.data)
    data_inverse_n = pca_n.inverse_transform(data_n)
    residual_n = digits.data - data_inverse_n
    err.append((n, np.sum(np.square(residual_n)) / digits.data.shape[0]))

In [None]:
import pandas as pd
# Line plot of the reconstruction error against the number of components.
err_frame = pd.DataFrame(err, columns=["n", "err"])
err_frame.set_index("n").plot()

In [None]:
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding of the digits (fixed seed); the cell's
# displayed value is the KL divergence reported by the fitted estimator.
tsne2 = TSNE(random_state=42, n_components=2)
tdata2 = tsne2.fit_transform(digits.data)
tsne2.kl_divergence_

In [None]:
%pip install umap-learn[parametric_umap]

In [None]:
import umap
# Two-dimensional UMAP embedding (fixed seed), mapped back to the original
# feature space to compute the same per-sample squared reconstruction error
# that was computed for PCA.
umap2 = umap.UMAP(random_state=42, n_components=2)
udata2 = umap2.fit_transform(digits.data)
udata_inverse = umap2.inverse_transform(udata2)
np.sum(np.square(digits.data - udata_inverse)) / digits.data.shape[0]

In [None]:
from sklearn.cluster import KMeans
# Fit k-means with 5 and with 10 clusters (same seed), in that order.
km5, km10 = (
    KMeans(n_clusters=k, random_state=42).fit(digits["data"])
    for k in (5, 10)
)

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score
# Print silhouette scores for km5 and km10, then Calinski-Harabasz scores
# for km5 and km10.
for score_fn in (silhouette_score, calinski_harabasz_score):
    for km_model in (km5, km10):
        print(score_fn(digits["data"], km_model.labels_))

In [None]:
from sklearn.cluster import Birch
# Fit Birch with 5 and with 10 clusters, in that order.
birch5, birch10 = (Birch(n_clusters=k).fit(digits["data"]) for k in (5, 10))

# Print silhouette scores for birch5 and birch10, then Calinski-Harabasz
# scores for birch5 and birch10.
for score_fn in (silhouette_score, calinski_harabasz_score):
    for birch_model in (birch5, birch10):
        print(score_fn(digits["data"], birch_model.labels_))

In [None]:
from sklearn import datasets
# Fetch the 20 newsgroups corpus; the documents are read below via
# news["data"] when building the vectorizers.
news = datasets.fetch_20newsgroups()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Stop words handed to the vectorizers below (as list(STOPWORDS)).
# The first part is a general English list that also contains contraction
# fragments written with straight (') and curly (‘ ’) apostrophes; the last
# three lines add extra tokens such as 'edu', 'nntp', 'organization' --
# presumably header/boilerplate terms of the newsgroup posts (TODO confirm).
STOPWORDS = {'none', 'thereby', 'mine', 'serious', 'whereafter', 'nothing', "'ll", 
             'itself', 'first', 'whoever', '’ve', 'noone', 'moreover', 'regarding', 
             'but', 'various', 'and', 'their', 'between', 'everyone', 'us', 'other', 
             'third', 'last', 'only', 'been', 'always', 'throughout', 'over', 'anyhow', 
             'i', 'nobody', 'be', 'off', "'d", 'then', 'eleven', 'since', "'ve", 'did', 
             'ever', 'than', 'call', 'few', 'could', 'whatever', 'front', 'there', 
             'across', 'whenever', 'is', 'this', 'empty', 'indeed', 'please', 'namely', 
             'his', 'eight', 'those', 'hence', 'wherein', 'amongst', 'using', 'both', 
             '’re', 'seem', 'two', 'several', 'whether', 'about', 'due', 'behind', 'am', 
             'what', 'name', 'has', 'three', 'therefore', '‘s', 'whereas', 'the', 'until', 
             'meanwhile', 'anything', 'that', 'never', 'how', 'sometimes', 'each', 
             'toward', 'doing', 'someone', 'at', 'hereafter', 'almost', 'if', 'same', 
             'her', 'anyone', 'became', 'into', 'latter', 'by', "'s", 'four', 'wherever', 
             'besides', 'must', 'thence', 'in', 'anywhere', 'any', 'twelve', 'out', 'it', 
             'one', 'least', 'used', '‘ll', 'put', 'therein', 'a', 're', 'she', 'are', 
             'beforehand', 'my', 'through', 'ten', 'go', 'too', '’m', 'either', 'below', 
             'else', 'around', 'all', 'except', 'n‘t', 'not', 'such', '‘re', 'was', '’s', 
             'may', 'whence', 'also', 'another', 'beyond', 'without', 'perhaps', 'alone', 
             'should', 'nevertheless', 'own', 'he', 'these', 'seemed', 'give', 'made', 
             'some', 'part', 'on', 'himself', 'hereupon', 'whereupon', 'six', 'via', 'of', 
             'quite', "'m", 'however', 'onto', 'as', 'sometime', 'more', 'while', 'sixty', 
             'does', 'everywhere', 'elsewhere', 'whither', 'who', 'nor', 'seeming', 
             'formerly', 'nowhere', 'our', 'former', 'hereby', 'further', "'re", 
             'can', 'thus', 'something', 'why', 'themselves', 'were', 'amount', 'do', 
             'we', 'beside', 'mostly', 'they', 'very', 'your', 'somewhere', 'upon', 'so', 
             'them', 'latterly', 'neither', 'within', 'enough', 'hers', 'cannot', 'you', 
             'every', 'most', 'ca', 'show', 'will', 'being', 'after', 'though', 'fifteen', 
             'down', 'really', 'although', 'full', 'up', 'well', 'somehow', 'yourself', 'me', 
             'bottom', 'next', 'many', 'unless', 'or', 'anyway', 'five', 'for', 'say', 
             'twenty', 'would', 'otherwise', 'nine', 'no', 'against', 'ourselves', 'just', 
             'even', 'yet', 'above', '‘d', 'again', 'already', 'others', 'before', 'forty', 
             'here', 'move', '‘m', "n't", 'with', 'now', 'seems', 'n’t', 'among', 'which', 
             'towards', 'side', 'still', 'might', 'together', '’ll', 'from', 'everything', 
             'have', 'becoming', 'keep', 'become', 'often', 'herein', 'under', 'whereby', 
             'top', 'thru', 'becomes', 'where', 'along', 'during', 'whole', 'him', 'once', 
             'to', 'afterwards', 'back', 'its', 'get', 'rather', 'because', 'hundred', 
             'make', 'see', 'thereafter', 'done', 'thereupon', 'had', '‘ve', 'ours', 
             'yours', 'much', 'an', 'per', 'whose', 'fifty', 'myself', 'take', 'less', 
             'whom', 'yourselves', 'when', 'herself', '’d',
             # Additional tokens appended after the general list.
             'edu', 'university', 'article', 'writes', 'posting', 'nntp', 'host', 
             'organization', 'subject', 'state', 'com', 'netcom', 'uk', 'ac', 'cs', 
             'caltech', 'gov', 'jpl' }

In [None]:
# TF-IDF features for the newsgroup documents: custom stop words removed,
# terms kept only when min_df=5 is satisfied.
tfidf = TfidfVectorizer(min_df=5, stop_words=list(STOPWORDS))
vec = tfidf.fit_transform(news.data)

In [None]:
%pip install tmtoolkit

In [None]:
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
import numpy as np

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` two lines are printed: a header
    ``Topic <top word> (<two-digit topic index>):`` followed by the
    ``no_top_words`` highest-weighted feature names, each wrapped in single
    quotes, separated by spaces, in descending weight order.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterable of per-topic weight vectors; each vector must provide
        ``argsort()`` (e.g. the rows of a NumPy array).
    feature_names : sequence of str
        Vocabulary; position ``i`` names entry ``i`` of each topic vector.
    no_top_words : int
        Number of words to list per topic.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    for topic_idx, topic in enumerate(model.components_):
        # Sort once per topic (ascending weight) and reuse the ordering for
        # both the header word and the word list.
        order = topic.argsort()
        print("Topic %s (%02d):" % (feature_names[order[-1]], topic_idx))
        # Walk the ordering backwards to get the heaviest words first.
        top_indices = order[:-no_top_words - 1:-1]
        print(" ".join("'" + feature_names[i] + "'" for i in top_indices))

In [None]:
from sklearn.decomposition import NMF
# Factorise the TF-IDF matrix into 10 NMF topics (fixed seed) and print the
# ten heaviest words of each topic.
nmf = NMF(random_state=42, n_components=10).fit(vec)
display_topics(nmf, tfidf.get_feature_names_out(), 10)

In [None]:
%pip install gensim

In [None]:
# c_v coherence of the current NMF topics, evaluated on the top 25 words per
# topic; the cell displays the per-topic values together with their mean.
nmf_vocab = np.array(tfidf.get_feature_names_out())
nmf_texts = tfidf.inverse_transform(vec)
res = metric_coherence_gensim(
    measure='c_v',
    top_n=25,
    topic_word_distrib=nmf.components_,
    dtm=vec,
    vocab=nmf_vocab,
    texts=nmf_texts,
)
res, sum(res)/len(res)

In [None]:
%pip install tqdm

In [None]:
from tqdm.auto import trange
# Sweep the NMF topic count from 5 to 20 and collect (n, mean c_v coherence).
# Note: every iteration rebinds the notebook-level names `nmf` and `res`.
texts = tfidf.inverse_transform(vec)
coh = []
for n in trange(5, 21):
    nmf = NMF(n_components=n, random_state=42).fit(vec)
    res = metric_coherence_gensim(
        measure='c_v',
        top_n=25,
        topic_word_distrib=nmf.components_,
        dtm=vec,
        vocab=np.array(tfidf.get_feature_names_out()),
        texts=texts,
    )
    mean_coherence = sum(res) / len(res)
    coh.append((n, mean_coherence))

In [None]:
import pandas as pd
# Bar chart of the mean c_v coherence per NMF topic count.
coh_frame = pd.DataFrame(coh, columns=["n", "coherence"])
coh_frame.set_index("n").plot.bar()

In [None]:
# Refit NMF with 11 topics (fixed seed) and print the ten heaviest words of
# each topic.
nmf = NMF(random_state=42, n_components=11).fit(vec)
display_topics(nmf, tfidf.get_feature_names_out(), 10)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Term-frequency features (IDF weighting switched off) with the custom stop
# words removed; this matrix is the input of the LDA models below.
cv = TfidfVectorizer(use_idf=False, stop_words=list(STOPWORDS))
cvec = cv.fit_transform(news.data)

In [None]:
# Sweep the LDA topic count and collect (n, mean c_v coherence) pairs.
# Note: every iteration rebinds the notebook-level names `lda` and `res`.
# NOTE(review): this range stops at 19 while the NMF sweep runs to 20 --
# confirm whether the difference is intended.
texts = cv.inverse_transform(cvec)
tcoh = []
for n in trange(5, 20):
    lda = LatentDirichletAllocation(n_components=n, random_state=42).fit(cvec)
    res = metric_coherence_gensim(
        measure='c_v',
        top_n=25,
        topic_word_distrib=lda.components_,
        dtm=cvec,
        vocab=np.array(cv.get_feature_names_out()),
        texts=texts,
    )
    mean_coherence = sum(res) / len(res)
    tcoh.append((n, mean_coherence))

In [None]:
# Bar chart of the mean c_v coherence per LDA topic count. This must read
# `tcoh` (the LDA sweep); `coh` holds the NMF sweep that was already plotted.
pd.DataFrame(tcoh, columns=["n", "coherence"]).set_index("n").plot.bar()

In [None]:
# Fit LDA with 11 topics (fixed seed) on the term-frequency matrix and print
# the ten heaviest words of each topic.
lda = LatentDirichletAllocation(random_state=42, n_components=11).fit(cvec)
display_topics(lda, cv.get_feature_names_out(), 10)