In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from tqdm import tqdm

In [11]:
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS, analyzer='word', binary=True, min_df=10, max_df=.04)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.04, max_features=None, min_df=10,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [12]:
vectorizer.vocabulary_
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 10299)

In [13]:
def lda(X_train, n_topics, alpha, beta, n_iter=10):
    n_kw = np.zeros((n_topics, X_train.shape[1]))   #счетчики
    n_dk = np.zeros((X_train.shape[0], n_topics))
    n_k = np.zeros(n_topics)
    docs, words = X_train.nonzero()
    z = np.random.choice(n_topics, len(docs))
    for doc, word, i in zip(docs, words, z):
        n_dk[doc, i] += 1
        n_kw[i, word] += 1
        n_k[i] += 1
    for cur_iter in tqdm(range(n_iter)):
        for i in range(len(docs)):
            n_dk[docs[i], z[i]] -= 1
            n_kw[z[i], words[i]] -= 1
            n_k[z[i]] -= 1
            p = (n_dk[docs[i], :] + alpha[:]) * (n_kw[:, words[i]] + beta[words[i]]) / (n_k[:] + beta.sum())
            z[i] = np.random.choice(np.arange(n_topics), p=p / p.sum())
            n_dk[docs[i], z[i]] += 1
            n_kw[z[i], words[i]] += 1
            n_k[z[i]] += 1
    return z, n_kw, n_dk, n_k

Запускаем модель:

In [14]:
n_topics = 20
z, n_kw, n_dk, n_k = lda(X_train, n_topics, np.ones(n_topics), np.ones(X_train.shape[1]), 50)

100%|██████████████████████████████████████████| 50/50 [30:58<00:00, 37.18s/it]


In [15]:
top_words = np.argsort(n_kw, axis=1)[:, :-11:-1]
for i in range(20):
    doc = np.zeros((1, X_train.shape[1]))
    for j in top_words[i]:
        doc[0, j] = 1
    print('Topic №{}:\t{}'.format(i, '\t'.join(vectorizer.inverse_transform(doc)[0])))

Topic №0:	comments	details	expect	important	later	longer	note	original	possibly	provide
Topic №1:	bike	buy	car	cars	engine	miles	ride	road	speed	turn
Topic №2:	children	country	israel	israeli	jews	killed	land	military	population	war
Topic №3:	11	14	18	24	25	45	50	al	hi	max
Topic №4:	anybody	black	cheers	comes	couple	especially	regards	reply	sorry	wondering
Topic №5:	chip	clinton	clipper	encryption	federal	key	keys	law	private	public
Topic №6:	couple	deleted	difference	goes	guess	oh	sorry	sort	sounds	stuff
Topic №7:	assume	came	nice	oh	ok	reading	simply	somebody	sorry	stuff
Topic №8:	12	game	games	league	play	player	players	season	team	win
Topic №9:	agree	bible	christ	christian	christians	jesus	man	religion	saying	word
Topic №10:	card	computer	disk	mac	memory	monitor	pc	price	sale	video
Topic №11:	came	days	happened	home	left	night	saw	told	took	went
Topic №12:	banks	cause	disease	effect	gordon	medical	pitt	soon	surrender	usually
Topic №13:	area	black	instead	looked	recall	rest	saw	smal

Получили следующие топики:

1) Машины
2) Криптография
3) Спорт
4) Религия
5) Компьютер
6) Наука
7) Работа
8) Сети
9) Преступления
10) Войны 

В исходном датасете присутствуют все эти темы.
При увеличении количества итераций или расширении словаря мы получим более точные данные, но проблема в том, что это будет занимать достаточно много времени.