In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

newsgroups_train.target_names#категории текстов

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df = 12, max_df = .03)
vectorizer.fit(newsgroups_train.data)# сформировали сокращенный словарь!

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.03, max_features=None, min_df=12,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train[1]

<1x8821 sparse matrix of type '<class 'numpy.int64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [5]:
from tqdm import tqdm

def algorythm (K, X, it,a, b):
    # заведем счетчики nk,w, nd,k, nk, заполненные нулями
    n_k_w = np.zeros((K, X.shape[1]))
    n_d_k = np.zeros((X.shape[0], K))
    n_k = np.zeros(K)
    
    #случайным образом расставим теги словам, обновим счетчики  nk,w ,  nd,k ,  nk
    doc, word = X.nonzero()
    tags = np.random.choice(K, len(doc))
    
    for w,d,t in zip(word, doc, tags):
        n_k_w[t,w] += 1
        n_d_k[d,t] += 1
        n_k[t] +=1
        
    # пока не сойдемся к стационарному режиму:
    #для каждого  i  от 1 до  W :
    #ndi,zi−=1 ,  nzi,wi−=1 ,  nzi−=1 
    #для каждого  k  от 1 до  K :
    #вычисляем  pk=(nd,k+αk)nk,wi+βwink+βsum 
    #сэмплим новый  zi  из полученного распределения  (p1,...,pK) 
    #ndi,zi+=1 ,  nzi,wi+=1 ,  nzi+=1
    for i in tqdm(range(it)):
        for j in range(len(doc)):
            t = tags[j]
            n_k_w[t,word[j]] -= 1
            n_d_k[doc[j], t] -=1
            n_k[t] -=1
            
            p = (n_d_k[doc[j], :] + a)*(n_k_w[:,word[j]] + b[word[j]]) / \
            (n_k + b.sum())
            tags[j] = np.random.choice(np.arange(K), p = p / p.sum())
            
            n_k_w[tags[j], word[j]] += 1
            n_d_k[ doc[j], tags[j]] += 1
            n_k[tags[j]] += 1
            
    return n_k_w, n_d_k, n_k, tags
            
K = 20
n_k_w, n_d_k, n_k, tags = algorythm(K, X_train, 50, 1*np.ones(K), \
                                    1*np.ones(X_train.shape[1]))

100%|██████████| 50/50 [36:58<00:00, 43.41s/it]


In [8]:
word = np.argsort(n_k_w)[:,:-11:-1]
for k in range(20):
    a = np.zeros((1, X_train.shape[1]))
    for w in word[k]:
        a[0, w] = 1
    print('Topic {}:\t{}'.format(k, '\t'.join(vectorizer.inverse_transform(a)[0])))

Topic 0:	appreciate	company	folks	happens	love	opinions	posting	simple	sorry	wondering
Topic 1:	baseball	games	hockey	league	play	player	players	season	teams	win
Topic 2:	couple	oh	posting	posts	reply	sorry	sound	sounds	thank	week
Topic 3:	build	earth	launch	light	low	moon	nasa	orbit	project	radio
Topic 4:	al	couldn	dave	deleted	hear	internet	office	sorry	uucp	uunet
Topic 5:	13	17	18	19	21	22	23	24	27	40
Topic 6:	country	crime	force	gun	guns	israel	israeli	laws	rights	weapons
Topic 7:	bible	christ	christian	christians	church	claim	faith	jesus	religion	truth
Topic 8:	bike	bought	cars	engine	head	miles	ride	road	speed	turn
Topic 9:	cause	common	disease	effect	experience	food	medical	results	treatment	usually
Topic 10:	btw	exact	haven	hello	market	michael	radio	safety	somebody	story
Topic 11:	board	disk	dos	mac	memory	monitor	pc	ram	sale	video
Topic 12:	american	april	development	house	national	provide	research	university	washington	white
Topic 13:	difference	especially	gets	goes	heat	hot

Имеем следующие темы:
1.
2. Спорт
3.
4.Космос
5.
6.
7.Политика, оружие
8.Религия
9.Машины
10.
11.
12.Компьютер
13.
14.
15.
16.Hardware
17.
18.
19.
20.Криптография

Все они есть в исходном датасете. Если же увеличить число иттераций до 70 и немного расширить словарь можно получить более точный список, однако это требует много времени