In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True, min_df = 11, max_df = .03)
vectorizer.fit(newsgroups_train.data)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.03, max_features=None, min_df=11,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=frozenset({'anything', 'hence', 'though', 'eg', 'becoming', 'own', 'ourselves', 'move', 'them', 'beyond', 'others', 'against', 'everywhere', 'he', 'herein', 'mostly', 'been', 'himself', 'none', 'show', 'yours', 'already', 'amongst', 'everyone', 'by', 'you', 'once', 'of', 'on', 'she', 'whe...d', 'find', 'otherwise', 'third', 'still', 'latter', 'seems', 'they', 'whereupon', 'that', 'might'}),
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape

(11314, 9455)

In [8]:
from tqdm import tqdm

def LDA(n, X,a, b):
    n_kw = np.zeros((n, X.shape[1]))
    n_dk = np.zeros((X.shape[0], n))
    n_k = np.zeros(n)
    
    doc, word = X.nonzero()
    t = np.random.choice(n, len(doc))
    
    for i, j, k in zip(word, doc, t):
        n_kw[k, i] += 1
        n_dk[j, k] += 1
        n_k[k] +=1

    for i in tqdm(range(50)):
        for j in range(len(doc)):
            n_kw[t[j], word[j]] -= 1
            n_dk[doc[j], t[j]] -=1
            n_k[t[j]] -=1
            
            p = (n_dk[doc[j], :] + a)*(n_kw[:,word[j]] + b[word[j]]) / (n_k + b.sum())
            t[j] = np.random.choice(np.arange(n), p=p/p.sum())
            
            n_kw[t[j], word[j]] += 1
            n_dk[doc[j], t[j]] += 1
            n_k[t[j]] += 1
            
    return n_kw, n_dk, n_k, t
            
n = 20
n_kw, n_dk, n_k, t = LDA(n, X_train, np.ones(n), np.ones(X_train.shape[1]))

100%|██████████| 50/50 [30:31<00:00, 36.52s/it]


In [15]:
words = np.argsort(n_kw)[:,:-11:-1]

for i in range(20):
    d = np.zeros((1, X_train.shape[1]))
    for j in words[i]:
        d[0, j] = 1
    print('Тема {}: \t{}'.format(i + 1, '\t'.join(vectorizer.inverse_transform(d)[0])))

Тема 1: 	chip	clinton	clipper	encryption	keys	president	private	secret	secure	security
Тема 2: 	deleted	goes	hot	looks	main	oh	sounds	suggest	wonder	wondering
Тема 3: 	couple	figure	gives	goes	haven	posted	posting	reply	sounds	unfortunately
Тема 4: 	crime	gun	guns	laws	legal	likely	police	rights	self	weapons
Тема 5: 	ca	contact	date	details	internet	message	newsgroups	posted	posting	related
Тема 6: 	bike	cars	dod	driving	engine	miles	ride	road	speed	turn
Тема 7: 	cost	earth	low	nasa	orbit	project	research	science	systems	university
Тема 8: 	appreciated	difference	exactly	hot	knows	luck	personally	reading	recall	worth
Тема 9: 	apple	board	disk	mac	memory	monitor	pc	ram	speed	video
Тема 10: 	application	code	display	files	ftp	graphics	image	server	sun	window
Тема 11: 	country	history	israel	israeli	jewish	jews	land	military	peace	war
Тема 12: 	13	17	18	19	21	22	23	24	26	27
Тема 13: 	cause	common	disease	effect	food	medical	results	taking	treatment	usually
Тема 14: 	00	asking	box	conditio

Изначальные темы: 'alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc'.

Получили пересечения по темам: спорт, религия, медицина, Middle East, компьютерная графика, транспорт (автомобили), 
программное обеспечение/компьютерная техника, космос, продажи. 