In [90]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import nltk
# Fetch the NLTK stop-word lists; stopwords.words("english") is used when building the vectorizer.
nltk.download('stopwords')
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, passed to lemma_text() in the preprocessing cell.
lemmatizer = WordNetLemmatizer()
# Corpora downloaded for the WordNet lemmatizer (WordNet itself plus the omw-1.4 package).
nltk.download('wordnet')
nltk.download('omw-1.4')

from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [131]:
def lemma_text(text, lemmataizer):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Raw document; it is split on arbitrary whitespace.
    lemmataizer : object
        Any object exposing ``lemmatize(word) -> str`` (e.g. an NLTK
        ``WordNetLemmatizer``). The parameter keeps its original spelling
        so that existing keyword callers continue to work.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces (empty string for
        empty or whitespace-only input).
    """
    # Use the lemmatizer that was passed in instead of relying on a
    # module-level `lemmatizer` global happening to exist.
    return ' '.join(lemmataizer.lemmatize(token) for token in text.split())

def myLDA(n_dk, n_kw, n_k, r_th, text, word, n_topic, alpha = 1, beta = 1, max_iter=10):
    """Run Gibbs-sampling sweeps that resample the topic of every word occurrence.

    For each occurrence ``j`` the current assignment is removed from the
    counters, a new topic is drawn with probability proportional to
    ``(n_dk[d, k] + alpha) * (n_kw[k, w] + beta) / (n_k[k] + beta * W)``
    (``W`` being the number of columns of ``n_kw``), and the counters are
    updated with the new assignment.

    ``n_dk``, ``n_kw``, ``n_k`` and ``r_th`` are all modified in place.

    Parameters
    ----------
    n_dk : ndarray, indexed as ``n_dk[document, topic]``
        Per-document topic counts.
    n_kw : ndarray, indexed as ``n_kw[topic, word]``
        Per-topic word counts.
    n_k : ndarray, indexed as ``n_k[topic]``
        Total number of occurrences assigned to each topic.
    r_th : sequence of int
        Current topic of every occurrence; same length as ``text``.
    text, word : sequences of int
        Document index and word index of every occurrence.
    n_topic : int
        Number of topics to sample from.
    alpha, beta : float
        Additive smoothing for the document-topic and topic-word counts.
    max_iter : int
        Number of full sweeps over all occurrences.

    Returns
    -------
    ndarray
        ``n_kw`` (the same object that was passed in).
    """
    # Vocabulary size comes from the count matrix itself rather than from a
    # notebook-level global, so the function is self-contained.
    n_words = n_kw.shape[1]
    topics = np.arange(n_topic)

    for _ in tqdm(range(max_iter)):
        for j in range(len(text)):
            doc, term, old_topic = text[j], word[j], r_th[j]

            # Remove the current assignment from all counters.
            n_dk[doc, old_topic] -= 1
            n_kw[old_topic, term] -= 1
            n_k[old_topic] -= 1

            # Unnormalised conditional probability of each topic; the
            # denominator is smoothed with beta * W to match the numerator.
            p = (n_dk[doc, :] + alpha) * (n_kw[:, term] + beta) / (n_k + beta * n_words)

            # Draw the new topic from the normalised distribution.
            new_topic = np.random.choice(topics, p = p / p.sum())
            r_th[j] = new_topic

            # Add the new assignment back to all counters.
            n_dk[doc, new_topic] += 1
            n_kw[new_topic, term] += 1
            n_k[new_topic] += 1
    return n_kw

In [130]:
# Training split of 20 Newsgroups with headers, footers and quoted replies removed.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

# Binary (presence/absence) bag-of-words over lowercase alphabetic tokens of
# at least two letters, with English stop words dropped and document-frequency
# cut-offs applied via min_df / max_df.
vectorizer = CountVectorizer(
    analyzer='word',
    lowercase=True,
    binary=True,
    stop_words=stopwords.words("english"),
    token_pattern=r'(?u)\b[a-z]{2,}\b',
    min_df=0.005,
    max_df=0.05,
)

# Lemmatize every document before it is vectorized.
lemma_texts = [lemma_text(doc, lemmatizer) for doc in tqdm(newsgroups_train.data)]

# Dense document-term matrix: one row per document, one column per vocabulary word.
X_train = vectorizer.fit_transform(lemma_texts).toarray()
print('After lemma: %d' % len(vectorizer.vocabulary_))

100%|██████████| 11314/11314 [00:08<00:00, 1358.53it/s]


After lemma: 2187


In [129]:
n_topic = 20
n_topwords = 10

# Seed NumPy's global RNG so the random initial topic assignment below (and
# any later sampling that draws from the same global RNG) is reproducible
# when the notebook is re-run from a fresh kernel.
np.random.seed(0)

n_docs, n_words = X_train.shape
# Count tables: topic x word, document x topic, and per-topic totals.
n_kw = np.zeros((n_topic, n_words))
n_dk = np.zeros((n_docs, n_topic))
n_k = np.zeros(n_topic)

# (document index, word index) of every non-zero entry of X_train.
text, word = X_train.nonzero()
# r_th: a random initial topic for each of those entries.
r_th = np.random.choice(n_topic, len(text))

# Accumulate the initial counts; np.add.at adds once per index occurrence,
# so repeated (row, column) pairs are counted correctly.
np.add.at(n_dk, (text, r_th), 1)
np.add.at(n_kw, (r_th, word), 1)
np.add.at(n_k, r_th, 1)

In [127]:
# Run 100 sampling sweeps. The topic count is passed as n_topic (not a
# literal) so it cannot drift from the shape of the count arrays.
# Note: myLDA updates n_dk, n_kw, n_k and r_th in place, so re-running this
# cell continues from the current state rather than starting over.
n_kw = myLDA(n_dk, n_kw, n_k, r_th, text, word, n_topic, alpha=1, beta=1, max_iter=100)

100%|██████████| 100/100 [24:41<00:00, 14.81s/it]


In [128]:
# Column indices of the n_topwords largest counts in each topic row of n_kw.
result = np.argsort(n_kw, axis=1)[:, -n_topwords:]
vocab_size = X_train.shape[1]
for i in range(n_topic):
    # One-row indicator over the vocabulary with ones at this topic's top
    # words; inverse_transform maps the non-zero columns back to their terms.
    matrix = np.zeros((1, vocab_size))
    matrix[0, result[i]] = 1
    theme_words = vectorizer.inverse_transform(matrix)[0]
    print('theme {} \t{}'.format(i + 1, '\t'.join(theme_words)))

theme 1 	cause	doctor	effect	especially	gordon	soon	surrender	test	usually	whether
theme 2 	address	advance	anybody	appreciate	appreciated	email	hi	info	reply	send
theme 3 	amount	current	difference	higher	large	low	lower	note	rate	small
theme 4 	card	computer	disk	drive	driver	mac	memory	monitor	pc	video
theme 5 	address	archive	article	list	net	order	posted	posting	request	send
theme 6 	ever	feel	free	hope	kind	pretty	quite	remember	seem	trying
theme 7 	fan	game	games	hockey	league	play	player	season	team	win
theme 8 	control	country	crime	criminal	gun	law	person	police	self	weapon
theme 9 	armenian	armenians	away	city	history	killed	men	today	turkish	war
theme 10 	bad	bike	car	drive	engine	front	mile	road	speed	turn
theme 11 	american	april	clinton	money	plan	president	public	states	support	today
theme 12 	anyway	deleted	guess	heard	oh	ok	sorry	start	stuff	wrong
theme 13 	application	code	display	ftp	image	running	software	version	window	windows
theme 14 	chip	clipper	data	encryptio