# Классная Работа

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [3]:
newsgroups_train.target[:10]

array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [4]:
n = 854
print('Topic = {0}\n'.format(newsgroups_train.target_names[newsgroups_train.target[n]]))
print(newsgroups_train.data[n])

Topic = rec.motorcycles

hey... I'm pretty new to the wonderful world of motorcycles... I just
bought
a used 81 Kaw KZ650 CSR from a friend.... I was just wondering what kind of

saddle bags I could get for it (since I know nothing about them)  are there
bags for the gas tank?  how much would some cost, and how much do they
hold?
thanks for your advice!!!  I may be new to riding, but I love it
already!!!!
:)




In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

vectorizer = CountVectorizer(lowercase=True, stop_words=ENGLISH_STOP_WORDS,
                             analyzer='word', binary=True)
vectorizer.fit(newsgroups_train.data)



CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
len(vectorizer.vocabulary_)

101322

In [7]:
vectorizer.vocabulary_

{'wondering': 96879,
 'enlighten': 37256,
 'car': 25717,
 'saw': 80420,
 'day': 31927,
 'door': 34741,
 'sports': 84312,
 'looked': 57247,
 'late': 55606,
 '60s': 9843,
 'early': 35902,
 '70s': 11174,
 'called': 25437,
 'bricklin': 24108,
 'doors': 34742,
 'really': 76269,
 'small': 83208,
 'addition': 16806,
 'bumper': 24583,
 'separate': 81450,
 'rest': 77676,
 'body': 23430,
 'know': 54493,
 'tellme': 87913,
 'model': 62594,
 'engine': 37208,
 'specs': 84050,
 'years': 99608,
 'production': 73174,
 'history': 46690,
 'info': 49800,
 'funky': 41874,
 'looking': 57250,
 'mail': 59071,
 'fair': 39296,
 'number': 66680,
 'brave': 23973,
 'souls': 83779,
 'upgraded': 92389,
 'si': 82337,
 'clock': 27889,
 'oscillator': 68519,
 'shared': 81848,
 'experiences': 38637,
 'poll': 72039,
 'send': 81378,
 'brief': 24125,
 'message': 60923,
 'detailing': 33127,
 'procedure': 73122,
 'speed': 84088,
 'attained': 20236,
 'cpu': 30233,
 'rated': 75904,
 'add': 16791,
 'cards': 25769,
 'adapters': 1

In [8]:
vectorizer.vocabulary_.get('car')

25717

In [12]:
text = 'I was wondering if anyone out there could enlighten me on this car I saw'
x = vectorizer.transform([text])
type(x)
x.data
x.nonzero()
x.toarray()
vectorizer.inverse_transform(x)
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_train.shape
X_train.nnz / np.prod(X_train.shape)

0.0006593137467596179

# Задача

Запустить модель LDA и Gibbs Sampling с числов тегов 20. Вывести топ-10 слов по каждому тегу. Соотнести полученные теги с тегами из датасета, сделать выводы

In [0]:
from tqdm import tqdm
tags = 20

n_dk=np.zeros(tags*X_train.shape[0]).reshape(X_train.shape[0],tags)
n_kw=np.zeros(tags*X_train.shape[1]).reshape(tags,X_train.shape[1])
n_k=np.zeros(tags)

docs, words = X_train.nonzero()
z = np.random.choice(tags, len(docs))
    
for i, j, k in zip(docs, words, z):
    n_dk[i, k] += 1
    n_kw[k, j] += 1
    n_k[k] += 1

In [0]:
def LDA(n_dk, n_kw, n_k, z, docs, words, tags, alpha, beta, n_iter):    
    for i in tqdm(range(n_iter)):
        for j in range(len(docs)):
            cur_word = words[j]
            cur_doc = docs[j]
            cur_tag = z[j]
            n_dk[cur_doc, cur_tag] -= 1
            n_kw[cur_tag, cur_word] -= 1
            n_k[cur_tag] -= 1
            p = (n_dk[cur_doc, :] + alpha) * (n_kw[:, cur_word] + beta[cur_word]) / (n_k + beta.sum())
            z[j] = np.random.choice(np.arange(tags), p=p / p.sum())
            n_dk[cur_doc, z[j]] += 1
            n_kw[z[j], cur_word] += 1
            n_k[z[j]] += 1
    return z, n_kw, n_dk, n_k

In [15]:

z,n_kw, n_dk, n_k = LDA(n_dk, n_kw, n_k, z, docs, words, 20, 2*np.ones(20), 2*np.ones(X_train.shape[1]), 50)

100%|██████████| 50/50 [1:00:42<00:00, 72.86s/it]


In [17]:
x = np.argsort(n_kw, axis=1)[:, -10:]
for i in range(20):
    matrix = np.zeros((1, X_train.shape[1]))
    for j in x[i]:
        matrix[0, j] = 1
    print('Tag {}:\t{}'.format(i+1, '\t'.join(vectorizer.inverse_transform(matrix)[0])))

Tag 1:	does	don	good	just	know	like	people	think	time	use
Tag 2:	client	head	hp	morals	propaganda	pt	rumor	shit	support	vga
Tag 3:	14	24	34	hp	mi	mn	mr	mt	mw	pl
Tag 4:	49	appreciate	campaign	icons	leave	looking	mailing	trick	uh	writes
Tag 5:	ah	andrew	experiences	fixing	plenty	pov	prevent	requested	spirituality	texture
Tag 6:	cheer	dropping	funny	goal	looking	modem	owner	racist	reference	white
Tag 7:	486	adaptor	boy	canadian	cheers	effort	jumpers	registration	settings	steve
Tag 8:	class	diamond	download	excuse	formats	pieces	probably	selling	source	windows
Tag 9:	bronze	foot	handheld	kids	msg	quite	risk	sale	sf	switch
Tag 10:	286	anybody	cmos	cubs	mary	miller	ob	page	samuelsson	steve
Tag 11:	126	3d	amendment	andrew	appreciated	inputs	mean	suggestions	thanks	want
Tag 12:	125	adams	adult	asking	extract	lightly	managing	problems	reactions	temp
Tag 13:	350	article	bigger	brain	current	dave	different	dsl	edu	liked
Tag 14:	8900c	appreciated	james	life	lower	nick	sabres	smart	straight	substit