In [1]:
from artm_utils.optimizations import *
from artm_utils.calculations import *
from artm_utils.dataset_preparations import *
from artm_utils.regularizers import *
from sklearn.datasets import fetch_20newsgroups
import numpy as np

In [2]:
# Load an 8-newsgroup slice of 20 Newsgroups (both train and test splits),
# with headers, footers and quoted replies stripped from every post.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=[
        # rec.*
        'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey',
        # sci.*
        'sci.crypt', 'sci.electronics',
        'sci.med', 'sci.space',
    ],
)

In [3]:
# Convert the sklearn bunch into the structures used by the optimizers.
# train_n_dw_matrix is 2-D (its shape is unpacked as D, W later) and
# num_2_token is indexed by token number to recover the token string.
# NOTE(review): token_2_num is presumably the inverse mapping and doc_targets
# the per-document newsgroup labels -- confirm in artm_utils.
(
    train_n_dw_matrix,
    token_2_num,
    num_2_token,
    doc_targets,
) = prepare_sklearn_dataset(dataset, calc_cooccurences=False)

Processed:  0 documents from 7931
Processed:  500 documents from 7931
Processed:  1000 documents from 7931
Processed:  1500 documents from 7931
Processed:  2000 documents from 7931
Processed:  2500 documents from 7931
Processed:  3000 documents from 7931
Processed:  3500 documents from 7931
Processed:  4000 documents from 7931
Processed:  4500 documents from 7931
Processed:  5000 documents from 7931
Processed:  5500 documents from 7931
Processed:  6000 documents from 7931
Processed:  6500 documents from 7931
Processed:  7000 documents from 7931
Processed:  7500 documents from 7931
Nonzero values: 399030


In [6]:
# D = number of rows, W = number of columns of the count matrix
# (presumably documents x vocabulary tokens -- confirm in artm_utils).
(D, W) = train_n_dw_matrix.shape

In [7]:
# How many of the highest-weight tokens are printed for each topic.
top_size = 10
# Number of topics: phi is initialised as (T, W) and theta as (D, T).
T = 10

In [8]:
# Fixed seed so the random phi initialisation is reproducible.
random_gen = np.random.RandomState(seed=42)

# phi starts from uniform random float64 counters of shape (T, W).
# NOTE(review): get_prob_matrix_by_counters presumably normalises counters
# into probability distributions -- confirm in artm_utils.
phi_matrix = get_prob_matrix_by_counters(
    random_gen.uniform(low=0.0, high=1.0, size=(T, W)).astype(np.float64)
)

# theta starts from all-ones float64 counters of shape (D, T).
theta_matrix = get_prob_matrix_by_counters(
    np.ones((D, T), dtype=np.float64)
)

In [9]:
# Object array holding the same create_reg_lda(0., 0.) result in each of its
# 100 slots; the length matches the iters_count=100 passed to the optimizers
# (presumably one regularizer per iteration -- confirm in artm_utils).
regularization_list = np.empty(100, dtype=object)
regularization_list[...] = create_reg_lda(0., 0.)

In [10]:
# Run em_optimization for 100 iterations from the shared initial matrices.
# NOTE(review): phi_matrix / theta_matrix are reused by the later optimizer
# cells; if the optimizer mutates them in place, those runs will not start
# from the same initialisation -- confirm in artm_utils.
phi, theta = em_optimization(
    n_dw_matrix=train_n_dw_matrix,
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=100,
    params={}
)

# Report the top_size highest-weight tokens of every topic (row t of phi).
# Single-argument print(...) calls and range() produce the same output on
# Python 2 and Python 3, so the cell no longer relies on the Python 2-only
# print statement and xrange.
for t in range(T):
    # argpartition places the top_size largest entries in the last top_size
    # positions, in unspecified order (the tokens are not sorted by weight).
    top = np.argpartition(phi[t, :], -top_size)[-top_size:]
    print(t)
    for w in top:
        # Concatenate instead of passing two arguments: print('\t', x) would
        # insert a space after the tab on Python 3.
        print('\t' + num_2_token[w])
    print('')

Iters time 9.62147188187
0
	drug/NN
	also/RB
	health/NN
	cancer/NN
	doctor/NN
	medical/JJ
	study/NN
	person/NN
	patient/NN
	disease/NN

1
	person/NN
	phone/NN
	system/NN
	key/JJ
	law/NN
	encryption/NN
	key/NN
	government/NN
	chip/NN
	used/VB

2
	com/NN
	available/JJ
	email/NN
	anonymous/JJ
	internet/NN
	post/VB
	edu/NN
	information/NN
	list/NN
	mail/NN

3
	thing/NN
	thank/NN
	work/VB
	used/VB
	anyone/NN
	use/VB
	get/VB
	make/VB
	time/NN
	know/VB

4
	mile/NN
	problem/NN
	bike/NN
	used/VB
	buy/VB
	new/JJ
	also/RB
	engine/NN
	get/VB
	car/NN

5
	go/VB
	play/VB
	last/JJ
	year/NN
	win/VB
	player/NN
	get/VB
	game/NN
	team/NN
	db/NN

6
	moon/NN
	satellite/NN
	orbit/NN
	year/NN
	mission/NN
	launch/NN
	earth/NN
	shuttle/NN
	system/NN
	space/NN

7
	picture/NN
	energy/NN
	sky/NN
	com/NN
	light/NN
	theory/NN
	book/NN
	keyboard/NN
	see/VB
	edu/NN

8
	pt/NN
	shot/NN
	play/VB
	game/NN
	pit/NN
	st/NN
	team/NN
	hockey/NN
	goal/NN
	period/NN

9
	take/VB
	time/NN
	make/VB
	go/VB
	say/VB
	think/VB
	person/

In [11]:
# Run artm_thetaless_em_optimization for 100 iterations with the
# 'use_B_cheat' option switched off, from the same initial matrices and
# regularizer list as the other optimizer cells.
# NOTE(review): if the optimizers mutate phi_matrix / theta_matrix in place,
# this run does not start from the original initialisation -- confirm.
phi, theta = artm_thetaless_em_optimization(
    n_dw_matrix=train_n_dw_matrix,
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=100,
    params={'use_B_cheat': False}
)

# Report the top_size highest-weight tokens of every topic (row t of phi).
# Single-argument print(...) calls and range() produce the same output on
# Python 2 and Python 3, so the cell no longer relies on the Python 2-only
# print statement and xrange.
for t in range(T):
    # argpartition places the top_size largest entries in the last top_size
    # positions, in unspecified order (the tokens are not sorted by weight).
    top = np.argpartition(phi[t, :], -top_size)[-top_size:]
    print(t)
    for w in top:
        # Concatenate instead of passing two arguments: print('\t', x) would
        # insert a space after the tab on Python 3.
        print('\t' + num_2_token[w])
    print('')

Iters time 11.2615590096
0
	cancer/NN
	doctor/NN
	drug/NN
	health/NN
	patient/NN
	effect/NN
	disease/NN
	cause/VB
	study/NN
	medical/JJ

1
	use/NN
	key/JJ
	message/NN
	bit/NN
	law/NN
	phone/NN
	encryption/NN
	key/NN
	government/NN
	chip/NN

2
	moon/NN
	shuttle/NN
	news/NN
	satellite/NN
	mission/NN
	orbit/NN
	list/NN
	earth/NN
	email/NN
	space/NN

3
	anyone/NN
	many/JJ
	know/VB
	time/NN
	want/VB
	person/NN
	problem/NN
	find/VB
	used/VB
	also/RB

4
	mile/NN
	low/JJ
	price/NN
	speed/NN
	power/NN
	car/NN
	buy/VB
	sell/VB
	engine/NN
	bike/NN

5
	hit/VB
	season/NN
	db/NN
	lose/VB
	player/NN
	win/VB
	year/NN
	team/NN
	last/JJ
	game/NN

6
	service/NN
	provide/VB
	company/NN
	available/JJ
	program/NN
	mail/NN
	information/NN
	include/VB
	new/JJ
	system/NN

7
	large/JJ
	world/NN
	article/NN
	book/NN
	part/NN
	com/NN
	post/VB
	call/VB
	group/NN
	edu/NN

8
	pit/NN
	shot/NN
	series/NN
	play/VB
	game/NN
	st/NN
	period/NN
	team/NN
	hockey/NN
	goal/NN

9
	even/RB
	well/RB
	think/VB
	good/JJ
	take/VB
	

In [12]:
# Run artm_thetaless_em_optimization for 100 iterations with the
# 'use_B_cheat' option switched on, from the same initial matrices and
# regularizer list as the other optimizer cells.
# NOTE(review): if the optimizers mutate phi_matrix / theta_matrix in place,
# this run does not start from the original initialisation -- confirm.
phi, theta = artm_thetaless_em_optimization(
    n_dw_matrix=train_n_dw_matrix,
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=100,
    params={'use_B_cheat': True}
)

# Report the top_size highest-weight tokens of every topic (row t of phi).
# Single-argument print(...) calls and range() produce the same output on
# Python 2 and Python 3, so the cell no longer relies on the Python 2-only
# print statement and xrange.
for t in range(T):
    # argpartition places the top_size largest entries in the last top_size
    # positions, in unspecified order (the tokens are not sorted by weight).
    top = np.argpartition(phi[t, :], -top_size)[-top_size:]
    print(t)
    for w in top:
        # Concatenate instead of passing two arguments: print('\t', x) would
        # insert a space after the tab on Python 3.
        print('\t' + num_2_token[w])
    print('')

Iters time 11.3745319843
0
	cancer/NN
	doctor/NN
	cause/VB
	drug/NN
	health/NN
	disease/NN
	study/NN
	patient/NN
	effect/NN
	medical/JJ

1
	give/VB
	number/NN
	find/VB
	want/VB
	many/JJ
	new/JJ
	also/RB
	person/NN
	know/VB
	make/VB

2
	name/NN
	group/NN
	information/NN
	post/VB
	available/JJ
	list/NN
	send/VB
	com/NN
	edu/NN
	mail/NN

3
	current/JJ
	problem/NN
	power/NN
	type/NN
	line/NN
	help/VB
	thank/NN
	work/VB
	used/VB
	anyone/NN

4
	mile/NN
	road/NN
	drive/VB
	water/NN
	price/NN
	buy/VB
	bike/NN
	engine/NN
	speed/NN
	car/NN

5
	key/JJ
	db/NN
	law/NN
	phone/NN
	system/NN
	encryption/NN
	government/NN
	key/NN
	bit/NN
	chip/NN

6
	orbit/NN
	university/NN
	science/NN
	center/NN
	mission/NN
	satellite/NN
	earth/NN
	program/NN
	system/NN
	space/NN

7
	bill/NN
	soon/RB
	pitcher/NN
	theory/NN
	field/NN
	ball/NN
	position/NN
	run/NN
	book/NN
	hit/VB

8
	st/NN
	goal/NN
	player/NN
	win/VB
	game/NN
	hockey/NN
	team/NN
	period/NN
	season/NN
	play/VB

9
	say/VB
	year/NN
	take/VB
	good/JJ
	time