In [1]:
# --- Imports (grouped once: third-party first, then project-local) ----------
from gensim import corpora
from gensim.models import FastText
from gensim.models.fasttext import save_facebook_model

from settings.common import load_flat_dataset
from tm_pipeline.etndmallet import eTndMallet
from tm_pipeline.tndmallet import TndMallet

# --- Configuration -----------------------------------------------------------
# Name of the dataset; used to derive both the input CSV path and the name of
# the FastText model file written below.
dataset_name = 'sample_tweets'
dataset_path = 'data/{}.csv'.format(dataset_name)

# Locations of the Mallet launchers used by the models in later cells.
tnd_path = 'mallet-tnd/bin/mallet'
etnd_path = 'mallet-etnd/bin/mallet'
mallet_path = 'mallet-2.0.8/bin/mallet'

# --- Bag-of-words corpus -----------------------------------------------------
# Load the documents, build a gensim dictionary over them, prune it with
# filter_extremes() at its default thresholds, and convert every document to
# its bag-of-words representation.
dataset = load_flat_dataset(dataset_path, delimiter=' ')
dictionary = corpora.Dictionary(dataset)
dictionary.filter_extremes()
corpus = [dictionary.doc2bow(doc) for doc in dataset]

# --- FastText embeddings -----------------------------------------------------
# NOTE(review): the dataset is reloaded here WITHOUT delimiter=' ', so `dataset`
# is rebound to whatever load_flat_dataset produces with its default delimiter.
# `corpus`/`dictionary` above were built from the first load, while later cells
# that read `dataset` see this second load -- confirm this is intentional.
dataset = load_flat_dataset(dataset_path)

# NOTE(review): min_count=50 looks high for this sample (the training logs in
# later cells report "total tokens: 265", and the eTND log prints
# "Starting Embedding Space Size: 0") -- confirm the resulting vocabulary is
# not empty.
ft = FastText(sentences=dataset, vector_size=100, min_count=50)

# Persist the embeddings in Facebook's binary format; this file name is what
# the eTND cell passes as `embedding_path`.
save_facebook_model(ft, 'local_{}_ft.bin'.format(dataset_name))

In [2]:
# Construct a TndMallet model over the bag-of-words corpus using the launcher
# at `tnd_path`. Judging by the captured output, construction also kicks off
# training (the log prints LL/token progress right after this call).
model1 = TndMallet(
    tnd_path,
    corpus,
    num_topics=30,
    id2word=dictionary,
    workers=4,
    alpha=50,
    beta=0.01,
    skew=25,
    noise_words_max=200,
    iterations=1000,
)

Mallet NFT: 30 topics, 5 topic bits, 11111 topic mask
Data loaded.
Max Noise Dist as of Initialization: 4
Max Topic Value as of Initialization: 2
skew: 25.0
max tokens: 5
total tokens: 265
<10> LL/token: -7.85315
<20> LL/token: -7.62399
<30> LL/token: -7.51149
<40> LL/token: -7.67828

0	1.66667	nevertrump party 
1	1.66667	donald$trump makes 
2	1.66667	great delegates trump2016 america 
3	1.66667	president 
4	1.66667	convention delegates 
5	1.66667	donald$trump donald 
6	1.66667	republican party rncincle clinton 
7	1.66667	america campaign 
8	1.66667	pence donald rncincle great 
9	1.66667	trump2016 makes 
10	1.66667	rncincle terrible night clinton 
11	1.66667	make 
12	1.66667	clinton 
13	1.66667	convention pence people 
14	1.66667	rncincle america 
15	1.66667	hillary make 
16	1.66667	donald$trump night 
17	1.66667	donald$trump convention 
18	1.66667	campaign floor 
19	1.66667	trumptrain 
20	1.66667	vote floor 
21	1.66667	trumptrain republican floor 
22	1.66667	melania rncincle great 
23

In [3]:
# Construct an eTndMallet model over the same corpus/dictionary, using the
# launcher at `etnd_path`. In addition to the TndMallet-style settings it is
# given the FastText binary saved by the setup cell plus the `tau` and
# `closest_x_words` settings.
model2 = eTndMallet(
    etnd_path,
    corpus,
    num_topics=30,
    id2word=dictionary,
    workers=4,
    alpha=50,
    beta=0.01,
    skew=25,
    noise_words_max=200,
    tau=200,
    embedding_path='local_sample_tweets_ft.bin',
    closest_x_words=3,
    iterations=1000,
)

Mallet eNFT: 30 topics, 5 topic bits, 11111 topic mask
Data loaded.
Max Noise Dist as of Initialization: 9
Max Topic Value as of Initialization: 2
Starting Embedding Space Size: 0
skew: 25.0
max tokens: 5
total tokens: 265
<10> LL/token: -7.79314
<20> LL/token: -7.65264
<30> LL/token: -7.85008
<40> LL/token: -7.5657

0	1.66667	rncincle republican president 
1	1.66667	make donald$trump 
2	1.66667	trumptrain trump2016 rncincle 
3	1.66667	melania president 
4	1.66667	america melania 
5	1.66667	donald$trump make 
6	1.66667	floor 
7	1.66667	vote night nevertrump 
8	1.66667	convention donald$trump 
9	1.66667	party donald$trump 
10	1.66667	rncincle 
11	1.66667	republican nevertrump delegates 
12	1.66667	rncincle vote 
13	1.66667	donald$trump party hillary 
14	1.66667	convention 
15	1.66667	hillary america 
16	1.66667	lost campaign melania 
17	1.66667	great 
18	1.66667	people donald$trump 
19	1.66667	donald terrible 
20	1.66667	donald$trump lost 
21	1.66667	clinton gopconvention 
22	1.66667	te

In [5]:
# Pull up to 10 topics (20 words each, unformatted) from the first model.
topics = model1.show_topics(num_topics=10, num_words=20, formatted=False)

# Load the model's noise distribution and rank its entries by value,
# largest first, as a list of (key, value) pairs.
noise = model1.load_noise_dist()
noise_list = sorted(
    ((word, noise[word]) for word in noise.keys()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [6]:
from tm_pipeline.nlda import NLDA

# Build (and, with run=True, presumably execute immediately -- the captured
# output shows a Mallet training log) an NLDA model directly from `dataset`.
# It is given both the TND launcher and the stock Mallet launcher, a fixed
# random seed, and 'results/' as its save location.
model = NLDA(
    dataset=dataset,
    tnd_k=30,
    tnd_alpha=50,
    tnd_beta0=0.01,
    tnd_beta1=25,
    tnd_noise_words_max=200,
    tnd_iterations=1000,
    lda_iterations=1000,
    lda_k=30,
    nlda_phi=10,
    nlda_topic_depth=100,
    top_words=20,
    save_path='results/',
    mallet_tnd_path=tnd_path,
    mallet_lda_path=mallet_path,
    random_seed=1824,
    run=True,
)

Mallet NFT: 30 topics, 5 topic bits, 11111 topic mask
Data loaded.
Max Noise Dist as of Initialization: 2
Max Topic Value as of Initialization: 2
skew: 25.0
max tokens: 5
total tokens: 265
<10> LL/token: -7.92471
<20> LL/token: -7.68186
<30> LL/token: -7.54292
<40> LL/token: -7.55353

0	1.66667	melania vote delegates 
1	1.66667	floor republican 
2	1.66667	melania 
3	1.66667	convention rncincle 
4	1.66667	party 
5	1.66667	make nevertrump melania 
6	1.66667	convention america 
7	1.66667	republican gopconvention 
8	1.66667	great trumptrain 
9	1.66667	trump2016 republican 
10	1.66667	delegates party president 
11	1.66667	rncincle 
12	1.66667	donald$trump america terrible 
13	1.66667	donald$trump pence 
14	1.66667	vote lost 
15	1.66667	president rncincle 
16	1.66667	terrible 
17	1.66667	hillary 
18	1.66667	donald$trump make donald 
19	1.66667	clinton 
20	1.66667	donald$trump convention 
21	1.66667	people 
22	1.66667	rncincle 
23	1.66667	pence republican 
24	1.66667	makes nevertrump 
25	1.66

In [25]:
# Re-create the TndMallet model with a smaller configuration (15 topics,
# 150 iterations). This rebinds `model1`, replacing the 30-topic model
# created in the earlier cell.
model1 = TndMallet(
    tnd_path,
    corpus,
    num_topics=15,
    id2word=dictionary,
    workers=4,
    alpha=50,
    beta=0.01,
    skew=25,
    noise_words_max=200,
    iterations=150,
)

Mallet NFT: 15 topics, 4 topic bits, 1111 topic mask
Data loaded.
Max Noise Dist as of Initialization: 3
Max Topic Value as of Initialization: 4
skew: 25.0
max tokens: 5
total tokens: 265
<10> LL/token: -7.77429
<20> LL/token: -7.27543
<30> LL/token: -7.07744
<40> LL/token: -7.03081

0	3.33333	trump2016 pence terrible vote 
1	3.33333	makes president night make melania 
2	3.33333	rncincle 
3	3.33333	vote rncincle party 
4	3.33333	great clinton lost vote 
5	3.33333	donald$trump floor 
6	3.33333	republican make terrible night 
7	3.33333	people delegates clinton vote 
8	3.33333	melania 
9	3.33333	gopconvention trumptrain donald hillary makes night 
10	3.33333	nevertrump campaign hillary 
11	3.33333	america make 
12	3.33333	donald$trump donald america 
13	3.33333	convention president 
14	3.33333	convention party donald 

<50> LL/token: -6.91312
<60> LL/token: -7.20881
<70> LL/token: -7.00371
<80> LL/token: -6.94161
<90> LL/token: -6.90848

0	3.33333	terrible delegates night hillary 
1	3.333