# Note: Using control_mode = True

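1. Notebook zur Kontrolle der Richtigkeit der Datenvorverarbeitung
2. Verwendung von control_mode = True, um nur wenige Daten zu laden (Hinweis: In dem unten gezeigten Lauf ist control_mode = False gesetzt, d. h. es wird der vollständige Datensatz geladen)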

In [1]:
# Set up the TextDataLoader for the 20 Newsgroups data source: fetch the data
# via scikit-learn, tokenize it and strip special characters.
from src.prepare_dataset import TextDataLoader
import pandas as pd

# Tunable settings, named once so that the data source and the control-mode
# switch are easy to find and cannot drift apart between calls.
DATA_SOURCE = "20newsgroups"
CONTROL_MODE = False  # per the notebook intro, True would load only a small sample
MAX_DF = 0.7
min_df = 100
use_bert_embedding = False
stopwords_filter = True

textsloader = TextDataLoader(source=DATA_SOURCE, train_size=None, test_size=None)
textsloader.load_tokenize_texts(DATA_SOURCE, control_mode=CONTROL_MODE)
textsloader.show_example_raw_texts(n_docs=2)

# Preprocess the texts with the options selected above.
textsloader.preprocess_texts(length_one_remove=True,
                             punctuation_lower=True,
                             stopwords_filter=stopwords_filter,
                             use_bert_embedding=use_bert_embedding)

# Split into train, test and validation sets and build the vocabulary from
# the training set.
textsloader.split_and_create_voca_from_trainset(max_df=MAX_DF,
                                                min_df=min_df,
                                                stopwords_remove_from_voca=stopwords_filter)

loading texts: ...
train-size after loading: 11314
test-size after loading: 7532
finished load!
check some sample texts of the dataset after filter punctuation and digits
['From', ':', 'lerxst', '@', 'wam', '.', 'umd', '.', 'edu', '(', "where's", 'my', 'thing', ')', 'Subject', ':', 'WHAT', 'car', 'is', 'this', '!', '?', 'Nntp', 'Posting', 'Host', ':', 'rac3', '.', 'wam', '.', 'umd', '.', 'edu', 'Organization', ':', 'University', 'of', 'Maryland', ',', 'College', 'Park', 'Lines', ':', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'I', 'saw', 'the', 'other', 'day', '.', 'It', 'was', 'a', '2', 'door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s', '/', 'early', '70s', '.', 'It', 'was', 'called', 'a', 'Bricklin', '.', 'The', 'doors', 'were', 'really', 'small', '.', 'In', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'This', 'is', 'all

In [2]:
# Report the size of each split, then the total, under the label printed below
# (sizes before the later update for empty / length-one documents).
split_sizes = [
    len(textsloader.train_indices),
    len(textsloader.test_indices),
    len(textsloader.val_indices),
]
print('Dokumenten vor der Weiteraktualisierung durch empty-doc and 1-len-doc')
for split_size in split_sizes:
    print(split_size)
print(sum(split_sizes))

Dokumenten vor der Weiteraktualisierung durch empty-doc and 1-len-doc
11214
7532
100
18846


In [3]:
# Put all documents held by the loader into a one-column frame for inspection.
docs = pd.DataFrame({'text': textsloader.complete_docs})
docs

Unnamed: 0,text
0,lerxst wam umd thing subject car nntp posting ...
1,guykuo carson washington guy kuo subject si cl...
2,twillis ec ecn purdue thomas willis subject pb...
3,jgreen amber joe green subject weitek organiza...
4,jcm head cfa harvard jonathan mcdowell subject...
...,...
18841,richmond spiff princeton stupendous man subjec...
18842,smytonj alleg jim smyton subject monitors hour...
18843,hhenderson vax clarku subject game length brav...
18844,utarlg uta subject intel chmos design kit news...


In [4]:
# Build the bag-of-words representation for the ETM model.
for_lda_model = False
bow_outputs = textsloader.create_bow_and_savebow_for_each_set(for_lda_model=for_lda_model)
word2id, id2word, train_set, test_set, val_set = bow_outputs

save docs in txt...
save docs finished
train-size-after-all: 11214
test-size-after-all: 7532
validation-size-after-all: 100
test-size-after-all: 11214
test-indices-length: 11214
test-size-after-all: 100
test-indices-length: 100
test-size-after-all: 7532
test-indices-length: 7532
length train-documents-indices : 896087
length of the vocabulary: 3102


start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!

start: creating bow representation...
finised creating bow input!



In [5]:
# Alphabetically sorted list of our vocabulary words (the keys of word2id).
my_vocab = sorted(word2id)

In [6]:
# Report the size of each split, then the total, under the label printed below
# (sizes after the update for empty / length-one documents).
split_sizes = [
    len(textsloader.train_indices),
    len(textsloader.test_indices),
    len(textsloader.val_indices),
]
print('Dokumenten nach dem der Weiteraktualisierung durch empty-doc and 1-len-doc')
for split_size in split_sizes:
    print(split_size)
print(sum(split_sizes))

Dokumenten nach dem der Weiteraktualisierung durch empty-doc and 1-len-doc
11214
7532
100
18846


In [7]:
# Rebuild the documents of each split after preprocessing. They come back as
# lists of words and are used for training the word embeddings.
docs_tr, docs_t, docs_v = textsloader.get_docs_in_words_for_each_set()
train_docs_df = pd.DataFrame(
    {'text-after-preprocessing': [' '.join(tokens) for tokens in docs_tr]}
)
train_docs_df

save docs in txt...
save docs finished


Unnamed: 0,text-after-preprocessing
0,jackson defense nntp posting host university i...
1,apollo hp red police state usa nntp posting ho...
2,dartmouth brian hughes installing ram quadra r...
3,bu boston university physics department articl...
4,king eng umd doug computer design lab maryland...
...,...
11209,ac steve christians university south article a...
11210,bill cs bill info college computer science dis...
11211,bbs net bill anderson restrictions bbs public ...
11212,umich monitor add card apple computer keywords...


# Erstellung von Embeddings

In [8]:
from pathlib import Path

from src.embedding import WordEmbeddingCreator

# Output folders for prepared data and figures, keyed by the min_df setting.
save_path = Path.cwd() / f'prepared_data/min_df_{min_df}'
figures_path = Path.cwd() / f'figures/min_df_{min_df}'
save_path.mkdir(parents=True, exist_ok=True)
figures_path.mkdir(parents=True, exist_ok=True)
print(save_path)

vocab = list(word2id.keys())
model_name = "skipgram"

if model_name != "bert" and not use_bert_embedding:
    # Train the word embeddings on the training documents and store the
    # vectors of the BoW vocabulary under save_path.
    wb_creator = WordEmbeddingCreator(model_name=model_name, documents=docs_tr, save_path=save_path)
    wb_creator.train(min_count=0, embedding_size=300)
    wb_creator.create_and_save_vocab_embedding(vocab, save_path)

    # Spot check: show one embedding vector.
    # NOTE(review): `wv.vocab` is the gensim 3.x API; gensim >= 4 exposes
    # `wv.key_to_index` instead. Confirm against the pinned gensim version.
    first_word = list(wb_creator.model.wv.vocab)[0]
    first_vec = list(wb_creator.model.wv[first_word])
    print(f'{model_name} word-embedding of the word: {first_word}')
    print(f'some elements of vector: {first_vec[:5]}')
    print(f'total dim of vector: {len(first_vec)}')
    print(f'show some semantic similar words \n')

    # Query the neighbours once per word and reuse the result for both the
    # word list and the similarity list; slicing also tolerates a vocabulary
    # with fewer than two words.
    for word in vocab[:2]:
        print(f'neighbors of word: {word}')
        neighbors = list(wb_creator.find_most_similar_words(n_neighbor=5, word=word))
        print([r[0] for r in neighbors])
        print([r[1] for r in neighbors])
        print(100*"-")
else:
    print('festlegen welches Modell für word2vec soll genutzt werden!\n Wenn bert-Modell, bitte die Vocabular aktualisieren durch use_bert_embedding = True')

/home/miss-luu/Downloads/Sommersemester_2022/Data-Mining-Project/replication-topic-modelling-in-embedding-space/prepared_data/min_df_100
train word-embedding with skipgram


 14%|█▎        | 423/3102 [00:00<00:00, 4224.39it/s]

length of vocabulary from word-embedding with skipgram: 3102
length of vocabulary after creating BOW: 3102


100%|██████████| 3102/3102 [00:00<00:00, 4021.69it/s]


skipgram word-embedding of the word: jackson
some elements of vector: [0.17036203, 0.22225577, 0.018616812, 0.17042501, -0.12448632]
total dim of vector: 300
show some semantic similar words 

neighbors of word: countries
['nations', 'interests', 'policies', 'ethnic', 'governments']
[0.8291685581207275, 0.8290121555328369, 0.8282902836799622, 0.793046236038208, 0.7915384769439697]
----------------------------------------------------------------------------------------------------
neighbors of word: figures
['rare', 'legally', 'remains', 'limits', 'offered']
[0.740115761756897, 0.7281169295310974, 0.7175210118293762, 0.7142847776412964, 0.6874909400939941]
----------------------------------------------------------------------------------------------------


# Kontrolle des BoW-Formats

# Vergleich mit einer anderen ETM-Implementierung

1. Für den Vergleich wurde folgendes Paket verwendet: https://github.com/lffloyd/embedded-topic-model

In [21]:
# Read the training documents back from disk; they were already preprocessed.
corpus_file = 'prepared_data/train_docs_from_complete_docs.txt'
documents = []
with open(corpus_file) as f:
    # Iterate the file lazily instead of materialising all lines first.
    for line in f:
        # Keep everything after the FIRST ":" so that a colon inside the
        # document text cannot cut the document short.
        documents.append(line.split(":", 1)[1].strip())
print(len(documents))

11214


In [43]:
# Build the ETM input with the package's own preprocessing so that its
# vocabulary can be compared with ours.
from embedded_topic_model.utils import preprocessing

vocabulary, train_dataset, empty_test_dataset = preprocessing.create_etm_datasets(
    documents,
    min_df=min_df,  # reuse the notebook-wide threshold; the documents were already filtered with it
    max_df=0.7,
    train_size=1.0,
)

# Print every word of the package vocabulary that our vocabulary lacks;
# nothing is printed when the package vocabulary is fully covered.
# A set makes each membership test constant-time.
my_vocab_lookup = set(my_vocab)
for w in vocabulary:
    if w not in my_vocab_lookup:
        print(w)
print(vocabulary[:10])

['countries', 'piece', 'game', 'walk', 'carried', 'choice', 'lets', 'save', 'discussion', 'sex']


In [28]:
# Load our own pre-fitted embeddings for the package vocabulary and rewrite
# them in the word2vec text format that the ETM package can read.
from src.embedding import read_prefitted_embedding
import numpy as np

vocab_embedding, embedding_data = read_prefitted_embedding(model_name, vocabulary, save_path)
print(len(embedding_data[1]))
print(vocab_embedding[:10])

# One row per word, indexed by the word, so that reset_index() puts the
# token into the first column of the written file.
data = pd.DataFrame(embedding_data, index=vocab_embedding)
n_words, n_dims = data.shape
embedding_save_path = f'prepared_data/min_df_{min_df}/{model_name}_prefitted_embeddings_control_mode.txt'
np.savetxt(
    embedding_save_path,
    data.reset_index().values,
    delimiter=" ",
    header=f"{n_words} {n_dims}",  # word2vec text header: "<rows> <dimensions>"
    comments="",
    fmt=["%s"] + ["%.18e"] * n_dims,
)

# Sanity check: the written file must load as KeyedVectors.
import gensim
from gensim.models.keyedvectors import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format(embedding_save_path, binary=False)
print(vocab_embedding[0])
word_vectors.most_similar(vocab_embedding[0], topn=5)

300
['countries', 'piece', 'game', 'walk', 'carried', 'choice', 'lets', 'discussion', 'save', 'sex']
countries


[('nations', 0.8291685581207275),
 ('governments', 0.7915384769439697),
 ('economic', 0.7736542224884033),
 ('organizations', 0.7644751071929932),
 ('citizens', 0.7580311298370361)]

In [29]:
from embedded_topic_model.models.etm import ETM

# Train an ETM instance on the datasets produced by the package's preprocessing.
etm_instance = ETM(
    vocabulary,
    # Either the path to a word2vec file or a KeyedVectors instance can be passed here.
    embeddings=embedding_save_path,
    use_c_format_w2vec=True,
    num_topics=20,
    epochs=100,
    debug_mode=True,
    lr=0.002,
    # Optional. If True, ETM learns word embeddings jointly with the topic
    # embeddings. Defaults to False and must not be True when `embeddings`
    # is passed.
    train_embeddings=False,
    seed=42,
)

etm_instance.fit(train_dataset)
topics = etm_instance.get_topics(20)
topic_coherence = etm_instance.get_topic_coherence()
topic_diversity = etm_instance.get_topic_diversity()

# One line per topic, numbered from 1, followed by the two quality metrics.
for topic_number, topic_words in enumerate(topics, start=1):
    print(f'topic: {topic_number} {" ".join(topic_words)}')
print(topic_coherence)
print(topic_diversity)

Reading embeddings from original word2vec TXT file...
Topics before training: [['mit', 'laboratory', 'org', 'research', 'stanford', 'columbia', 'de', 'distribution', 'reply', 'gmt'], ['bike', 'keywords', 'posting', 'car', 'cover', 'nntp', 'advice', 'engine', 'misc', 'rear'], ['israeli', 'policy', 'id', 'columbia', 'org', 'israel', 'virginia', 'united', 'bnr', 'cc'], ['rec', 'dod', 'research', 'mil', 'news', 'group', 'newsreader', 'sci', 'corp', 'gmt'], ['fax', 'phone', 'number', 'voice', 'tel', 'box', 'call', 'germany', 'serial', 'compatible'], ['ground', 'support', 'radio', 'technology', 'applications', 'georgia', 'phone', 'international', 'fax', 'national'], ['political', 'policy', 'view', 'genocide', 'war', 'jewish', 'good', 'morality', 'armenia', 'atheists'], ['uiuc', 'cso', 'win', 'ball', 'deleted', 'clock', 'played', 'king', 'net', 'apple'], ['ac', 'office', 'ms', 'windows', 'access', 'mine', 'de', 'newsreader', 'uunet', 'uk'], ['fire', 'atf', 'frank', 'waco', 'texas', 'koresh', 

Epoch 28 - Learning Rate: 0.002 - KL theta: 5.05 - Rec loss: 466.69 - NELBO: 471.74
Epoch 29 - Learning Rate: 0.002 - KL theta: 5.15 - Rec loss: 462.68 - NELBO: 467.83
Epoch 30 - Learning Rate: 0.002 - KL theta: 5.26 - Rec loss: 463.44 - NELBO: 468.7
Topics: [['host', 'distribution', 'university', 'posting', 'nntp', 'org', 'reply', 'uk', 'mit', 'article'], ['posting', 'university', 'nntp', 'uk', 'ac', 'host', 'reply', 'mail', 'internet', 'org'], ['nntp', 'opinions', 'posting', 'university', 'article', 'host', 'writes', 'dod', 'org', 'corporation'], ['writes', 'article', 'cs', 'posting', 'host', 'university', 'nntp', 'ca', 'uiuc', 'cc'], ['call', 'system', 'number', 'problem', 'phone', 'time', 'find', 'back', 'work', 'put'], ['gun', 'government', 'fbi', 'mr', 'armed', 'waco', 'police', 'federal', 'citizens', 'turkish'], ['good', 'power', 'car', 'engine', 'make', 'bike', 'wheel', 'rear', 'front', 'buy'], ['team', 'play', 'year', 'games', 'game', 'won', 'players', 'player', 'playoffs', 'w

Epoch 59 - Learning Rate: 0.002 - KL theta: 7.38 - Rec loss: 454.71 - NELBO: 462.09
Epoch 60 - Learning Rate: 0.002 - KL theta: 7.44 - Rec loss: 453.65 - NELBO: 461.09
Topics: [['host', 'posting', 'nntp', 'distribution', 'org', 'research', 'reply', 'mit', 'university', 'center'], ['uk', 'ac', 'university', 'cwru', 'internet', 'science', 'au', 'reply', 'gov', 'cleveland'], ['opinions', 'dod', 'disclaimer', 'mine', 'bike', 'employer', 'ride', 'hey', 'care', 'expressed'], ['writes', 'article', 'cs', 'university', 'news', 'cc', 'uiuc', 'cso', 'vms', 'columbia'], ['problem', 'system', 'line', 'number', 'part', 'set', 'read', 'code', 'find', 'current'], ['government', 'gun', 'law', 'president', 'mr', 'police', 'turkish', 'rights', 'armed', 'fbi'], ['power', 'car', 'high', 'wheel', 'good', 'rear', 'left', 'low', 'speed', 'engine'], ['year', 'game', 'team', 'games', 'play', 'win', 'players', 'won', 'player', 'playoffs'], ['mail', 'ibm', 'info', 'apple', 'net', 'list', 'price', 'email', 'send',

Epoch 90 - Learning Rate: 0.002 - KL theta: 8.05 - Rec loss: 448.65 - NELBO: 456.7
Topics: [['posting', 'host', 'nntp', 'org', 'research', 'reply', 'apr', 'center', 'distribution', 'university'], ['uk', 'ac', 'university', 'cwru', 'internet', 'au', 'reply', 'case', 'western', 'posting'], ['opinions', 'bike', 'dod', 'mine', 'disclaimer', 'care', 'ride', 'advice', 'hey', 'buying'], ['writes', 'cs', 'article', 'university', 'cc', 'news', 'uiuc', 'cso', 'virginia', 'columbia'], ['system', 'line', 'find', 'problem', 'set', 'number', 'code', 'part', 'window', 'run'], ['government', 'law', 'gun', 'president', 'mr', 'police', 'people', 'rights', 'waco', 'armed'], ['power', 'car', 'high', 'left', 'low', 'wheel', 'switch', 'drive', 'speed', 'side'], ['year', 'game', 'team', 'play', 'games', 'win', 'players', 'baseball', 'won', 'player'], ['information', 'mail', 'list', 'info', 'price', 'send', 'ibm', 'apple', 'net', 'sale'], ['point', 'question', 'fact', 'claim', 'made', 'true', 'make', 'general

In [35]:
# Print each topic on its own line, numbered from 1, with at most 25 words.
for topic_number, topic_words in enumerate(topics, start=1):
    print(f'topic: {topic_number} {" ".join(topic_words[:25])}')

topic: 1 posting host nntp org research reply apr distribution center mit university michael newsreader gmt tin institute keywords harvard writes article
topic: 2 uk ac university cwru au reply case posting internet western cleveland host nntp institute gov science australia technology usa fax
topic: 3 opinions bike mine dod care disclaimer buy ride buying advice hey pay bmw kind fun money engine opinion riding cheap
topic: 4 writes cs article university cc news uiuc cso columbia virginia utexas rochester vms urbana dept illinois vax umd buffalo uxa
topic: 5 line system find problem set window run code number include part call based make source read order function shell current
topic: 6 government gun law president mr people police rights waco fbi killed press armed turkish soldiers national citizens house turks weapons
topic: 7 power car high left low drive speed switch wheel side light rear black front ground white cable road supply cars
topic: 8 game year team play games win players

In [36]:
# First training document in the package's BoW format: its 'tokens' entry,
# then its 'counts' entry.
for field in ('tokens', 'counts'):
    print(train_dataset[field][0])

[  63  187  194  254  371  398  450  488  507  536  643  647  671  778
  872 1047 1074 1092 1106 1166 1234 1266 1357 1388 1398 1414 1462 1501
 1530 1571 1671 1732]
[1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 3 1 1 1 1]


# Mein train_set als Eingabedaten

In [49]:
# Display the first document read back from the saved training file.
documents[0]

'guykuo carson washington guy kuo subject si clock poll final call summary final call si clock reports keywords si acceleration clock upgrade article shelley organization university washington lines nntp posting host carson washington fair number brave souls upgraded si clock oscillator shared experiences poll send message detailing experiences procedure top speed attained cpu rated speed add cards adapters heat sinks hour usage day floppy disk functionality floppies requested summarizing days add network knowledge base clock upgrade answered poll guy kuo guykuo washington'

In [52]:
" ".join(docs_tr[0])

'jackson defense nntp posting host university illinois article king writes article midway uchicago uchicago ted frank writes article harvard harvard david smith writes granted simple fact holding job improve chances job future city kid hold minimum job money made dealing drugs kid hold minimum job money made dealing drugs kids hold minimum jobs city kids give chance reason city kids legitimate work suppose correct answer family values mind assuming city family values hope'

In [53]:
# First training document in our own BoW format: token ids, then counts.
first_doc_tokens = train_set['tokens'][0]
print(first_doc_tokens)
print(train_set['counts'][0])

# Invert word2id so that token ids can be mapped back to their words.
id2word = {token_id: word for word, token_id in word2id.items()}

# Decode the document and show its words in alphabetical order.
doc = [id2word[token_id] for token_id in first_doc_tokens]
print(sorted(doc))

[ 147  151  166  261  271  420  565  582  587  627  662  754  820  823
  856  921  978 1051 1085 1192 1198 1202 1255 1336 1377 1457 1480 1680
 1694 1807 1845 1871 1929 2043 2124 2240 2246 2437 2498 2624 2633 2701
 2710 2816 2937 2940 3020 3046]
[2 1 1 1 2 1 1 1 1 3 1 1 3 1 1 1 2 1 2 4 1 2 1 3 1 1 1 1 3 1 1 1 1 1 1 2 1
 4 1 1 2 1 3 2 1 1 2 1]
['answer', 'article', 'assuming', 'chance', 'chances', 'city', 'correct', 'david', 'dealing', 'defense', 'drugs', 'fact', 'family', 'frank', 'future', 'give', 'granted', 'harvard', 'hold', 'holding', 'hope', 'host', 'illinois', 'improve', 'jackson', 'job', 'jobs', 'kid', 'kids', 'king', 'legitimate', 'made', 'midway', 'mind', 'minimum', 'money', 'nntp', 'posting', 'reason', 'simple', 'smith', 'suppose', 'ted', 'uchicago', 'university', 'values', 'work', 'writes']


In [56]:
# Load the pre-fitted embeddings for our own (sorted) vocabulary and rewrite
# them in the word2vec text format.
vocab_embedding, embedding_data = read_prefitted_embedding(model_name, my_vocab, save_path)
print(len(embedding_data[1]))
print(vocab_embedding[:10])

# One row per word, indexed by the word, so that reset_index() puts the
# token into the first column of the written file.
data = pd.DataFrame(embedding_data, index=vocab_embedding)
n_words, n_dims = data.shape
embedding_save_path = f'prepared_data/min_df_{min_df}/{model_name}_prefitted_embeddings_my_data_control_mode.txt'
np.savetxt(
    embedding_save_path,
    data.reset_index().values,
    delimiter=" ",
    header=f"{n_words} {n_dims}",  # word2vec text header: "<rows> <dimensions>"
    comments="",
    fmt=["%s"] + ["%.18e"] * n_dims,
)
word_vectors = KeyedVectors.load_word2vec_format(embedding_save_path, binary=False)

300
['aaron', 'ab', 'ability', 'absolute', 'absolutely', 'absurd', 'abuse', 'ac', 'academic', 'acc']


In [64]:
# Pick one vocabulary word and list its ten most similar words in the
# embeddings that were just re-loaded from the text file.
w = vocab_embedding[20]
print(w)
word_vectors.most_similar(w, topn=10)

accuracy


[('ensure', 0.786088228225708),
 ('errors', 0.703749418258667),
 ('analysis', 0.6551498174667358),
 ('algorithms', 0.6537426710128784),
 ('providing', 0.63914954662323),
 ('techniques', 0.6342453956604004),
 ('observations', 0.6257032155990601),
 ('detailed', 0.6231043338775635),
 ('approach', 0.6161506175994873),
 ('concerns', 0.6156328916549683)]

In [65]:
# Train an ETM instance on our own vocabulary and our own BoW training set.
etm_instance = ETM(
    my_vocab,
    # Either the path to a word2vec file or a KeyedVectors instance can be passed here.
    embeddings=embedding_save_path,
    use_c_format_w2vec=True,
    num_topics=20,
    epochs=100,
    debug_mode=True,
    lr=0.002,
    # Optional. If True, ETM learns word embeddings jointly with the topic
    # embeddings. Defaults to False and must not be True when `embeddings`
    # is passed.
    train_embeddings=False,
    seed=42,
)

etm_instance.fit(train_set)
topics = etm_instance.get_topics(20)

Reading embeddings from original word2vec TXT file...
Topics before training: [['mit', 'laboratory', 'org', 'research', 'stanford', 'columbia', 'de', 'distribution', 'reply', 'gmt'], ['bike', 'keywords', 'posting', 'car', 'rider', 'cover', 'nntp', 'advice', 'engine', 'misc'], ['israeli', 'policy', 'id', 'columbia', 'org', 'israel', 'virginia', 'united', 'bnr', 'cc'], ['rec', 'dod', 'research', 'mil', 'news', 'group', 'newsreader', 'sci', 'corp', 'gmt'], ['fax', 'phone', 'number', 'voice', 'tel', 'box', 'call', 'sweden', 'germany', 'serial'], ['ground', 'support', 'wire', 'radio', 'technology', 'applications', 'georgia', 'phone', 'international', 'fax'], ['political', 'policy', 'view', 'genocide', 'war', 'jewish', 'good', 'morality', 'armenia', 'atheists'], ['uiuc', 'cso', 'hitter', 'win', 'ball', 'deleted', 'clock', 'played', 'dare', 'mon'], ['ac', 'office', 'ms', 'windows', 'access', 'mine', 'de', 'newsreader', 'uunet', 'uk'], ['fire', 'atf', 'frank', 'waco', 'texas', 'koresh', 'burns

Epoch 29 - Learning Rate: 0.002 - KL theta: 2.1 - Rec loss: 629.33 - NELBO: 631.43
Epoch 30 - Learning Rate: 0.002 - KL theta: 2.22 - Rec loss: 635.99 - NELBO: 638.21
Topics: [['cornell', 'uwaterloo', 'info', 'cmu', 'andy', 'science', 'uwo', 'math', 'ca', 'leland'], ['ford', 'washington', 'keywords', 'cars', 'summary', 'europe', 'car', 'berkeley', 'riding', 'engines'], ['pp', 'armenia', 'turkish', 'professor', 'max', 'davidian', 'turkey', 'chi', 'troops', 'armenian'], ['advice', 'apple', 'read', 'info', 'newsgroup', 'posted', 'law', 'sci', 'monitors', 'reading'], ['assume', 'serial', 'trust', 'exist', 'exists', 'god', 'ability', 'false', 'depends', 'chip'], ['body', 'widget', 'building', 'window', 'properly', 'environment', 'destroy', 'event', 'actions', 'immediately'], ['ins', 'solntze', 'wpd', 'livesey', 'keith', 'cwru', 'mountain', 'magnus', 'hell', 'caltech'], ['sf', 'apple', 'se', 'ca', 'fi', 'summary', 'riding', 'keywords', 'berkeley', 'washington'], ['nasa', 'job', 'jobs', 'vote

Epoch 55 - Learning Rate: 0.002 - KL theta: 4.33 - Rec loss: 618.0 - NELBO: 622.33
Epoch 56 - Learning Rate: 0.002 - KL theta: 4.4 - Rec loss: 621.97 - NELBO: 626.37
Epoch 57 - Learning Rate: 0.002 - KL theta: 4.42 - Rec loss: 616.1 - NELBO: 620.52
Epoch 58 - Learning Rate: 0.002 - KL theta: 4.46 - Rec loss: 612.08 - NELBO: 616.54
Epoch 59 - Learning Rate: 0.002 - KL theta: 4.62 - Rec loss: 614.45 - NELBO: 619.07
Epoch 60 - Learning Rate: 0.002 - KL theta: 4.56 - Rec loss: 612.15 - NELBO: 616.71
Topics: [['cornell', 'uwaterloo', 'bikes', 'info', 'cmu', 'adobe', 'csd', 'centris', 'cars', 'andy'], ['ford', 'produced', 'modern', 'parts', 'nuclear', 'review', 'claims', 'model', 'volume', 'production'], ['soviet', 'angeles', 'extermination', 'professor', 'ohanus', 'pitt', 'appressian', 'geb', 'max', 'cal'], ['advice', 'real', 'group', 'logic', 'newsgroup', 'content', 'random', 'cross', 'ignore', 'read'], ['assume', 'trust', 'serial', 'infinite', 'eternal', 'exist', 'depends', 'god', 'fails'

Epoch 81 - Learning Rate: 0.002 - KL theta: 5.23 - Rec loss: 606.49 - NELBO: 611.72
Epoch 82 - Learning Rate: 0.002 - KL theta: 5.31 - Rec loss: 608.41 - NELBO: 613.72
Epoch 83 - Learning Rate: 0.002 - KL theta: 5.31 - Rec loss: 611.1 - NELBO: 616.41
Epoch 84 - Learning Rate: 0.002 - KL theta: 5.37 - Rec loss: 613.47 - NELBO: 618.84
Epoch 85 - Learning Rate: 0.002 - KL theta: 5.37 - Rec loss: 610.94 - NELBO: 616.31
Epoch 86 - Learning Rate: 0.002 - KL theta: 5.39 - Rec loss: 611.65 - NELBO: 617.04
Epoch 87 - Learning Rate: 0.002 - KL theta: 5.34 - Rec loss: 607.51 - NELBO: 612.85
Epoch 88 - Learning Rate: 0.002 - KL theta: 5.42 - Rec loss: 616.54 - NELBO: 621.96
Epoch 89 - Learning Rate: 0.002 - KL theta: 5.42 - Rec loss: 607.0 - NELBO: 612.42
Epoch 90 - Learning Rate: 0.002 - KL theta: 5.42 - Rec loss: 606.8 - NELBO: 612.22
Topics: [['cornell', 'uwaterloo', 'bikes', 'info', 'cmu', 'adobe', 'centris', 'csd', 'purdue', 'ford'], ['produced', 'modern', 'nuclear', 'review', 'ford', 'model'

In [66]:
# Compute the quality metrics of the model trained on our own data, then
# print one line per topic (numbered from 1) followed by the two metrics.
topic_coherence = etm_instance.get_topic_coherence()
topic_diversity = etm_instance.get_topic_diversity()
for topic_number, topic_words in enumerate(topics, start=1):
    print(f'topic: {topic_number} {" ".join(topic_words)}')
print(topic_coherence)
print(topic_diversity)

topic: 1 cornell uwaterloo bikes info cmu adobe centris csd purdue ford cars andy dave diamond ted sean leland science duke advice
topic: 2 produced modern nuclear review model ford japan parts reasons claims aid production john volume chemical plans national mass report final
topic: 3 truth soviet atheists angeles pitt livesey extermination geb god belief eternal rangers nj faith ohanus appressian professor hitter max caltech
topic: 4 advice logic cross random effect method newsgroup bother ignore real group thoughts content takes points reading net avoid thread law
topic: 5 assume trust eternal serial infinite au phone cramer takes ideas exist crypto faith understand false depends fails uga timothy universe
topic: 6 destroy immediately properly body purpose found lives children save acts event called structure saved enter pieces created pass die translation
topic: 7 matt portal ins mountain texas supporters keith donald magnus houston cwru solntze gov ucsd disclaimer cup austin dartm

In [None]:
# TODO: update read_prefitted_embedding
# TODO: update the test dataset
# TODO: add saving of the docs in prepared_data
# TODO: check prepared_data
# TODO: check the ETM

In [17]:
"""
    def __init__(
        self,
        vocabulary,
        embeddings=None,
        use_c_format_w2vec=False,
        model_path=None,
        batch_size=1000,
        num_topics=50,
        rho_size=300,
        emb_size=300,
        t_hidden_size=800,
        theta_act='relu',
        train_embeddings=False,
        lr=0.005,
        lr_factor=4.0,
        epochs=20,
        optimizer_type='adam',
        seed=2019,
        enc_drop=0.0,
        clip=0.0,
        nonmono=10,
        wdecay=1.2e-6,
        anneal_lr=False,
        bow_norm=True,
        num_words=10,
        log_interval=2,
        visualize_every=10,
        eval_batch_size=1000,
        eval_perplexity=False,
        debug_mode=False,
    ):
"""

"\n    def __init__(\n        self,\n        vocabulary,\n        embeddings=None,\n        use_c_format_w2vec=False,\n        model_path=None,\n        batch_size=1000,\n        num_topics=50,\n        rho_size=300,\n        emb_size=300,\n        t_hidden_size=800,\n        theta_act='relu',\n        train_embeddings=False,\n        lr=0.005,\n        lr_factor=4.0,\n        epochs=20,\n        optimizer_type='adam',\n        seed=2019,\n        enc_drop=0.0,\n        clip=0.0,\n        nonmono=10,\n        wdecay=1.2e-6,\n        anneal_lr=False,\n        bow_norm=True,\n        num_words=10,\n        log_interval=2,\n        visualize_every=10,\n        eval_batch_size=1000,\n        eval_perplexity=False,\n        debug_mode=False,\n    ):\n"