Following tutorial ["Topic Modeling for Fun and Profit"](http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html)

In [6]:
import itertools
import logging
import os
import pickle
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.stop.greek.stops import STOPS_LIST
import gensim
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import simple_preprocess
import numpy as np

In [7]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [8]:
user_dir = os.path.expanduser('~/cltk_data/user_data/lda_1kgreek/')
try:
    os.makedirs(user_dir)
except FileExistsError:
    pass

In [9]:
PREPROCESS_DEACCENT = False
STOPS_LIST = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0] for stop in STOPS_LIST if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
STOPS_LIST = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν', 'πρε', 'ἀλλ']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST]

In [10]:
# TODO: make sure that preprocessing is same for TLGCorpus class and other setups

TOK_MIN = 3  # rm words shorter than
TOK_MAX = 20  # rm words longer than
DOC_MIN = 50  # drop docs shorter than
def tokenize(text):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
    here becuase the Greek text has already been aggressively cleaned up.
    https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
    """
    tokens = [token for token in simple_preprocess(text, deacc=PREPROCESS_DEACCENT, min_len=TOK_MIN, max_len=TOK_MAX)]
    return [token for token in tokens if token not in STOPS_LIST]
    

#! todo: strip ascii
def iter_tlg(tlg_dir):
    """Stream TLG doc-by-doc."""
    file_names = os.listdir(tlg_dir)
    for file_name in file_names:
        file_path = os.path.join(tlg_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # todo: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens

In [11]:
# Take a look at the docs post-processing
# Open corpus iterator
# docs_path_rel'~/cltk_data/greek/text/tlg/plaintext/'  # for tlg
docs_path_rel = '~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/'
docs_preprocessed = os.path.expanduser(docs_path_rel)
stream = iter_tlg(docs_preprocessed)
for title, tokens in itertools.islice(iter_tlg(docs_preprocessed), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens

tlg4102.tlg006.opp-grc1.txt ['supplementum', 'varietas', 'lectionis', 'catenam', 'evangelium', 'catenae', 'novum', 'testamentum', 'university', 'leipzig']
tlg2200.tlg00518.opp-grc1.txt ['declamatio', 'libanius', 'richard', 'foerster', 'university', 'leipzig', 'european', 'social', 'fund', 'saxony']
tlg0062.tlg050.1st1K-grc1.txt ['deorum', 'concilium', 'lucianus', 'samosatenus', 'harmon', 'harvard', 'library', 'arcadia', 'fund', 'gregory']
tlg2959.tlg011.opp-ger1.txt ['genesim', 'catenis', 'methodius', 'nathanael', 'bonwetsch', 'university', 'leipzig', 'european', 'social', 'fund']
tlg2000.tlg001.opp-grc2.txt ['enneades', 'plotinus', 'richard', 'volkmann', 'google', 'digital', 'humanities', 'award', 'gregory', 'crane']
tlg0057.tlg014.1st1K-grc1.txt ['nervorum', 'dissectione', 'galen', 'karl', 'gottlob', 'kühn', 'andrew', 'mellon', 'foundation', 'published']
tlg0086.tlg042.1st1K-grc1.txt ['somno', 'vigilia', 'aristotle', 'immanuel', 'bekker', 'andrew', 'mellon', 'foundation', 'digital', 

# Mk word dictionaries

In [12]:
# Open corpus iterator
doc_stream = (tokens for _, tokens in iter_tlg(docs_preprocessed))

In [13]:
no_below = 20
no_above = 0.1

In [14]:
# store the dictionary, for future reference
dict_name = 'gensim_dict_id2word_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.dict'.format(no_below, 
                                                                                                            no_above, 
                                                                                                            TOK_MIN, 
                                                                                                            TOK_MAX, 
                                                                                                            DOC_MIN, 
                                                                                                            PREPROCESS_DEACCENT)
dict_path = os.path.join(user_dir, dict_name)

try:
    id2word_tlg = gensim.corpora.dictionary.Dictionary.load(dict_path)
except FileNotFoundError:
    t0 = time.time()
    # ~4 min on TLG corpus if rm accents; ~w min if not
    id2word_tlg = gensim.corpora.Dictionary(doc_stream)
    # this cutoff might lose too much info, we'll see
    # ignore words that appear in less than 20 documents or more than 10% documents
    id2word_tlg.filter_extremes(no_below=no_below, no_above=no_above)
    id2word_tlg.save(dict_path)
    print('Time to mk new corpus dictionary:', time.time() - t0)
print(id2word_tlg)

INFO : loading Dictionary object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict
INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(878943 unique tokens: ['supplementum', 'varietas', 'lectionis', 'catenam', 'evangelium']...) from 676 documents (total 16765179 corpus positions)
INFO : discarding 854231 tokens: [('varietas', 15), ('lectionis', 16), ('catenam', 4), ('university', 667), ('leipzig', 661), ('european', 345), ('social', 345), ('fund', 422), ('saxony', 345), ('gregory', 649)]...
INFO : keeping 24712 tokens which were in no less than 20 and no more than 67 (=10.0%) documents
INFO : resulting dictionary: Dictionary(24712 unique tokens: ['supplementum', 'evangelium', 'catenae', 'novum', 'testamentum']...)
INFO : saving Dictionary object under /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50

Time to mk new corpus dictionary: 83.08647394180298
Dictionary(24712 unique tokens: ['supplementum', 'evangelium', 'catenae', 'novum', 'testamentum']...)


# Mk vectors

Now start again with the corpus, turning the actual words into integers from our map.

In [15]:
# Illustrate what this BoW space looks like with example doc
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_tlg.doc2bow(tokenize(doc))
print(bow)  # words both in BoW dict and doc
print(id2word_tlg[bow[0][0]])  # map int back to str

[(2822, 1), (6358, 1)]
ποίησις


In [16]:
class TLGCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """Yield each document in turn, as a list of tokens (unicode strings).
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_tlg(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

In [17]:
clip_docs_at = 25 # None for final
# make the BoW corpus
# creates a stream of bag-of-words vectors
corpus_bow_tlg = TLGCorpus(docs_preprocessed, id2word_tlg, clip_docs=clip_docs_at)

# reduce corpus size for faster testing
#corpus_bow_tlg = gensim.utils.ClippedCorpus(corpus_bow_tlg, 100)

# vector = next(iter(corpus_bow_tlg))
# print(vector)  # print the first vector in the stream
# [(0, 1), (1, 1), (2, 1), ...]

# # what is the most common word in that first article?
# most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
# print(id2word_tlg[most_index], most_count)  # μιλησιοις 2

In [18]:
# Save BoW
# ~4 min on TLG corpus
bow_name = 'gensim_bow_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(no_below, 
                                                                                                no_above, 
                                                                                                TOK_MIN, 
                                                                                                TOK_MAX, 
                                                                                                DOC_MIN, 
                                                                                                PREPROCESS_DEACCENT)
bow_path = os.path.join(user_dir, bow_name)
t0 = time.time()
gensim.corpora.MmCorpus.serialize(bow_path, corpus_bow_tlg)
print('Time to save BoW space:', time.time() - t0)

# Later load saved corpus with:
# corpus_bow_tlg = gensim.corpora.MmCorpus(bow_path)

INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x17096 matrix, density=6.529% (27903/427400)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index


Time to save BoW space: 2.873065710067749


In [19]:
total_included_docs = len(corpus_bow_tlg.titles)  # used later for testing results

# LDA transformation

In [20]:
# Quick testing using just a part of the corpus

NUM_TOPICS_LIST = [2, 3, 5, 10, 25, 50, 100]
NUM_TOPICS_LIST.append(len(get_epithets()))  # mk topics same number as traditional epithets
NUM_TOPICS_LIST = sorted(NUM_TOPICS_LIST)
PASSES = 1

In [21]:
for num_topics in NUM_TOPICS_LIST:
    print('Beginning training ...')
    print('... {} topics and {} passes ...'.format(num_topics, PASSES))
    t0 = time.time()
    lda_model = gensim.models.LdaMulticore(corpus_bow_tlg, num_topics=num_topics, id2word=id2word_tlg, passes=PASSES)
    
    # save LDA vector space
    lda_space_name = 'gensim_lda_space_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(num_topics, 
                                                                                                                                        PASSES, 
                                                                                                                                        no_below, 
                                                                                                                                        no_above, 
                                                                                                                                        TOK_MIN, 
                                                                                                                                        TOK_MAX, 
                                                                                                                                        DOC_MIN, 
                                                                                                                                        PREPROCESS_DEACCENT)
    path_lda = os.path.join(user_dir, lda_space_name)
    gensim.corpora.MmCorpus.serialize(path_lda, lda_model[corpus_bow_tlg])
    
    # save model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model.save(path_lda)
    print('Time to train LDA model space:', time.time() - t0)

INFO : using symmetric alpha at 0.5
INFO : using symmetric eta at 4.046617028164455e-05
INFO : using serial LDA version on this node


Beginning training ...
... 2 topics and 1 passes ...


INFO : running online LDA training, 2 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.500): 0.003*"und" + 0.002*"das" + 0.002*"πεπερασμένον" + 0.002*"nicht" + 0.002*"ἀπείρου" + 0.002*"aristoteles" + 0.002*"κινοῦντος" + 0.001*"μεταβάλλον" + 0.001*"def" + 0.001*"ist"
INFO : topic #1 (0.500): 0.003*"πεπερασμένον" + 0.003*"def" + 0.003*"ἀπείρου" + 0.002*"κινοῦντος" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"aristoteles" + 0.002*"mrg" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον"
INFO : topic diff=1.142663, rho=1.000000
INFO : -9.531 per-word bound, 739.6 perplexity estimate based on a held-out corpus of 25 documents with 78324 words
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user

Time to train LDA model space: 8.970294952392578
Beginning training ...
... 3 topics and 1 passes ...


INFO : running online LDA training, 3 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.333): 0.005*"und" + 0.003*"def" + 0.002*"πεπερασμένον" + 0.002*"nicht" + 0.002*"das" + 0.002*"ist" + 0.002*"ἀπείρου" + 0.002*"κινοῦντος" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς"
INFO : topic #1 (0.333): 0.003*"ἀπείρου" + 0.003*"πεπερασμένον" + 0.003*"κινοῦντος" + 0.002*"mrg" + 0.002*"εὔδημος" + 0.002*"aristoteles" + 0.002*"def" + 0.002*"vulg" + 0.002*"ἠρεμεῖ" + 0.002*"κινουμένῳ"
INFO : topic #2 (0.333): 0.003*"πεπερασμένον" + 0.002*"def" + 0.002*"aristoteles" + 0.002*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.002*"κινοῦντος" + 0.002*"und" + 0.002*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"cet"
INFO : topic diff=1.451915, rho=1.0

Time to train LDA model space: 8.65969181060791
Beginning training ...
... 5 topics and 1 passes ...


INFO : running online LDA training, 5 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.200): 0.003*"und" + 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"def" + 0.002*"μεταβάλλον" + 0.002*"aristoteles" + 0.002*"mon" + 0.002*"κινοῦντος" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς"
INFO : topic #1 (0.200): 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.002*"κινοῦντος" + 0.002*"aristoteles" + 0.002*"def" + 0.002*"mrg" + 0.002*"vulg" + 0.002*"εὔδημος" + 0.001*"αὐτοκίνητον"
INFO : topic #2 (0.200): 0.005*"und" + 0.003*"das" + 0.003*"nicht" + 0.002*"def" + 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"κινοῦντος" + 0.002*"von" + 0.002*"ist" + 0.002*"abbrev"
INFO : topic #3 (0.200): 0.003*"πεπερασμ

Time to train LDA model space: 10.812721490859985
Beginning training ...
... 10 topics and 1 passes ...


INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #8 (0.100): 0.003*"und" + 0.003*"πεπερασμένον" + 0.002*"def" + 0.002*"das" + 0.002*"aristoteles" + 0.002*"κινοῦντος" + 0.002*"vulg" + 0.002*"ἀπείρου" + 0.002*"mrg" + 0.002*"μεταβάλλον"
INFO : topic #4 (0.100): 0.010*"und" + 0.006*"nicht" + 0.006*"mon" + 0.005*"das" + 0.004*"auch" + 0.004*"ist" + 0.004*"aber" + 0.003*"den" + 0.003*"von" + 0.003*"πεπερασμένον"
INFO : topic #9 (0.100): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"def" + 0.002*"κινοῦντος" + 0.002*"aristoteles" + 0.002*"μεταβάλλον" + 0.002*"mrg" + 0.002*"νόησις" + 0.001*"εὔδημος" + 0.001*"διαιρετόν"
INFO : topic #7 (0.100): 0.004*"πεπερασμένον" + 0.004*"κινοῦ

Time to train LDA model space: 13.826842069625854
Beginning training ...
... 25 topics and 1 passes ...


INFO : running online LDA training, 25 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #1 (0.040): 0.003*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.002*"def" + 0.002*"und" + 0.002*"κινοῦντος" + 0.002*"aristoteles" + 0.002*"abbrev" + 0.002*"mrg" + 0.002*"προσεχῶς" + 0.002*"νόησις"
INFO : topic #0 (0.040): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.003*"μεταβάλλον" + 0.003*"def" + 0.002*"aristoteles" + 0.002*"mrg" + 0.002*"προσεχῶς" + 0.002*"κινοῦντος" + 0.002*"ἀπείρῳ" + 0.002*"διαιρετὸν"
INFO : topic #18 (0.040): 0.003*"πεπερασμένον" + 0.002*"def" + 0.002*"aristoteles" + 0.002*"εὔδημος" + 0.002*"νόησις" + 0.002*"κινοῦντος" + 0.001*"προσεχῶς" + 0.001*"ἀπείρου" + 0.001*"ἠρεμεῖ" + 0.001*"μεταβάλλον"
INFO : topic #22 (0

Time to train LDA model space: 16.23465394973755
Beginning training ...
... 50 topics and 1 passes ...


INFO : running online LDA training, 50 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #48 (0.020): 0.003*"πεπερασμένον" + 0.003*"def" + 0.002*"κινοῦντος" + 0.002*"aristoteles" + 0.002*"προσεχῶς" + 0.002*"ἀπείρου" + 0.002*"und" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"mrg"
INFO : topic #31 (0.020): 0.005*"und" + 0.003*"das" + 0.003*"πεπερασμένον" + 0.003*"nicht" + 0.002*"def" + 0.002*"ist" + 0.002*"μεταβάλλον" + 0.002*"den" + 0.002*"εὔδημος" + 0.002*"aristoteles"
INFO : topic #23 (0.020): 0.002*"νόησις" + 0.002*"πεπερασμένον" + 0.001*"def" + 0.001*"κινοῦντος" + 0.001*"ἀπείρου" + 0.001*"mrg" + 0.001*"φλεγμονῆς" + 0.001*"εὔδημος" + 0.001*"μεταβάλλον" + 0.001*"ἐγρηγορέναι"
INFO : topic #33 (0.020): 0.003*

Time to train LDA model space: 25.56251335144043
Beginning training ...
... 55 topics and 1 passes ...


INFO : running online LDA training, 55 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #18 (0.018): 0.002*"πεπερασμένον" + 0.002*"macl" + 0.002*"νόησις" + 0.001*"μεταβάλλον" + 0.001*"mrg" + 0.001*"def" + 0.001*"ἀπείρου" + 0.001*"aristoteles" + 0.001*"φλεβοτομίας" + 0.001*"εὔδημος"
INFO : topic #15 (0.018): 0.004*"und" + 0.003*"πεπερασμένον" + 0.002*"das" + 0.002*"def" + 0.002*"φλεβοτομίας" + 0.002*"ἀπείρου" + 0.001*"aristoteles" + 0.001*"κινοῦντος" + 0.001*"von" + 0.001*"ist"
INFO : topic #48 (0.018): 0.005*"und" + 0.004*"nicht" + 0.004*"das" + 0.003*"den" + 0.003*"πεπερασμένον" + 0.002*"von" + 0.002*"aber" + 0.002*"sie" + 0.002*"ist" + 0.002*"auch"
INFO : topic #29 (0.018): 0.004*"πεπερασμένον" + 0.003*"def"

Time to train LDA model space: 24.77016019821167
Beginning training ...
... 100 topics and 1 passes ...


INFO : running online LDA training, 100 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #53 (0.010): 0.003*"πεπερασμένον" + 0.002*"und" + 0.002*"κινοῦντος" + 0.002*"das" + 0.002*"def" + 0.002*"κινουμένῳ" + 0.002*"νόησις" + 0.002*"nicht" + 0.001*"μεταβάλλον" + 0.001*"εὔδημος"
INFO : topic #23 (0.010): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.003*"κινοῦντος" + 0.003*"def" + 0.003*"μεταβάλλον" + 0.002*"aristoteles" + 0.002*"mrg" + 0.002*"προσεχῶς" + 0.002*"pseudo" + 0.002*"εὔδημος"
INFO : topic #97 (0.010): 0.004*"πεπερασμένον" + 0.003*"κινοῦντος" + 0.003*"ἀπείρου" + 0.003*"def" + 0.002*"προσεχῶς" + 0.002*"mrg" + 0.002*"aristoteles" + 0.002*"αὐτοκίνητον" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον"
INFO : topic #42 

Time to train LDA model space: 38.82135248184204


In [22]:
# # Examples of how to use the model
# lda_model.print_topics(-1)  # print a few most important words for each LDA topic
# # transform text into the bag-of-words space
# bow_vector = id2word_tlg.doc2bow(tokenize(doc))
# print([(id2word_tlg[id], count) for id, count in bow_vector])

# # transform into LDA space
# lda_vector = lda_model[bow_vector]
# print(lda_vector)

# # print the document's single most prominent LDA topic
# print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

# Evaluation

## Word intrusion

> For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)

In [23]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for word intrusion testing ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    
    # select top 50 words for each of the LDA topics
    print('Top 50 words of each LDA model:')
    top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
    print(top_words)
    print('')

    # get all top 50 words in all 20 topics, as one large set
    all_words = set(itertools.chain.from_iterable(top_words))
    print("Can you spot the misplaced word in each topic?")

    # for each topic, replace a word at a different index, to make it more interesting
    replace_index = np.random.randint(0, 10, lda_model.num_topics)

    replacements = []
    for topicno, words in enumerate(top_words):
        other_words = all_words.difference(words)
        replacement = np.random.choice(list(other_words))
        replacements.append((words[replace_index[topicno]], replacement))
        words[replace_index[topicno]] = replacement
        print("%i: %s" % (topicno, ' '.join(words[:10])))
    
    print("Actual replacements were:")
    print(list(enumerate(replacements)))
    print('')

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

Loading model: gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for word intrusion testing ...
Top 50 words of each LDA model:
[['und', 'das', 'πεπερασμένον', 'nicht', 'ἀπείρου', 'aristoteles', 'κινοῦντος', 'μεταβάλλον', 'def', 'ist', 'mrg', 'προσεχῶς', 'νόησις', 'vulg', 'cet', 'aber', 'den', 'κινουμένῳ', 'ἀπείρῳ', 'von', 'ἠρεμεῖ', 'simpl', 'στερήσεως', 'ἠρεμία', 'εὔδημος', 'vgl', 'κατηγορίαις', 'auch', 'αὐτοκίνητον', 'sich', 'sie', 'διαιρετόν', 'κινήσεται', 'ἀίδιος', 'wie', 'νόησιν', 'φλεβοτομίας', 'κινούντων', 'κινοῖτο', 'κινηθήσεται', 'διαιρετὸν', 'ἐντελεχείᾳ', 'iterat', 'φυσικοῦ', 'ἐναντίαι', 'νοητοῦ', 'wenn', 'φθορά', 'ἀμερὲς', 'πέρατι'], ['πεπερασμένον', 'def', 'ἀπείρου', 'κινοῦντος', 'εὔδημος', 'μεταβάλλον', 'aristoteles', 'mrg', 'κινουμένῳ', 'αὐτοκίνητον', 'προσεχῶς', 'und', 'vulg', 'διαιρετὸν', 'ἠρεμεῖ', 'ἀδιαίρετον', 'στερήσεως', 'ἀλλοιώσεως', 'phys', 'διαιρετόν', 'νόησις', 'ἠρεμία', 'πεπερασμένα', 'ἀπείρῳ

INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

Top 50 words of each LDA model:
[['abbrev', 'πεπερασμένον', 'def', 'εὔδημος', 'ἀπείρου', 'μεταβάλλον', 'κινοῦντος', 'mrg', 'νόησις', 'aristoteles', 'κινουμένῳ', 'προσεχῶς', 'ἀπείρῳ', 'νόησιν', 'ἠρεμεῖ', 'vulg', 'στερήσεως', 'αὐτοκίνητον', 'ἀίδιος', 'νοητῷ', 'διαιρετὸν', 'κινηθήσεται', 'φθορά', 'πέρατι', 'ἀλλοιώσεως', 'κινήσεται', 'καθόσον', 'ἀλλοιώσεις', 'εὐδαιμονεῖν', 'ἀλλοιοῦσθαι', 'ἀδιαίρετον', 'ἠρεμία', 'κατηγορίαις', 'φυσικοῦ', 'τιμαίῳ', 'simpl', 'ἀναξαγόρας', 'κινοῖτο', 'ταὐτομάτου', 'διαιρετόν', 'φλεβοτομίας', 'ἐναντίωσις', 'phys', 'πάθημα', 'iterat', 'ἠρεμίας', 'τοσόνδε', 'exc', 'ἠρεμοῦν', 'κέντρου'], ['und', 'def', 'πεπερασμένον', 'aristoteles', 'κινοῦντος', 'εὔδημος', 'νόησις', 'ἀπείρου', 'αὐτοκίνητον', 'ἠρεμεῖ', 'mrg', 'μεταβάλλον', 'στερήσεως', 'κινουμένῳ', 'προσεχῶς', 'vulg', 'das', 'ἠρεμία', 'διαιρετὸν', 'φθορά', 'nicht', 'διαιρετόν', 'κινούντων', 'ἀπείρῳ', 'ἀλλοιώσεως', 'ist', 'ἀναξαγόρας', 'κατηγορίαις', 'ἀμερὲς', 'phys', 'ἐναντίωσις', 'κινοῦντα', 'κινηθήσεται', 'ἠρεμοῦ

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

Top 50 words of each LDA model:
[['cet', 'ἀπείρου', 'πεπερασμένον', 'aristoteles', 'def', 'κινοῦντος', 'εὔδημος', 'vulg', 'μεταβάλλον', 'κινουμένῳ', 'νόησις', 'προσεχῶς', 'mrg', 'διαιρετόν', 'simpl', 'ἠρεμεῖ', 'ἀπείρῳ', 'αὐτοκίνητον', 'ἠρεμία', 'ἀρτηρίαις', 'σφυγμῶν', 'νόησιν', 'διαιρετὸν', 'στερήσεως', 'hab', 'ἀναξαγόρας', 'ἀλλοιώσεως', 'ἀδιαίρετον', 'νοητοῦ', 'κινοῖτο', 'ἀλλʼ', 'πέρατι', 'pal', 'κινούντων', 'ἀμερὲς', 'καθόσον', 'ἠρεμοῦν', 'πεπερασμένα', 'ἐπʼ', 'φλεγμονῆς', 'ἀρτηρίαι', 'κινηθήσεται', 'κατηγορίαις', 'ἀλλοιοῦσθαι', 'νοητῷ', 'ἐνεργεία', 'κινήσεται', 'φθορά', 'ἠρεμίαν', 'ἄπειρόν'], ['mon', 'var', 'πεπερασμένον', 'aristoteles', 'ἀπείρου', 'def', 'μεταβάλλον', 'κινουμένῳ', 'κινοῦντος', 'εὔδημος', 'αὐτοκίνητον', 'macl', 'ferr', 'προσεχῶς', 'ἠρεμεῖ', 'mrg', 'στερήσεως', 'κατηγορίαις', 'simpl', 'διαιρετόν', 'ἠρεμία', 'ἀδιαίρετον', 'ξένους', 'ἀπείρῳ', 'μάνθανε', 'ἀίδιος', 'πειρῶ', 'pseudo', 'σιγᾶν', 'πέρατι', 'vulg', 'καθόσον', 'κινήσεται', 'ἠρεμοῦν', 'ταὐτομάτου', 'ἀμερὲς', 'ἀ

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


Top 50 words of each LDA model:
[['πεπερασμένον', 'κινοῦντος', 'def', 'ἀπείρου', 'aristoteles', 'νόησις', 'mrg', 'εὔδημος', 'κινουμένῳ', 'αὐτοκίνητον', 'προσεχῶς', 'vulg', 'ἀπείρῳ', 'στερήσεως', 'ἠρεμεῖ', 'μεταβάλλον', 'ἀναξαγόρας', 'πεπερασμένα', 'ταὐτομάτου', 'ἠρεμία', 'ἀδιαίρετον', 'διαιρετόν', 'κινοῦντα', 'νόησιν', 'ἠρεμοῦν', 'κινοῖτο', 'ἐναντίαι', 'ἀλλοιώσεως', 'ἠρεμίας', 'καλέουσι', 'πέρατι', 'νευρώδης', 'φλὲψ', 'hippocrates', 'κέντρου', 'littré', 'émile', 'κοιλίης', 'ἀμερὲς', 'iterat', 'κινηθήσεται', 'διαιρετὸν', 'κίνησίς', 'νοητῷ', 'simpl', 'καθόσον', 'phys', 'κομήτης', 'φθορά', 'ἐπίπεδον'], ['ἀπείρου', 'πεπερασμένον', 'κινοῦντος', 'aristoteles', 'μεταβάλλον', 'προσεχῶς', 'def', 'εὔδημος', 'mrg', 'vulg', 'νόησις', 'αὐτοκίνητον', 'ἠρεμεῖ', 'κινουμένῳ', 'στερήσεως', 'simpl', 'διαιρετὸν', 'κατηγορίαις', 'und', 'ἀπείρῳ', 'φυσικοῦ', 'das', 'ὑπέκκαυμα', 'ἠρεμία', 'ἠρεμοῦν', 'ἀναξαγόρας', 'ἀδιαίρετον', 'κομήτης', 'κινηθήσεται', 'ἀλλοιώσεως', 'πεπερασμένα', 'νόησιν', 'νοητῷ', 'κινοῖτο'

INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


Top 50 words of each LDA model:
[['πεπερασμένον', 'def', 'ἀπείρου', 'κινοῦντος', 'εὔδημος', 'μεταβάλλον', 'mrg', 'ἀπείρῳ', 'αὐτοκίνητον', 'aristoteles', 'προσεχῶς', 'κινουμένῳ', 'διαιρετὸν', 'ἠρεμεῖ', 'νόησις', 'vulg', 'κατηγορίαις', 'διαιρετόν', 'ἠρεμία', 'phys', 'simpl', 'στερήσεως', 'ταὐτομάτου', 'iterat', 'ἠρεμοῦν', 'κινηθήσεται', 'ἀναξαγόρας', 'κινήσεται', 'κομήτης', 'ἠρεμίας', 'φθορά', 'κινοῖτο', 'πεπερασμένα', 'ἐναντίαι', 'ἀμερῶν', 'πέρατι', 'παρμενίδης', 'ἀλλοιώσεως', 'ἐναντίωσις', 'τοσόνδε', 'κινῆται', 'exc', 'κίνησίς', 'ἀδιαίρετον', 'φυσικοῦ', 'ἀλλοιοῦσθαι', 'ἀίδιος', 'ἐντελεχείᾳ', 'κινητὸν', 'ἄπειρόν'], ['πεπερασμένον', 'vab', 'ἀπείρου', 'κινοῦντος', 'εὔδημος', 'ξενίας', 'aristoteles', 'def', 'μεταβάλλον', 'gasda', 'δημοσθένης', 'mrg', 'αὐτοκίνητον', 'ἠρεμία', 'στερήσεως', 'διαιρετὸν', 'vulg', 'ἠρεμεῖ', 'φίλιππος', 'διαιρετόν', 'simpl', 'κινουμένῳ', 'ἀπείρῳ', 'προσεχῶς', 'νόησις', 'κατηγορίαις', 'ἀναξαγόρας', 'ἐντελεχείᾳ', 'δημοσθένους', 'ἠρεμίας', 'ἀδιαίρετον', 'κινούντων',

## Split doc

> We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar

In [24]:
# evaluate on 1k documents **not** used in LDA training
docs_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_tlg(docs_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [25]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_tlg.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_tlg.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [26]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for testing split document topic matching ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    print("LDA results:")
    # what should num_pairs be?
    intra_inter(lda_model, test_docs, num_pairs=total_included_docs)
    print('')

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

Loading model: gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

average cosine similarity between corresponding parts (higher is better):
0.939026655163
average cosine similarity between 25 random parts (lower is better):
0.799057982384

Loading model: gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

average cosine similarity between corresponding parts (higher is better):
0.794662723955
average cosine similarity between 25 random parts (lower is better):
0.55966581763

Loading model: gensim_lda_model_1kgrk_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

average cosine similarity between corresponding parts (higher is better):
0.710675697827
average cosine similarity between 25 random parts (lower is better):
0.548864555785

Loading model: gensim_lda_model_1kgrk_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

average cosine similarity between corresponding parts (higher is better):
0.610022293555
average cosine similarity between 25 random parts (lower is better):
0.422802167198

Loading model: gensim_lda_model_1kgrk_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

average cosine similarity between corresponding parts (higher is better):
0.631935925884
average cosine similarity between 25 random parts (lower is better):
0.34912143236

Loading model: gensim_lda_model_1kgrk_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_l

average cosine similarity between corresponding parts (higher is better):
0.675960562428
average cosine similarity between 25 random parts (lower is better):
0.480053669458

Loading model: gensim_lda_model_1kgrk_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gens

average cosine similarity between corresponding parts (higher is better):
0.68854714947
average cosine similarity between 25 random parts (lower is better):
0.441823644465

Loading model: gensim_lda_model_1kgrk_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:
average cosine similarity between corresponding parts (higher is better):
0.590445309956
average cosine similarity between 25 random parts (lower is better):
0.29908593498



# Score all docs

In [27]:
id_auth_map = get_id_author()

In [None]:
# write to file topics for each doc
for num_topics in NUM_TOPICS_LIST:
    print('num topics', num_topics)
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... scoring topics of all documents ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    # https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.get_document_topics
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    # mk save path name
    scores_name = lda_model_name.rstrip('.model') + '.scores'
    scores_path = os.path.join(user_dir, scores_name)
    doc_topics = ''
    print('Going to write LDA scores for each file at: "{}"'.format(scores_path))
    for file_name, tokens in iter_tlg(docs_preprocessed):
        # print(file_name, tokens[:10])  # print the article title and its first ten tokens
        # print(file_name)
        topic_distribution = str(lda_model[id2word_tlg.doc2bow(tokens)])
        # print(topic_distribution)
        
        # convert file name to author name, and get epithet
        # auth_id = file_name.lstrip('TLG').rstrip('.TXT')  # for TLG
        auth_id = file_name.rstrip('.txt')  # for 1K Greek
        auth_name = None
        auth_epithet = None
        # auth_name = id_auth_map[auth_id]  # for TLG
        # auth_epithet = str(get_epithet_of_author(auth_id))  # for TLG
        
        doc_topics += 'file: ' + file_name + '\n'
        doc_topics += 'author: ' + auth_name + '\n'
        doc_topics += 'epithet: ' + auth_epithet + '\n'
        doc_topics += topic_distribution + '\n\n'
    print('Wrote file to: "{}"'.format(scores_path))
    with open(scores_path, 'w') as file_open:
        file_open.write(doc_topics)
    print('')

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

num topics 2
Loading model: gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... scoring topics of all TLG documents ...


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute dispatcher to None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_m

Writing file to: "/home/kyle/cltk_data/user_data/lda_1kgreek/gensim_lda_model_1kgrk_numtopics2_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFals.scores"

num topics 3
Loading model: gensim_lda_model_1kgrk_numtopics3_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... scoring topics of all TLG documents ...
