Following tutorial ["Topic Modeling for Fun and Profit"](http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html)

In [1]:
import itertools
import logging
import os
import pickle
import sys
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
import gensim
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import simple_preprocess
import numpy as np

In [2]:
# put current dir in path, for importing local module
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# import local module
from lda_helpers import mk_working_dir
from lda_helpers import working_dir
from lda_helpers import tokenize
from lda_helpers import iter_docs
from lda_helpers import PREPROCESS_DEACCENT
from lda_helpers import TOK_MIN
from lda_helpers import TOK_MAX
from lda_helpers import DOC_MIN
from lda_helpers import remove_ascii
from lda_helpers import STOPS_LIST
from lda_helpers import no_below
from lda_helpers import no_above
from lda_helpers import GenerateCorpus

In [4]:
# enable verbose print-to-screen logging for Gensim
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [5]:
# where our results will go in ~/cltk_data/user_data
mk_working_dir(working_dir)

In [6]:
# Take a look at the docs post-processing
# Open corpus iterator
# docs_path_rel'~/cltk_data/greek/text/tlg/plaintext/'  # for tlg
docs_path_rel = '~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/'
docs_preprocessed = os.path.expanduser(docs_path_rel)
stream = iter_docs(docs_preprocessed, rm_ascii=remove_ascii)
for title, tokens in itertools.islice(iter_docs(docs_preprocessed, rm_ascii=remove_ascii), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens

tlg4102.tlg006.opp-grc1.txt ['ερμηνεία', 'διαφόρων', 'εἰσ', 'κατά', 'λουκᾶν', 'εὐαγγέλιον', 'ὅτι', 'μὲν', 'ἄλλοι', 'εὐαγγελισταὶ']
tlg2200.tlg00518.opp-grc1.txt ['νόμος', 'τὸν', 'ξενίας', 'ἁλόντα', 'πιπράσκεσθαι', 'ἑάλω', 'ξενίας', 'δημοσθένης', 'ἔπεμψε', 'φίλιππος']
tlg0062.tlg050.1st1K-grc1.txt ['θεων', 'εκκλησια', 'ζευς', 'μηκέτι', 'τονθορύζετε', 'θεοί', 'μηδὲ', 'κατὰ', 'γωνίας', 'συστρεφόμενοι']
tlg2959.tlg011.opp-ger1.txt ['βραβεῖον', 'δυναστείαις', 'κατὰ', 'τὸν', 'ἰώβ', 'καὶ', 'κοπρία', 'παντὸς', 'θρόνου', 'βασιλικοῦ']
tlg2000.tlg001.opp-grc2.txt ['ζωιον', 'και', 'τις', 'ανθρωπος', 'ἡδοναὶ', 'καὶ', 'λῦπαι', 'φόβοι', 'καὶ', 'θάρρη']
tlg0057.tlg014.1st1K-grc1.txt ['γαληνου', 'περι', 'νευρων', 'ανατομης', 'βιβλιον', 'ὅτι', 'μὲν', 'οὐδὲν', 'τῶν', 'τοῦ']
tlg0086.tlg042.1st1K-grc1.txt ['περι', 'υπνου', 'και', 'εγρηγορσεως', 'περὶ', 'ὕπνου', 'καὶ', 'ἐγρηγόρσεως', 'σκεπτέον', 'τίνα']
tlg0057.tlg031.1st1K-grc1.txt ['γαληνου', 'περι', 'χρειας', 'σφυγμων', 'βιβλιον', 'τίς', 'χρεία', 'τῶν', 

# Mk word dictionaries

In [7]:
# Open corpus iterator
doc_stream = (tokens for _, tokens in iter_docs(docs_preprocessed, rm_ascii=remove_ascii))

In [8]:
# store the dictionary, for future reference
dict_name = 'gensim_dict_id2word_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.dict'.format(no_below, 
                                                                                                            no_above, 
                                                                                                            TOK_MIN, 
                                                                                                            TOK_MAX, 
                                                                                                            DOC_MIN, 
                                                                                                            PREPROCESS_DEACCENT)
dict_path = os.path.join(working_dir, dict_name)

# consider doing same filtering as done in the class, then combinging count
try:
    id2word_map = gensim.corpora.dictionary.Dictionary.load(dict_path)
except FileNotFoundError:
    t0 = time.time()
    # ~4 min on TLG corpus if rm accents; ~w min if not
    id2word_map = gensim.corpora.Dictionary(doc_stream)
    # this cutoff might lose too much info, we'll see
    # ignore words that appear in less than 20 documents or more than 10% documents
    id2word_map.filter_extremes(no_below=no_below, no_above=no_above)
    id2word_map.save(dict_path)
    print('Time to mk new corpus dictionary:', time.time() - t0)
print(id2word_map)

INFO : loading Dictionary object from ~/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict
INFO : loaded ~/cltk_data/user_data/lda_1kgreek/gensim_dict_id2word_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict


Dictionary(22820 unique tokens: ['λουκᾶν', 'ἄρχονται', 'λουκᾶς', 'προοιμίοις', 'ζαχαρίου']...)


# Mk vectors

Now start again with the corpus, turning the actual words into integers from our map.

In [9]:
# Illustrate what this BoW space looks like with example doc
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_ascii))
print(bow)  # words both in BoW dict and doc
print(id2word_map[bow[0][0]])  # map int back to str

[(2469, 1), (5973, 1)]
ποίησις


In [10]:
clip_docs_at = 25 # None for final
# make the BoW corpus
# creates a stream of bag-of-words vectors
corpus_bow_tlg = GenerateCorpus(docs_preprocessed, id2word_map, clip_docs=clip_docs_at)

# reduce corpus size for faster testing
#corpus_bow_tlg = gensim.utils.ClippedCorpus(corpus_bow_tlg, 100)

# vector = next(iter(corpus_bow_tlg))
# print(vector)  # print the first vector in the stream
# [(0, 1), (1, 1), (2, 1), ...]

# # what is the most common word in that first article?
# most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
# print(id2word_map[most_index], most_count)  # μιλησιοις 2

In [11]:
# Save BoW
# ~4 min on TLG corpus
bow_name = 'gensim_bow_1kgrk_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(no_below, 
                                                                                                no_above, 
                                                                                                TOK_MIN, 
                                                                                                TOK_MAX, 
                                                                                                DOC_MIN, 
                                                                                                PREPROCESS_DEACCENT)
bow_path = os.path.join(working_dir, bow_name)
t0 = time.time()
gensim.corpora.MmCorpus.serialize(bow_path, corpus_bow_tlg)
print('Time to save BoW space:', time.time() - t0)

# Later load saved corpus with:
# corpus_bow_tlg = gensim.corpora.MmCorpus(bow_path)

INFO : storing corpus in Matrix Market format to ~/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to ~/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x22128 matrix, density=4.745% (26247/553200)
INFO : saving MmCorpus index to ~/cltk_data/user_data/lda_1kgreek/gensim_bow_1kgrk_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index


Time to save BoW space: 4.633636236190796


In [12]:
total_included_docs = len(corpus_bow_tlg.titles)  # used later for testing results

# LDA transformation

In [13]:
# Quick testing using just a part of the corpus

NUM_TOPICS_LIST = [2, 3, 5, 10, 25, 50, 100]
NUM_TOPICS_LIST.append(len(get_epithets()))  # mk topics same number as traditional epithets
NUM_TOPICS_LIST = sorted(NUM_TOPICS_LIST)
PASSES = 1

In [18]:
# open permissions to working dir
# sometimes necessary for notebook
os.chmod(working_dir, 777)

FileNotFoundError: [Errno 2] No such file or directory: '~/cltk_data/user_data/lda_1kgreek/'

In [16]:
for num_topics in NUM_TOPICS_LIST:
    print('Beginning training ...')
    print('... {} topics and {} passes ...'.format(num_topics, PASSES))
    t0 = time.time()
    lda_model = gensim.models.LdaMulticore(corpus_bow_tlg, num_topics=num_topics, id2word=id2word_map, passes=PASSES)
    
    # save LDA vector space
    lda_space_name = 'gensim_lda_space_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(num_topics, 
                                                                                                                                        PASSES, 
                                                                                                                                        no_below, 
                                                                                                                                        no_above, 
                                                                                                                                        TOK_MIN, 
                                                                                                                                        TOK_MAX, 
                                                                                                                                        DOC_MIN, 
                                                                                                                                        PREPROCESS_DEACCENT)
    path_lda = os.path.join(working_dir, lda_space_name)
    gensim.corpora.MmCorpus.serialize(path_lda, lda_model[corpus_bow_tlg])
    
    # save model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    # path_lda = os.path.join(working_dir, lda_model_name)
    path_lda = os.path.join('.', lda_model_name)
    lda_model.save(path_lda)
    print('Time to train LDA model space:', time.time() - t0)

INFO : using symmetric alpha at 0.5
INFO : using symmetric eta at 4.3821209465381244e-05
INFO : using serial LDA version on this node


Beginning training ...
... 2 topics and 1 passes ...


INFO : running online LDA training, 2 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.500): 0.004*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"αὐτοκίνητον" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"στερήσεως" + 0.002*"ἀπείρῳ" + 0.001*"κινουμένῳ" + 0.001*"διαιρετὸν"
INFO : topic #1 (0.500): 0.002*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.001*"νόησις" + 0.001*"ἠρεμεῖ" + 0.001*"ἠρεμία" + 0.001*"διαιρετόν"
INFO : topic diff=1.113922, rho=1.000000
INFO : -9.566 per-word bound, 757.7 perplexity estimate based on a held-out corpus of 25 documents with 69697 words
INFO : storing corpus in Matrix Market format to ~/clt

Time to train LDA model space: 11.044687271118164
Beginning training ...
... 3 topics and 1 passes ...


INFO : running online LDA training, 3 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.333): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.001*"κινουμένῳ" + 0.001*"αὐτοκίνητον" + 0.001*"νόησις" + 0.001*"ἀπείρῳ" + 0.001*"εὔδημος" + 0.001*"ἠρεμεῖ"
INFO : topic #1 (0.333): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"κινουμένῳ" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"αὐτοκίνητον" + 0.002*"ἠρεμία" + 0.002*"μεταβάλλον" + 0.001*"ἠρεμεῖ" + 0.001*"νόησις"
INFO : topic #2 (0.333): 0.004*"πεπερασμένον" + 0.003*"μεταβάλλον" + 0.003*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"κινουμένῳ" + 0.002*"προσεχῶς" + 0.001*"ἠρεμεῖ" + 0.001*"στερήσεως" + 0.001*"αὐτοκίνητον" + 0.001*"νόησι

Time to train LDA model space: 9.511889219284058
Beginning training ...
... 5 topics and 1 passes ...


INFO : running online LDA training, 5 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.200): 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.001*"διαιρετὸν" + 0.001*"προσεχῶς" + 0.001*"ἠρεμεῖ" + 0.001*"νόησις"
INFO : topic #1 (0.200): 0.002*"νόησις" + 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.001*"μεταβάλλον" + 0.001*"προσεχῶς" + 0.001*"νόησιν" + 0.001*"ἀλλοιώσεως" + 0.001*"ταὐτομάτου" + 0.001*"ἠρεμεῖ" + 0.001*"νοητῷ"
INFO : topic #2 (0.200): 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.002*"στερήσεως" + 0.001*"ἀπείρῳ" + 0.001*"ἐναν

Time to train LDA model space: 9.570417404174805
Beginning training ...
... 10 topics and 1 passes ...


INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #4 (0.100): 0.003*"πεπερασμένον" + 0.002*"νόησις" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"προσεχῶς" + 0.001*"ἠρεμεῖ" + 0.001*"κινουμένῳ" + 0.001*"ἠρεμία" + 0.001*"ἀπείρῳ"
INFO : topic #6 (0.100): 0.003*"πεπερασμένον" + 0.003*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.002*"εὔδημος" + 0.002*"διαιρετὸν" + 0.002*"ἀπείρῳ" + 0.001*"ἠρεμεῖ"
INFO : topic #7 (0.100): 0.003*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.001*"εὔδημος" + 0.001*"νόησις" + 0.001*"κινουμένῳ" + 0.001*"αὐτοκίνητον" + 0.001*"προσεχῶς" + 0.001*"ἠρεμεῖ" + 0.001*"ἀλλοιώσεω

Time to train LDA model space: 11.825273036956787
Beginning training ...
... 25 topics and 1 passes ...


INFO : running online LDA training, 25 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #8 (0.040): 0.003*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.002*"ἀπείρου" + 0.002*"προσεχῶς" + 0.002*"αὐτοκίνητον" + 0.002*"διαιρετὸν" + 0.001*"στερήσεως" + 0.001*"κινουμένῳ" + 0.001*"ἀπείρῳ"
INFO : topic #19 (0.040): 0.003*"πεπερασμένον" + 0.002*"κομήτης" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.002*"εὔδημος" + 0.002*"ἀπείρου" + 0.002*"ὑπέκκαυμα" + 0.001*"πλανωμένων" + 0.001*"κινουμένῳ" + 0.001*"ἀναθυμιάσεως"
INFO : topic #15 (0.040): 0.003*"νόησις" + 0.002*"ἀπείρου" + 0.002*"πεπερασμένον" + 0.001*"μεταβάλλον" + 0.001*"νόησιν" + 0.001*"ἀπείρῳ" + 0.001*"ἠρεμεῖ" + 0.001*"αὐτοκίνητον" + 0.001*"εὔδημος" + 0.

Time to train LDA model space: 16.358034372329712
Beginning training ...
... 50 topics and 1 passes ...


INFO : running online LDA training, 50 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #48 (0.020): 0.003*"πεπερασμένον" + 0.003*"ἀπείρου" + 0.002*"εὔδημος" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.002*"αὐτοκίνητον" + 0.002*"ἀπείρῳ" + 0.002*"προσεχῶς" + 0.002*"στερήσεως" + 0.001*"νόησις"
INFO : topic #28 (0.020): 0.004*"πεπερασμένον" + 0.004*"ἀπείρου" + 0.003*"εὔδημος" + 0.002*"προσεχῶς" + 0.002*"κινουμένῳ" + 0.002*"μεταβάλλον" + 0.002*"νόησις" + 0.002*"ἠρεμεῖ" + 0.002*"ἠρεμία" + 0.002*"στερήσεως"
INFO : topic #26 (0.020): 0.002*"πεπερασμένον" + 0.002*"αὐτοκίνητον" + 0.002*"ἀπείρου" + 0.001*"εὔδημος" + 0.001*"μεταβάλλον" + 0.001*"ἀπείρῳ" + 0.001*"διαιρετὸν" + 0.001*"κινουμένῳ" + 0.001*"στερήσεως" + 0.001*

Time to train LDA model space: 23.434708833694458
Beginning training ...
... 55 topics and 1 passes ...


INFO : running online LDA training, 55 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #7 (0.018): 0.003*"ἀπείρου" + 0.002*"νόησις" + 0.002*"ἀλλʼ" + 0.002*"πεπερασμένον" + 0.002*"μεταβάλλον" + 0.002*"εὔδημος" + 0.001*"κινουμένῳ" + 0.001*"προσεχῶς" + 0.001*"νοητῷ" + 0.001*"ἠρεμεῖ"
INFO : topic #32 (0.018): 0.003*"ἀπείρου" + 0.003*"πεπερασμένον" + 0.002*"εὔδημος" + 0.002*"ἀπείρῳ" + 0.002*"προσεχῶς" + 0.002*"ἠρεμεῖ" + 0.002*"μεταβάλλον" + 0.002*"κινουμένῳ" + 0.001*"ἀδιαίρετον" + 0.001*"ἠρεμία"
INFO : topic #49 (0.018): 0.004*"πεπερασμένον" + 0.003*"μεταβάλλον" + 0.002*"ἀπείρου" + 0.002*"προσεχῶς" + 0.002*"στερήσεως" + 0.002*"εὔδημος" + 0.002*"ἠρεμία" + 0.002*"κινουμένῳ" + 0.002*"ἀδιαίρετον" + 0.001*"αὐτοκίνητον"

Time to train LDA model space: 24.374109745025635
Beginning training ...
... 100 topics and 1 passes ...


INFO : running online LDA training, 100 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #2 (0.010): 0.004*"πεπερασμένον" + 0.002*"ἀπείρου" + 0.002*"μεταβάλλον" + 0.002*"προσεχῶς" + 0.001*"δημοσθένης" + 0.001*"εὔδημος" + 0.001*"ἀπείρῳ" + 0.001*"ὑπέκκαυμα" + 0.001*"κομήτης" + 0.001*"ἠρεμεῖ"
INFO : topic #34 (0.010): 0.003*"ἀλλʼ" + 0.002*"φλεβοτομίας" + 0.002*"μεταβάλλον" + 0.002*"οὐδʼ" + 0.002*"πεπερασμένον" + 0.002*"λίμνην" + 0.002*"νόησις" + 0.002*"ἐπʼ" + 0.001*"ἀπείρου" + 0.001*"κινουμένῳ"
INFO : topic #51 (0.010): 0.006*"πουλὺ" + 0.004*"τῇσι" + 0.003*"ὅκου" + 0.003*"ὁκόταν" + 0.003*"ὁκόσον" + 0.003*"ἐπὴν" + 0.003*"ὀδόντες" + 0.003*"θερμαινόμενον" + 0.003*"λιπαρὸν" + 0.003*"φύονται"
INFO : topic #20 (0.010):

Time to train LDA model space: 36.92840552330017


In [None]:
# # Examples of how to use the model
# lda_model.print_topics(-1)  # print a few most important words for each LDA topic
# # transform text into the bag-of-words space
# bow_vector = id2word_map.doc2bow(tokenize(doc, rm_ascii=remove_non_ascii))
# print([(id2word_map[id], count) for id, count in bow_vector])

# # transform into LDA space
# lda_vector = lda_model[bow_vector]
# print(lda_vector)

# # print the document's single most prominent LDA topic
# print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

# Evaluation

## Word intrusion

> For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)

In [None]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for word intrusion testing ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    
    # select top 50 words for each of the LDA topics
    print('Top 50 words of each LDA model:')
    top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
    print(top_words)
    print('')

    # get all top 50 words in all 20 topics, as one large set
    all_words = set(itertools.chain.from_iterable(top_words))
    print("Can you spot the misplaced word in each topic?")

    # for each topic, replace a word at a different index, to make it more interesting
    replace_index = np.random.randint(0, 10, lda_model.num_topics)

    replacements = []
    for topicno, words in enumerate(top_words):
        other_words = all_words.difference(words)
        replacement = np.random.choice(list(other_words))
        replacements.append((words[replace_index[topicno]], replacement))
        words[replace_index[topicno]] = replacement
        print("%i: %s" % (topicno, ' '.join(words[:10])))
    
    print("Actual replacements were:")
    print(list(enumerate(replacements)))
    print('')

## Split doc

> We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar

In [None]:
# evaluate on 1k documents **not** used in LDA training
docs_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_docs(docs_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [None]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_map.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_map.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [None]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for testing split document topic matching ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    print("LDA results:")
    # what should num_pairs be?
    intra_inter(lda_model, test_docs, num_pairs=total_included_docs)
    print('')

# Score all docs

In [None]:
id_auth_map = get_id_author()

In [None]:
# write to file topics for each doc
for num_topics in NUM_TOPICS_LIST:
    print('num topics', num_topics)
    # load model
    lda_model_name = 'gensim_lda_model_1kgrk_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... scoring topics of all documents ...')
    path_lda = os.path.join(working_dir, lda_model_name)
    # https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.get_document_topics
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    # mk save path name
    scores_name = lda_model_name.rstrip('.model') + '.scores'
    scores_path = os.path.join(working_dir, scores_name)
    doc_topics = ''
    print('Going to write LDA scores for each file at: "{}"'.format(scores_path))
    for file_name, tokens in iter_docs(docs_preprocessed):
        # print(file_name, tokens[:10])  # print the article title and its first ten tokens
        # print(file_name)
        topic_distribution = str(lda_model[id2word_map.doc2bow(tokens)])
        # print(topic_distribution)
        
        # convert file name to author name, and get epithet
        # auth_id = file_name.lstrip('TLG').rstrip('.TXT')  # for TLG
        auth_id = file_name.rstrip('.txt')  # for 1K Greek
        auth_name = None
        auth_epithet = None
        # auth_name = id_auth_map[auth_id]  # for TLG
        # auth_epithet = str(get_epithet_of_author(auth_id))  # for TLG
        
        doc_topics += 'file: ' + file_name + '\n'
        doc_topics += 'author: ' + auth_name + '\n'
        doc_topics += 'epithet: ' + auth_epithet + '\n'
        doc_topics += topic_distribution + '\n\n'
    print('Wrote file to: "{}"'.format(scores_path))
    with open(scores_path, 'w') as file_open:
        file_open.write(doc_topics)
    print('')