Following tutorial ["Topic Modeling for Fun and Profit"](http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html)

In [1]:
import itertools
import logging
import os
import pickle
import time

from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithets
from cltk.corpus.greek.tlg.parse_tlg_indices import get_epithet_of_author
from cltk.corpus.greek.tlg.parse_tlg_indices import get_id_author
from cltk.corpus.utils.formatter import cltk_normalize
from cltk.stop.greek.stops import STOPS_LIST
import gensim
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import simple_preprocess
import numpy as np

Arabic not supported. Install `pyarabic` library to tokenize Arabic.


In [2]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [3]:
user_dir = os.path.expanduser('~/cltk_data/user_data/lda_tlg/')
try:
    os.makedirs(user_dir)
except FileExistsError:
    pass

In [4]:
PREPROCESS_DEACCENT = False
STOPS_LIST = [simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)[0] for stop in STOPS_LIST if len(simple_preprocess(stop, deacc=PREPROCESS_DEACCENT)) > 0]
STOPS_LIST = ['τῆϲ', 'τοῖϲ', 'εἰϲ', 'πρὸϲ', 'τοὺϲ']
STOPS_LIST += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ", "ξυν", 'ξὺν', 'πρε']  # useful for after rm accents
STOPS_LIST = [cltk_normalize(stop) for stop in STOPS_LIST]

In [5]:
# TODO: make sure that preprocessing is same for TLGCorpus class and other setups

TOK_MIN = 3  # rm words shorter than
TOK_MAX = 20  # rm words longer than
DOC_MIN = 50  # drop docs shorter than
def tokenize(text):
    """Tokenize and rm stopwords. The Gensim `simple_preprocess` will work fine
    here becuase the Greek text has already been aggressively cleaned up.
    https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
    """
    tokens = [token for token in simple_preprocess(text, deacc=PREPROCESS_DEACCENT, min_len=TOK_MIN, max_len=TOK_MAX)]
    return [token for token in tokens if token not in STOPS_LIST]
    

def iter_tlg(tlg_dir):
    """Stream TLG doc-by-doc."""
    file_names = os.listdir(tlg_dir)
    for file_name in file_names:
        file_path = os.path.join(tlg_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read)
        tokens = [cltk_normalize(token) for token in tokens]
        # ignore very short docs
        # todo: get file length distribution to better know what is short in TLG
        if len(tokens) < DOC_MIN:
            continue
        yield file_name, tokens

In [6]:
# Take a look at the docs post-processing
# Open corpus iterator
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
stream = iter_tlg(tlg_preprocessed)
for title, tokens in itertools.islice(iter_tlg(tlg_preprocessed), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens

TLG1828.TXT ['παρά', 'αἰγυπτίων', 'γεωμετρεῖν', 'μαθόντα', 'φησὶ', 'παμφίλη', 'πρῶτον', 'καταγράψαι', 'κύκλου', 'τρίγωνον']
TLG3088.TXT ['ἑῴας', 'καὶ', 'οὗτος', 'ἀποστατεῖ', 'κατ', 'αὐτοῦ', 'καὶ', 'ὑπερισχύσας', 'βασιλείας', 'ἐπιλαμβάνεται']
TLG2005.TXT ['μαρτύριον', 'τοῦ', 'ἁγίου', 'πιονίου', 'τοῦ', 'πρεσβυτέρου', 'καὶ', 'τῶν', 'σὺν', 'αὐτῷ']
TLG0036.TXT ['αἰάζω', 'τὸν', 'ἄδωνιν', 'ἀπώλετο', 'καλὸς', 'ἄδωνις', 'ὤλετο', 'καλὸς', 'ἄδωνις', 'ἐπαιάζουσιν']
TLG0474.TXT ['ταγηνίας', 'ἤδη', 'τεθέασαι', 'χλιαροὺς', 'σίζοντας', 'ὅταν', 'αὐτοῖσιν', 'ἐπιχέῃς', 'μέλι', 'καὶ']
TLG2354.TXT ['διονύσιος', 'πρώτῳ', 'τοῦ', 'κύκλου', 'βύρσαν', 'αὐτὸν', 'ἠμφιέσθαι', 'φησὶ', 'καὶ', 'κύκλῳ']
TLG1125.TXT ['κήρυκες', 'ἀνδροτίων', 'πρώτῃ', 'ἀτθίδος', 'κέκροπος', 'γενέσθαι', 'τρεῖς', 'θυγατέρας', 'ἄγραυλον', 'ἄρσην']
TLG4293.TXT ['βούλησις', 'ἐπὶ', 'ἀγαθοῦ', 'προαίρεσις', 'ἐπὶ', 'ἀμφοτέρων', 'ἀγαθοῦ', 'καὶ', 'κακοῦ', 'εἰκὼν']


# Mk word dictionaries

In [7]:
# Open corpus iterator
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))

In [8]:
no_below = 20
no_above = 0.1

In [9]:
# store the dictionary, for future reference
dict_name = 'gensim_dict_id2word_tlg_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.dict'.format(no_below, 
                                                                                                            no_above, 
                                                                                                            TOK_MIN, 
                                                                                                            TOK_MAX, 
                                                                                                            DOC_MIN, 
                                                                                                            PREPROCESS_DEACCENT)
dict_path = os.path.join(user_dir, dict_name)

try:
    id2word_tlg = gensim.corpora.dictionary.Dictionary.load(dict_path)
except FileNotFoundError:
    t0 = time.time()
    # ~4 min on TLG corpus if rm accents; ~w min if not
    id2word_tlg = gensim.corpora.Dictionary(doc_stream)
    # this cutoff might lose too much info, we'll see
    # ignore words that appear in less than 20 documents or more than 10% documents
    id2word_tlg.filter_extremes(no_below=no_below, no_above=no_above)
    id2word_tlg.save(dict_path)
    print('Time to mk new corpus dictionary:', time.time() - t0)
print(id2word_tlg)

INFO : loading Dictionary object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_dict_id2word_tlg_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_dict_id2word_tlg_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.dict


Dictionary(88039 unique tokens: ['γεωμετρεῖν', 'μαθόντα', 'καταγράψαι', 'ὀρθογώνιον', 'θῦσαι']...)


# Mk vectors

Now start again with the corpus, turning the actual words into integers from our map.

In [10]:
# Illustrate what this BoW space looks like with example doc
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_tlg.doc2bow(tokenize(doc))
print(bow)  # words both in BoW dict and doc
print(id2word_tlg[bow[0][0]])  # map int back to str

[(42540, 1)]
ποιητικῆς


In [11]:
class TLGCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """Yield each document in turn, as a list of tokens (unicode strings).
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_tlg(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

In [12]:
clip_docs_at = 25 # None for final
# make the BoW corpus
# creates a stream of bag-of-words vectors
corpus_bow_tlg = TLGCorpus(tlg_preprocessed, id2word_tlg, clip_docs=clip_docs_at)

# reduce corpus size for faster testing
#corpus_bow_tlg = gensim.utils.ClippedCorpus(corpus_bow_tlg, 100)

# vector = next(iter(corpus_bow_tlg))
# print(vector)  # print the first vector in the stream
# [(0, 1), (1, 1), (2, 1), ...]

# # what is the most common word in that first article?
# most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
# print(id2word_tlg[most_index], most_count)  # μιλησιοις 2

In [13]:
# Save BoW
# ~4 min on TLG corpus
bow_name = 'gensim_bow_tlg_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(no_below, 
                                                                                                no_above, 
                                                                                                TOK_MIN, 
                                                                                                TOK_MAX, 
                                                                                                DOC_MIN, 
                                                                                                PREPROCESS_DEACCENT)
bow_path = os.path.join(user_dir, bow_name)
t0 = time.time()
gensim.corpora.MmCorpus.serialize(bow_path, corpus_bow_tlg)
print('Time to save BoW space:', time.time() - t0)

# Later load saved corpus with:
# corpus_bow_tlg = gensim.corpora.MmCorpus(bow_path)

INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/lda_tlg/gensim_bow_tlg_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/lda_tlg/gensim_bow_tlg_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm
INFO : PROGRESS: saving document #0
INFO : saved 25x65830 matrix, density=2.124% (34950/1645750)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/lda_tlg/gensim_bow_tlg_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.mm.index


Time to save BoW space: 8.202030658721924


In [14]:
total_included_docs = len(corpus_bow_tlg.titles)  # used later for testing results

# LDA transformation

In [22]:
# Quick testing using just a part of the corpus

#NUM_TOPICS_LIST = [2, 3, 5, 10, 25, 50, 100]
NUM_TOPICS_LIST = [5, 10, 25, 50, 100]
NUM_TOPICS_LIST.append(len(get_epithets()))  # mk topics same number as traditional epithets
NUM_TOPICS_LIST = sorted(NUM_TOPICS_LIST)
PASSES = 1

In [23]:
for num_topics in NUM_TOPICS_LIST:
    print('Beginning training ...')
    print('... {} topics and {} passes ...'.format(num_topics, PASSES))
    t0 = time.time()
    lda_model = gensim.models.LdaMulticore(corpus_bow_tlg, num_topics=num_topics, id2word=id2word_tlg, passes=PASSES)
    
    # save LDA vector space
    lda_space_name = 'gensim_lda_space_tlg_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.mm'.format(num_topics, 
                                                                                                                                        PASSES, 
                                                                                                                                        no_below, 
                                                                                                                                        no_above, 
                                                                                                                                        TOK_MIN, 
                                                                                                                                        TOK_MAX, 
                                                                                                                                        DOC_MIN, 
                                                                                                                                        PREPROCESS_DEACCENT)
    path_lda = os.path.join(user_dir, lda_space_name)
    gensim.corpora.MmCorpus.serialize(path_lda, lda_model[corpus_bow_tlg])
    
    # save model
    lda_model_name = 'gensim_lda_model_tlg_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model.save(path_lda)
    print('Time to train LDA model space:', time.time() - t0)

INFO : using symmetric alpha at 0.2
INFO : using symmetric eta at 1.1358602437556082e-05
INFO : using serial LDA version on this node


Beginning training ...
... 5 topics and 1 passes ...


INFO : running online LDA training, 5 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #0 (0.200): 0.003*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"σχολῶν" + 0.001*"ὕπατον" + 0.001*"πρεσβευτὰς" + 0.001*"βῆλον" + 0.001*"πτολεμαῖον" + 0.001*"ἀχαιοῖς" + 0.001*"βροῦτον"
INFO : topic #1 (0.200): 0.003*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"ξὺν" + 0.001*"βροῦτος" + 0.001*"οὐθὲν" + 0.001*"βῆλον" + 0.001*"σχολῶν" + 0.001*"θεμιστοκλῆς"
INFO : topic #2 (0.200): 0.003*"καῖσαρ" + 0.003*"ἵστανται" + 0.001*"σχολῶν" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"βῆλον" + 0.001*"ξὺν" + 0.001*"ὕπατον" + 0.001*"νεῦμα" + 0.001*"σῶσον"
INFO : topic #3 (0.200): 0.002*"καῖσαρ" + 0.001*

Time to train LDA model space: 24.330283403396606
Beginning training ...
... 10 topics and 1 passes ...


INFO : running online LDA training, 10 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #6 (0.100): 0.002*"καῖσαρ" + 0.001*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"σχολῶν" + 0.001*"βῆλον" + 0.001*"ὕπατον" + 0.001*"βροῦτος" + 0.001*"πρεσβευτὰς" + 0.001*"οὐθὲν" + 0.001*"ξὺν"
INFO : topic #2 (0.100): 0.002*"ἵστανται" + 0.002*"καῖσαρ" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"βῆλον" + 0.001*"σχολῶν" + 0.001*"νεῦμα" + 0.001*"πτολεμαῖον" + 0.001*"σῶσον" + 0.001*"ἰουστινιανοῦ"
INFO : topic #7 (0.100): 0.004*"ἵστανται" + 0.003*"καῖσαρ" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"σχολῶν" + 0.001*"βῆλον" + 0.001*"ξὺν" + 0.001*"σῶσον" + 0.001*"πρεσβευτῶν" + 0.001*"ὕπατος"
INFO : topic #0 (0.100): 0.003*"καῖσαρ" + 0.0

Time to train LDA model space: 32.89586019515991
Beginning training ...
... 25 topics and 1 passes ...


INFO : running online LDA training, 25 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #6 (0.040): 0.003*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"σχολῶν" + 0.001*"ἱππεῖς" + 0.001*"ξὺν" + 0.001*"πρεσβευτὰς" + 0.001*"βῆλον" + 0.001*"σῶσον" + 0.001*"ἰσραὴλ" + 0.001*"μιχαὴλ"
INFO : topic #17 (0.040): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"βροῦτος" + 0.001*"ἱππεῖς" + 0.001*"ξὺν" + 0.001*"πρεσβευτὰς" + 0.001*"ὕπατον" + 0.001*"βῆλον" + 0.001*"φρουρὰν" + 0.001*"σχολῶν"
INFO : topic #23 (0.040): 0.002*"καῖσαρ" + 0.001*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"οὐθὲν" + 0.001*"σχολῶν" + 0.001*"βῆλον" + 0.001*"βροῦτον" + 0.001*"πρεσβευτὰς" + 0.001*"ξὺν" + 0.001*"βροῦτος"
INFO : topic #20 (0.040): 0.002*"καῖσαρ" + 0.002*"ἵσταν

Time to train LDA model space: 45.17950201034546
Beginning training ...
... 50 topics and 1 passes ...


INFO : running online LDA training, 50 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #16 (0.020): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"ἐθνικὸν" + 0.001*"πρεσβευτὰς" + 0.001*"ὕπατον" + 0.001*"βῆλον" + 0.001*"ἱππεῖς" + 0.001*"σχολῶν" + 0.001*"περικλῆς" + 0.001*"ὕπατος"
INFO : topic #6 (0.020): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"σχολῶν" + 0.001*"ἱππεῖς" + 0.001*"βῆλον" + 0.001*"ξὺν" + 0.001*"βροῦτον" + 0.001*"πρεσβευτὰς" + 0.001*"ὑπολαβὼν" + 0.001*"βροῦτος"
INFO : topic #11 (0.020): 0.002*"ἵστανται" + 0.002*"καῖσαρ" + 0.001*"ἱππεῖς" + 0.001*"βῆλον" + 0.001*"ὕπατον" + 0.001*"σχολῶν" + 0.001*"κριτὴν" + 0.001*"πρεσβευτὰς" + 0.001*"ὕπατος" + 0.001*"βροῦτον"
INFO : topic #25 (0.020): 0.002*"καῖσαρ" + 

Time to train LDA model space: 76.87505435943604
Beginning training ...
... 55 topics and 1 passes ...


INFO : running online LDA training, 55 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #24 (0.018): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"πρεσβευτὰς" + 0.001*"σχολῶν" + 0.001*"ὕπατον" + 0.001*"βῆλον" + 0.001*"νεῦμα" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτῶν" + 0.001*"στρατῷ"
INFO : topic #10 (0.018): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"σχολῶν" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"ἰδεῶν" + 0.001*"βῆλον" + 0.001*"σῶσον" + 0.001*"ὕπατον" + 0.001*"βροῦτον"
INFO : topic #43 (0.018): 0.002*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"ἄδωνιν" + 0.001*"ἄδωνις" + 0.001*"βροῦτος" + 0.001*"οὐθὲν" + 0.001*"νιν" + 0.001*"σχολῶν" + 0.001*"ὕπατον"
INFO : topic #41 (0.018): 0.003*"καῖσαρ" + 0.002*"ἵ

Time to train LDA model space: 80.9849042892456
Beginning training ...
... 100 topics and 1 passes ...


INFO : running online LDA training, 100 topics, 1 passes over the supplied corpus of 25 documents, updating every 2000 documents, evaluating every ~25 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 1 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #25/25, outstanding queue size 1
INFO : topic #4 (0.010): 0.003*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"πρεσβευτὰς" + 0.001*"ἱππεῖς" + 0.001*"ξὺν" + 0.001*"βροῦτος" + 0.001*"βῆλον" + 0.001*"βροῦτον" + 0.001*"σῶσον" + 0.001*"σχολῶν"
INFO : topic #48 (0.010): 0.002*"καῖσαρ" + 0.001*"ἵστανται" + 0.001*"σχολῶν" + 0.001*"ἱππεῖς" + 0.001*"ξὺν" + 0.001*"βῆλον" + 0.001*"βροῦτον" + 0.001*"πρεσβευτὰς" + 0.001*"βροῦτος" + 0.001*"ὕπατον"
INFO : topic #20 (0.010): 0.003*"καῖσαρ" + 0.002*"ἵστανται" + 0.001*"ἱππεῖς" + 0.001*"πρεσβευτὰς" + 0.001*"σχολῶν" + 0.001*"κηροὺς" + 0.001*"βῆλον" + 0.001*"σῶσον" + 0.001*"ἰουστινιανοῦ" + 0.001*"βροῦτον"
INFO : topic #64 (0.010): 0.003*"ἵστανται" +

Time to train LDA model space: 130.2624135017395


In [24]:
# # Examples of how to use the model
# lda_model.print_topics(-1)  # print a few most important words for each LDA topic
# # transform text into the bag-of-words space
# bow_vector = id2word_tlg.doc2bow(tokenize(doc))
# print([(id2word_tlg[id], count) for id, count in bow_vector])

# # transform into LDA space
# lda_vector = lda_model[bow_vector]
# print(lda_vector)

# # print the document's single most prominent LDA topic
# print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

# Evaluation

## Word intrusion

> For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)

In [25]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_tlg_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for word intrusion testing ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)
    
    # select top 50 words for each of the LDA topics
    print('Top 50 words of each LDA model:')
    top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
    print(top_words)
    print('')

    # get all top 50 words in all 20 topics, as one large set
    all_words = set(itertools.chain.from_iterable(top_words))
    print("Can you spot the misplaced word in each topic?")

    # for each topic, replace a word at a different index, to make it more interesting
    replace_index = np.random.randint(0, 10, lda_model.num_topics)

    replacements = []
    for topicno, words in enumerate(top_words):
        other_words = all_words.difference(words)
        replacement = np.random.choice(list(other_words))
        replacements.append((words[replace_index[topicno]], replacement))
        words[replace_index[topicno]] = replacement
        print("%i: %s" % (topicno, ' '.join(words[:10])))
    
    print("Actual replacements were:")
    print(list(enumerate(replacements)))
    print('')

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasse

Loading model: gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for word intrusion testing ...
Top 50 words of each LDA model:
[['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'σχολῶν', 'ὕπατον', 'πρεσβευτὰς', 'βῆλον', 'πτολεμαῖον', 'ἀχαιοῖς', 'βροῦτον', 'φρουρὰν', 'οὐθὲν', 'πρεσβευτῶν', 'ἰδεῶν', 'ἔπραττεν', 'νεῦμα', 'ἀντωνῖνος', 'σῶσον', 'περικλῆς', 'βροῦτος', 'θεμιστοκλῆς', 'κηροὺς', 'ξὺν', 'ὕπατος', 'γαλατῶν', 'ὑπολαβὼν', 'ἰουστινιανοῦ', 'ᾤοντο', 'ἀχαιοὺς', 'ἥδιστον', 'σπαρτιατῶν', 'ἔννοιαι', 'λυκοῦργος', 'βοῆς', 'σπονδῶν', 'εὐμενῆ', 'στρατηγοῖς', 'παρακαλεῖν', 'ἅγιε', 'ἀπολιπεῖν', 'σιωπῆς', 'ὕπατοι', 'τελεῖται', 'δελφοὺς', 'ἄρατον', 'περιορᾶν', 'βασιλικοὶ', 'στρατῷ', 'μεταβολαῖς', 'κωνσταντῖνος'], ['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'πρεσβευτὰς', 'ξὺν', 'βροῦτος', 'οὐθὲν', 'βῆλον', 'σχολῶν', 'θεμιστοκλῆς', 'τελεῖται', 'σῶσον', 'πτολεμαῖον', 'βροῦτον', 'ὕπατον', 'κηροὺς', 'σαρακηνῶν', 'ὑπολαβὼν', 'ὕπατος', 'πρεσβευτῶν', 'λυκοῦργος',

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_num

Top 50 words of each LDA model:
[['καῖσαρ', 'ἱππεῖς', 'ἵστανται', 'πρεσβευτὰς', 'βροῦτος', 'σχολῶν', 'ὕπατον', 'ὕπατος', 'βροῦτον', 'ἀχαιοῖς', 'βῆλον', 'οὐθὲν', 'ξὺν', 'πτολεμαῖον', 'περικλῆς', 'ἄρατος', 'ὑπολαβὼν', 'ἔπεμψαν', 'ᾤοντο', 'θεμιστοκλῆς', 'νεῦμα', 'λυκοῦργος', 'φρουρὰν', 'σῶσον', 'στρατῷ', 'ἥδιστον', 'πρεσβευτὴν', 'εὐμενὴς', 'ἰδεῶν', 'δελφοὺς', 'ὕπνους', 'μηθὲν', 'ἐπανελθὼν', 'πρεσβευτῶν', 'ἀπῄει', 'ἔναγχος', 'κηροὺς', 'γαλατῶν', 'κῦρον', 'ἰουστινιανοῦ', 'ἀντωνῖνος', 'ἔννοιαι', 'ἄρατον', 'ἀπολιπεῖν', 'ἐπιφανῶν', 'σπαρτιατῶν', 'κωνσταντῖνος', 'ἔπραττεν', 'κατεῖχε', 'φανερὸς'], ['καῖσαρ', 'ἵστανται', 'πρεσβευτὰς', 'βῆλον', 'ἱππεῖς', 'σχολῶν', 'ὕπατον', 'ξὺν', 'οὐθὲν', 'βροῦτος', 'ἰουστινιανοῦ', 'στρατῷ', 'κηροὺς', 'νεῦμα', 'ἄρατος', 'φρουρὰν', 'θεμιστοκλῆς', 'πρεσβευτῶν', 'ὑπολαβὼν', 'ᾤοντο', 'βροῦτον', 'σῶσον', 'τελεῖται', 'ὕπατος', 'ἀπῄει', 'ἀχαιοῖς', 'φοροῦντες', 'ἐπανῆλθεν', 'κῦρον', 'σπαρτιατῶν', 'πτολεμαῖον', 'ἔπεμψαν', 'περικλῆς', 'ἰουστινιανὸς', 'περσεὺς', 'λυκοῦργος'

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model


Top 50 words of each LDA model:
[['ἵστανται', 'καῖσαρ', 'ἱππεῖς', 'πρεσβευτὰς', 'βροῦτος', 'σχολῶν', 'ὕπατον', 'φρουρὰν', 'βροῦτον', 'σῶσον', 'οὐθὲν', 'βῆλον', 'ἀχαιοὺς', 'πρεσβευτῶν', 'κηροὺς', 'ἀντωνῖνος', 'ξὺν', 'ὕπατος', 'ἔπεμψαν', 'νεῦμα', 'ἀχαιοῖς', 'ᾤοντο', 'φοροῦντες', 'ἄδωνις', 'τελεῖται', 'θεμιστοκλῆς', 'ἄδωνιν', 'πτολεμαῖον', 'αἰτωλῶν', 'στρατῷ', 'ἰσραὴλ', 'ἰουστινιανοῦ', 'κῦρον', 'ἥδιστον', 'περιορᾶν', 'κεφ', 'ἴστρον', 'ἀπολιπεῖν', 'ὑπολαβὼν', 'ἐπανελθὼν', 'σπαρτιατῶν', 'ἀπῄει', 'ἡττηθεὶς', 'ἄρατος', 'παρακαλεῖν', 'σπονδῶν', 'λυκοῦργος', 'διαφυγεῖν', 'νιν', 'σαρακηνῶν'], ['ἵστανται', 'καῖσαρ', 'ἱππεῖς', 'πρεσβευτὰς', 'βῆλον', 'ὕπατον', 'σχολῶν', 'ξὺν', 'βροῦτος', 'οὐθὲν', 'σῶσον', 'ὕπατος', 'νεῦμα', 'ἰουστινιανοῦ', 'βροῦτον', 'πρεσβευτῶν', 'κωνσταντῖνος', 'σαρακηνῶν', 'ἔπεμψαν', 'κηροὺς', 'ἀντωνῖνος', 'πτολεμαῖον', 'τελεῖται', 'ἀχαιοὺς', 'περσεὺς', 'ἐπανῆλθεν', 'φρουρὰν', 'πρεσβευτὴν', 'ᾤοντο', 'ἰουστινιανὸς', 'στρατῷ', 'ἅγιε', 'μιχαὴλ', 'αἰτωλῶν', 'ἀπῄει', 'κῦρον', 'ἀχαιοῖ

INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


Top 50 words of each LDA model:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_num

[['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'πρεσβευτὰς', 'ἰδεῶν', 'βῆλον', 'ἔννοιαι', 'σχολῶν', 'σῶσον', 'βροῦτος', 'κῶλα', 'κηροὺς', 'πτολεμαῖον', 'κωνσταντῖνος', 'οὐθὲν', 'βροῦτον', 'πρεσβευτῶν', 'ξὺν', 'περιβολῆς', 'νεῦμα', 'ἄρατος', 'ἀντωνῖνος', 'ὕπατον', 'δοκοῦντος', 'ὑπολαβὼν', 'ἀχαιοῖς', 'κῶλον', 'τεχνικὸς', 'ὕπατος', 'ἔπραττεν', 'περικλῆς', 'ἰουστινιανοῦ', 'ἐπανῆλθεν', 'ἥδιστον', 'σαρακηνῶν', 'πολιτικὸν', 'ᾔτει', 'ἀχαιοὺς', 'ἀπῄει', 'κῦρον', 'στρατῷ', 'προσκυνεῖ', 'πρεσβευτὴν', 'σπαρτιατῶν', 'αἰτωλῶν', 'περσεὺς', 'θεμιστοκλῆς', 'ᾤοντο', 'λυκοῦργος', 'ἔπεμψαν'], ['καῖσαρ', 'ἵστανται', 'πρεσβευτὰς', 'ἱππεῖς', 'σχολῶν', 'ξὺν', 'βροῦτος', 'πτολεμαῖον', 'οἴκαδ', 'ὕπατον', 'ὕπατος', 'βῆλον', 'βροῦτον', 'φρουρὰν', 'σῶσον', 'κηροὺς', 'ἀπῄει', 'ἀχαιοῖς', 'καταβαλὼν', 'νεῦμα', 'ᾤοντο', 'μιχαὴλ', 'πρεσβευτὴν', 'ἔπεμψαν', 'οὐθὲν', 'θεμιστοκλῆς', 'ἰουδαῖοι', 'ἀχαιοὺς', 'στρατῷ', 'σαρακηνῶν', 'ὑπολαβὼν', 'ὥρμησε', 'σπονδῶν', 'ναυσὶ', 'τουτονὶ', 'λυκοῦργος', 'ἀσελγῶς', 'περικλῆς', 'ἐνδελεχῶς', 'τελεῖτ

Top 50 words of each LDA model:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


[['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'σχολῶν', 'βροῦτος', 'μυσῶν', 'οὐθὲν', 'καθολικῆς', 'φῄς', 'αὐτοῖσιν', 'βῆλον', 'ἰουδαῖοι', 'φρουρὰν', 'ὕπατος', 'πρεσβευτὰς', 'σαοὺλ', 'βροῦτον', 'ὕπατον', 'θεμιστοκλῆς', 'σαμουὴλ', 'ὑπολαβὼν', 'ξὺν', 'ἰδεῶν', 'ἀχαιοῖς', 'χριστιανοὺς', 'ἀπῄει', 'ἔννοιαι', 'νεῦμα', 'χριστιανὸς', 'σπαρτιατῶν', 'ἰησοῦν', 'ἥδιστον', 'ἀναγαγεῖν', 'ἔπραττεν', 'ἀντωνῖνος', 'σῶσον', 'κηροὺς', 'ἵππαρχος', 'θῦσαι', 'στρατῷ', 'ἔπεμψαν', 'πτολεμαῖον', 'μιχαὴλ', 'κωνσταντῖνος', 'οἴδατε', 'σπονδῶν', 'ᾤοντο', 'περικλῆς', 'ἤθει', 'δελφοὺς'], ['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'πρεσβευτὰς', 'σχολῶν', 'ξὺν', 'ὕπατον', 'βῆλον', 'βροῦτον', 'βροῦτος', 'ὑπολαβὼν', 'φρουρὰν', 'ὕπατος', 'πτολεμαῖον', 'οὐθὲν', 'θεμιστοκλῆς', 'μιχαὴλ', 'ἀχαιοῖς', 'ἀπῄει', 'σῶσον', 'τελεῖται', 'σπαρτιατῶν', 'λυκοῦργος', 'πρεσβευτῶν', 'ᾤοντο', 'ἥδιστον', 'νεῦμα', 'κηροὺς', 'ἔπεμψαν', 'περιορᾶν', 'κομνηνὸς', 'ἰσραὴλ', 'περικλῆς', 'ἄρατος', 'ἰουστινιανοῦ', 'παρακαλεῖν', 'εὐμενῆ', 'ἐπιφανῶν', 'κῦρον', 'διαφυγεῖν', 'ἐ

INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


Top 50 words of each LDA model:
[['καῖσαρ', 'ἵστανται', 'πρεσβευτὰς', 'σχολῶν', 'ἱππεῖς', 'ξὺν', 'βῆλον', 'κηροὺς', 'σῶσον', 'θεᾶς', 'παῦλον', 'βροῦτος', 'ὕπατος', 'ὕπατον', 'βροῦτον', 'ἰουστινιανοῦ', 'ἀχαιοῖς', 'οὐθὲν', 'τελεῖται', 'πτολεμαῖον', 'αἰτιατικῇ', 'πρεσβευτῶν', 'αἰτωλῶν', 'ψιλοῦται', 'ἔπαρχος', 'στρατῷ', 'σπονδῶν', 'νεῦμα', 'γενικῇ', 'θεμιστοκλῆς', 'σαρακηνῶν', 'ἀχαιοὺς', 'ἀττικοὶ', 'νηῶν', 'φρουρὰν', 'ἔπεμψαν', 'ἰσραὴλ', 'γαλατῶν', 'ἀντωνῖνος', 'περικλῆς', 'σπαρτιατῶν', 'κεφ', 'κοιτῶνι', 'φοροῦντες', 'δεξιῷ', 'μιχαὴλ', 'χαλκῆς', 'γεννῶ', 'πρεσβευτὴν', 'ἔντευξιν'], ['καῖσαρ', 'ἵστανται', 'ἱππεῖς', 'πρεσβευτὰς', 'σχολῶν', 'ὕπατον', 'οὐθὲν', 'βροῦτον', 'βροῦτος', 'σῶσον', 'ὑπολαβὼν', 'ἰδεῶν', 'κῶλα', 'θεμιστοκλῆς', 'φρουρὰν', 'βῆλον', 'πτολεμαῖον', 'ἀντωνῖνος', 'σπαρτιατῶν', 'ὕπατος', 'νεῦμα', 'κηροὺς', 'ἔννοιαι', 'ᾤοντο', 'ἀχαιοῖς', 'μηθὲν', 'ἄρατος', 'ξὺν', 'τελεῖται', 'λυκοῦργος', 'στρατῷ', 'περικλῆς', 'σιωπῆς', 'κῶλον', 'πρεσβευτῶν', 'τεχνικὸς', 'περιβολῆς', 'ἰουστινιανοῦ

## Split doc

> We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar

In [26]:
# evaluate on 1k documents **not** used in LDA training
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [27]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_tlg.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_tlg.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [28]:
for num_topics in NUM_TOPICS_LIST:
    # load model
    lda_model_name = 'gensim_lda_model_tlg_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... for testing split document topic matching ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    print("LDA results:")
    # what should num_pairs be?
    intra_inter(lda_model, test_docs, num_pairs=total_included_docs)
    print('')

INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics5_numpasse

Loading model: gensim_lda_model_tlg_numtopics5_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics10_num

average cosine similarity between corresponding parts (higher is better):
0.631612277794
average cosine similarity between 25 random parts (lower is better):
0.558231977741

Loading model: gensim_lda_model_tlg_numtopics10_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics25_num

average cosine similarity between corresponding parts (higher is better):
0.628015450172
average cosine similarity between 25 random parts (lower is better):
0.572953679521

Loading model: gensim_lda_model_tlg_numtopics25_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics50_num

average cosine similarity between corresponding parts (higher is better):
0.648220892717
average cosine similarity between 25 random parts (lower is better):
0.560887151209

Loading model: gensim_lda_model_tlg_numtopics50_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics55_num

average cosine similarity between corresponding parts (higher is better):
0.630689727998
average cosine similarity between 25 random parts (lower is better):
0.464149373272

Loading model: gensim_lda_model_tlg_numtopics55_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...
LDA results:


INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading expElogbeta from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.expElogbeta.npy with mmap=None
INFO : setting ignored attribute state to None
INFO : setting ignored attribute id2word to None
INFO : setting ignored attribute dispatcher to None
INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model
INFO : loading LdaMulticore object from /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


average cosine similarity between corresponding parts (higher is better):
0.61991447499
average cosine similarity between 25 random parts (lower is better):
0.501790390945

Loading model: gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model ...
... for testing split document topic matching ...


INFO : loaded /home/kyle/cltk_data/user_data/lda_tlg/gensim_lda_model_tlg_numtopics100_numpasses1_nobelow20_noabove0.1_tokmin3_tokmax20_docmin50_deaccentFalse.model.state


LDA results:
average cosine similarity between corresponding parts (higher is better):
0.585477666014
average cosine similarity between 25 random parts (lower is better):
0.417533334337



# Score all docs

In [29]:
id_auth_map = get_id_author()

In [None]:
# write to file topics for each doc
for num_topics in NUM_TOPICS_LIST:
    print('num topics', num_topics)
    # load model
    lda_model_name = 'gensim_lda_model_tlg_numtopics{}_numpasses{}_nobelow{}_noabove{}_tokmin{}_tokmax{}_docmin{}_deaccent{}.model'.format(num_topics, 
                                                                                                                                           PASSES, 
                                                                                                                                           no_below, 
                                                                                                                                           no_above, 
                                                                                                                                           TOK_MIN, 
                                                                                                                                           TOK_MAX, 
                                                                                                                                           DOC_MIN, 
                                                                                                                                           PREPROCESS_DEACCENT)
    print('Loading model: {} ...'.format(lda_model_name))
    print('... scoring topics of all TLG documents ...')
    path_lda = os.path.join(user_dir, lda_model_name)
    # https://radimrehurek.com/gensim/models/ldamodel.html#gensim.models.ldamodel.LdaModel.get_document_topics
    lda_model = gensim.models.LdaMulticore.load(path_lda)

    # mk save path name
    scores_name = lda_model_name.rstrip('.model') + '.scores'
    scores_path = os.path.join(user_dir, scores_name)
    doc_topics = ''
    for file_name, tokens in iter_tlg(tlg_preprocessed):
        # print(file_name, tokens[:10])  # print the article title and its first ten tokens
        # print(file_name)
        topic_distribution = str(lda_model[id2word_tlg.doc2bow(tokens)])
        # print(topic_distribution)
        
        # convert file name to author name, and get epithet
        auth_id = file_name.lstrip('TLG').rstrip('.TXT')
        auth_name = id_auth_map[auth_id]
        auth_epithet = str(get_epithet_of_author(auth_id))
        
        doc_topics += 'file: ' + file_name + '\n'
        doc_topics += 'author: ' + auth_name + '\n'
        doc_topics += 'epithet: ' + auth_epithet + '\n'
        doc_topics += topic_distribution + '\n\n'
    print('Writing file to: "{}"'.format(scores_path))
    with open(scores_path, 'w') as file_open:
        file_open.write(doc_topics)
    print('')