In [58]:
import itertools
import logging
import os
import pickle

import gensim
from gensim.utils import smart_open, simple_preprocess
import numpy as np
from cltk.stop.greek.stops import STOPS_LIST
# Normalise the imported stopwords with simple_preprocess(..., deacc=True), the same
# call tokenize() applies to text, so that stopword filtering compares like with like.
# Each stop is preprocessed once; stops that produce no token at all are dropped.
STOPS_LIST = [
    stop_tokens[0]
    for stop_tokens in (simple_preprocess(stop, deacc=True) for stop in STOPS_LIST)
    if stop_tokens
]

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
# ipython sometimes messes up the logging setup; restore the INFO level.
# Go through setLevel() rather than assigning the attribute directly, so the
# logging module can keep its own internal bookkeeping in step with the change.
logging.getLogger().setLevel(logging.INFO)

# Load data

In [9]:
# Load the pre-saved bag-of-words (BoW) corpus
user_dir = os.path.expanduser('~/cltk_data/user_data/')
# exist_ok=True makes this a no-op when the directory is already present
os.makedirs(user_dir, exist_ok=True)
bow_path = os.path.join(user_dir, 'bow_lda_gensim.mm')

# Open the Matrix Market file as a gensim corpus and show its summary
mm_corpus = gensim.corpora.MmCorpus(bow_path)
print(mm_corpus)

MmCorpus(1484 documents, 84120 features, 3961081 non-zero entries)


In [10]:
# Peek at the first document of the corpus: a list of (feature id, count) pairs
print(next(iter(mm_corpus)))

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 2.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0), (31, 1.0), (32, 1.0), (33, 1.0), (34, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 1.0), (42, 1.0), (43, 1.0), (44, 1.0), (45, 1.0), (46, 1.0), (47, 1.0), (48, 2.0), (49, 1.0), (50, 1.0), (51, 1.0), (52, 1.0), (53, 1.0), (54, 1.0), (55, 1.0), (56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0), (60, 1.0), (61, 1.0), (62, 1.0), (63, 1.0), (64, 1.0), (65, 1.0), (66, 1.0), (67, 1.0), (68, 1.0), (69, 1.0)]


# Semantic transformations

http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html#Semantic-transformations

## LDA

In [15]:
# Load the dictionary object that was pickled for reuse; it provides doc2bow()
# and id -> word lookups for the cells that follow.
# NOTE(review): pickle.load can execute arbitrary code — only open files you created yourself.
with open(os.path.join(user_dir, 'tlg_bow_id2word.dict'), 'rb') as file_open:
    id2word_tlg = pickle.load(file_open)

In [16]:
# Quick testing using just a part of the corpus

# use fewer documents during training, LDA is slow
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 100)
%time lda_model = gensim.models.LdaModel(clipped_corpus,
                                         num_topics=10,
                                         id2word=id2word_tlg,
                                         passes=4)

CPU times: user 4min 16s, sys: 54.4 s, total: 5min 10s
Wall time: 4min 7s


In [18]:
# NOTE(review): -1 presumably requests every topic rather than a subset — confirm against the gensim docs
lda_model.print_topics(-1)  # print a few most important words for each LDA topic

[(0,
  '0.001*"διαφωνει" + 0.001*"διακεκαυμενη" + 0.000*"εναλλαγαι" + 0.000*"θερινη" + 0.000*"βουλομεθα" + 0.000*"οικετων" + 0.000*"καιροι" + 0.000*"λυσαωνθειηδεση" + 0.000*"ωργισμενους" + 0.000*"δεομενω"'),
 (1,
  '0.002*"νενευκασιν" + 0.001*"αδωμος" + 0.001*"μυουμενους" + 0.001*"νοτιωτερα" + 0.001*"διακειμενων" + 0.001*"αιστωτηριον" + 0.001*"προειρημενον" + 0.001*"επιταξ" + 0.001*"δηλουντος" + 0.001*"ασπετον"'),
 (2,
  '0.001*"αγνοιαν" + 0.001*"ακουσας" + 0.001*"ακρατον" + 0.001*"κινδυνευοντι" + 0.001*"λαμπτηρα" + 0.001*"εγενετο" + 0.000*"στοιχους" + 0.000*"ερωτος" + 0.000*"λαβομενη" + 0.000*"εναγισαντες"'),
 (3,
  '0.000*"ωργισμενους" + 0.000*"διαβοσκεσθαι" + 0.000*"απολυοντος" + 0.000*"ετοιμως" + 0.000*"σαρδονυχες" + 0.000*"νεα" + 0.000*"συμβαινει" + 0.000*"διαφωνει" + 0.000*"εξηρτημενων" + 0.000*"πενεσται"'),
 (4,
  '0.000*"νημερτεστατε" + 0.000*"φιλολογος" + 0.000*"απλανεσιν" + 0.000*"δηλαδη" + 0.000*"επιπλοκης" + 0.000*"θρασυλλωι" + 0.000*"διαβοσκεσθαι" + 0.000*"επωνομαζετο" + 0

## TF-IDF + LSI

In [25]:
# first train tfidf model
# this modifies the feature weights of each word
# (fitted on the full mm_corpus, not on the 100-document clipped_corpus used for LDA)
%time tfidf_model = gensim.models.TfidfModel(mm_corpus, id2word=id2word_tlg)

INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : calculating IDF weights for 1484 documents and 84119 features (3961081 matrix non-zeros)


CPU times: user 6.34 s, sys: 12 ms, total: 6.35 s
Wall time: 6.29 s


In [26]:
# then run lsi, which reduces dimensionality
# The training input is the TF-IDF-weighted corpus, so vectors given to lsi_model
# later should be passed through tfidf_model first. num_topics=200 is the number
# of LSI topics kept.
%time lsi_model = gensim.models.LsiModel(tfidf_model[mm_corpus], id2word=id2word_tlg, num_topics=200)

INFO : using serial LSI version on this node
INFO : updating model with new documents
INFO : preparing a new chunk of documents
INFO : using 100 extra samples and 2 power iterations
INFO : 1st phase: constructing (1161925, 300) action matrix
INFO : orthonormalizing (1161925, 300) action matrix
INFO : 2nd phase: running dense svd on (300, 1484) matrix
INFO : computing the final decomposition
INFO : keeping 200 factors (discarding 19.480% of energy spectrum)
INFO : processed documents up to #1484
INFO : topic #0(5.232): 0.097*"καταπελταισι" + 0.096*"νοτιωτερα" + 0.093*"ηνεχυρασμενον" + 0.091*"διακειμενων" + 0.077*"αδωμος" + 0.075*"απωλειαν" + 0.071*"βεβαιωσον" + 0.066*"ροιζησαν" + 0.064*"οικειον" + 0.058*"λυσαωνθειηδεση"
INFO : topic #1(3.674): -0.138*"καταπελταισι" + -0.135*"νοτιωτερα" + -0.130*"διακειμενων" + -0.126*"ηνεχυρασμενον" + -0.106*"απωλειαν" + -0.092*"ροιζησαν" + 0.089*"μουνον" + -0.081*"βεβαιωσον" + -0.081*"οικειον" + -0.072*"νοτιωτεραν"
INFO : topic #2(3.159): 0.174*"νεφους

CPU times: user 4min 2s, sys: 1min 4s, total: 5min 7s
Wall time: 58.3 s


In [27]:
# for the first doc of the TLG corpus, here are the LSI scores for each of the 200 topics
# (the corpus goes through tfidf_model first, matching how lsi_model was trained)
print(next(iter(lsi_model[tfidf_model[mm_corpus]])))

[(0, 0.024289037065511311), (1, 0.014077893150839215), (2, -0.019146078135534279), (3, -0.018767966018989272), (4, -0.00099884333481120103), (5, -0.0051913135174345487), (6, -0.0014277098382469583), (7, -0.018818824919966906), (8, 0.0056374952350573453), (9, -0.0023221873083177956), (10, -0.0081668241786406002), (11, -0.019996384848007791), (12, 0.014338300486609062), (13, -0.00010303379864312642), (14, 0.0021088994788161335), (15, 0.0098757069667030732), (16, 0.0090732013832107734), (17, 0.0081570602885538118), (18, -0.0098302970910077361), (19, 0.01134303047809858), (20, -0.0042868477183203814), (21, -0.0019021128579354079), (22, 0.0083599609888375184), (23, -0.0024645892529614947), (24, -0.0089570567476525975), (25, 0.013241931401700954), (26, -0.0079806310637484165), (27, 0.0037053228065251193), (28, 0.002579960226988357), (29, 0.016675254728906144), (30, 0.001139717059664551), (31, 0.017415051948121849), (32, 0.02155588300179399), (33, -0.014399643194274744), (34, 0.00864178216383

## Save corpora

In [30]:
# cache the transformed corpora to disk, for use in later notebooks
# NOTE: lda_model was trained on the 100-document clipped_corpus only, but it is
# applied here to every document in mm_corpus.
# NOTE(review): the three path_* names are reassigned to '.model' paths in the
# "Save models" cell, so they stop pointing at these '.mm' files after that cell runs.
path_lda = os.path.join(user_dir, 'gensim_tlg_lda.mm')
path_tfidf = os.path.join(user_dir, 'gensim_tlg_tfidf.mm')
path_lsi= os.path.join(user_dir, 'gensim_tlg_lsa.mm')
%time gensim.corpora.MmCorpus.serialize(path_lda, lda_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize(path_tfidf, tfidf_model[mm_corpus])
%time gensim.corpora.MmCorpus.serialize(path_lsi, lsi_model[tfidf_model[mm_corpus]])

INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1484x10 matrix, density=56.617% (8402/14840)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm.index
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/gensim_tlg_tfidf.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/gensim_tlg_tfidf.mm
INFO : PROGRESS: saving document #0


CPU times: user 24.2 s, sys: 1min 11s, total: 1min 35s
Wall time: 12.6 s


INFO : PROGRESS: saving document #1000
INFO : saved 1484x84120 matrix, density=3.173% (3961081/124834080)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/gensim_tlg_tfidf.mm.index
INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/gensim_tlg_lsa.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/gensim_tlg_lsa.mm


CPU times: user 14 s, sys: 364 ms, total: 14.4 s
Wall time: 14 s


INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1484x200 matrix, density=100.000% (296800/296800)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/gensim_tlg_lsa.mm.index


CPU times: user 9.81 s, sys: 112 ms, total: 9.92 s
Wall time: 9.9 s


## Transform "unseen" documents

What is the best way to get LDA scores on a corpus used for training a model?

In [42]:
# LDA

def tokenize(text):
    """Split ``text`` into normalised tokens with stopwords removed.

    Args:
        text: Raw string to tokenize.

    Returns:
        list of str: The tokens produced by ``simple_preprocess(text, deacc=True)``
        (https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess),
        in their original order, minus every token present in ``STOPS_LIST``.
    """
    # Build the lookup set once per call; testing membership against the
    # STOPS_LIST list itself would rescan the list for every single token.
    stops = frozenset(STOPS_LIST)
    return [token for token in simple_preprocess(text, deacc=True) if token not in stops]

doc = "ἐπειδὴ πᾶσαν πόλιν ὁρῶμεν κοινωνίαν τινὰ οὖσαν καὶ πᾶσαν κοινωνίαν ἀγαθοῦ τινος ἕνεκεν συνεστηκυῖαν （τοῦ γὰρ εἶναι δοκοῦντος ἀγαθοῦ χάριν πάντα πράττουσι πάντες）, δῆλον ὡς πᾶσαι μὲν ἀγαθοῦ τινος στοχάζονται, μάλιστα δὲ [5] καὶ τοῦ κυριωτάτου πάντων ἡ πασῶν κυριωτάτη καὶ πάσας περιέχουσα τὰς ἄλλας. αὕτη δ᾽ ἐστὶν ἡ καλουμένη πόλις καὶ ἡ κοινωνία ἡ πολιτική. ὅσοι μὲν οὖν οἴονται πολιτικὸν καὶ βασιλικὸν καὶ οἰκονομικὸν καὶ δεσποτικὸν εἶναι τὸν αὐτὸν οὐ καλῶς λέγουσιν （πλήθει γὰρ καὶ ὀλιγότητι νομίζουσι [10] διαφέρειν ἀλλ᾽ οὐκ εἴδει τούτων ἕκαστον, οἷον ἂν μὲν ὀλίγων, δεσπότην, ἂν δὲ πλειόνων, οἰκονόμον, ἂν δ᾽ ἔτι πλειόνων, πολιτικὸν ἢ βασιλικόν, ὡς οὐδὲν διαφέρουσαν μεγάλην οἰκίαν ἢ μικρὰν πόλιν: καὶ πολιτικὸν δὲ καὶ βασιλικόν, ὅταν μὲν αὐτὸς ἐφεστήκῃ, βασιλικόν, ὅταν [15] δὲ κατὰ τοὺς λόγους τῆς ἐπιστήμης τῆς τοιαύτης κατὰ μέρος ἄρχων καὶ ἀρχόμενος, πολιτικόν: ταῦτα δ᾽ οὐκ ἔστιν ἀληθῆ）: δῆλον δ᾽ ἔσται τὸ λεγόμενον ἐπισκοποῦσι κατὰ τὴν ὑφηγημένην μέθοδον. ὥσπερ γὰρ ἐν τοῖς ἄλλοις τὸ σύνθετον μέχρι τῶν ἀσυνθέτων ἀνάγκη διαιρεῖν （ταῦτα γὰρ ἐλάχιστα [20] μόρια τοῦ παντός）, οὕτω καὶ πόλιν ἐξ ὧν σύγκειται σκοποῦντες ὀψόμεθα καὶ περὶ τούτων μᾶλλον, τί τε διαφέρουσιν ἀλλήλων καὶ εἴ τι τεχνικὸν ἐνδέχεται λαβεῖν περὶ ἕκαστον τῶν ῥηθέντων."
# NOTE(review): this pre-pass runs simple_preprocess without deacc=True and re-joins the
# tokens into one string; tokenize() later runs simple_preprocess again with deacc=True.
doc = ' '.join(simple_preprocess(doc))

In [43]:
# map the sample document into bag-of-words space,
# then show every matched word next to its count
bow_vector = id2word_tlg.doc2bow(tokenize(doc))
print([(id2word_tlg[token_id], count) for token_id, count in bow_vector])

[('αυτη', 1), ('τινα', 1), ('τοις', 1), ('ουσαν', 1), ('τινος', 2), ('ουδεν', 1), ('τας', 1), ('πολιν', 3), ('παντες', 1), ('παντα', 1), ('αλλας', 1), ('ειναι', 2), ('πασαν', 2), ('παντων', 1), ('λεγουσιν', 1), ('αυτον', 1), ('ων', 1), ('μερος', 1), ('ουτω', 1), ('οιον', 1), ('εστιν', 2), ('δηλον', 2), ('λαβειν', 1), ('ταυτα', 2), ('διαφερουσαν', 1), ('λογους', 1), ('ωσπερ', 1), ('πολις', 1), ('τουτων', 2), ('ρηθεντων', 1), ('οταν', 2), ('μαλλον', 1), ('λεγομενον', 1), ('επειδη', 1), ('αληθη', 1), ('πραττουσι', 1), ('χαριν', 1), ('οικιαν', 1), ('μαλιστα', 1), ('αλλοις', 1), ('ενεκεν', 1), ('καλουμενη', 1), ('μεγαλην', 1), ('εκαστον', 2), ('αρχων', 1), ('αλληλων', 1), ('μεχρι', 1), ('πληθει', 1), ('οιονται', 1), ('παντος', 1), ('συνθετον', 1), ('αρχομενος', 1), ('κοινωνιαν', 2), ('τοιαυτης', 1), ('καλως', 1), ('πλειονων', 2), ('νομιζουσι', 1), ('αναγκη', 1), ('πασας', 1), ('περιεχουσα', 1), ('δεσποτην', 1), ('πασαι', 1), ('οσοι', 1), ('μεθοδον', 1), ('ειδει', 1), ('επιστημης', 1), ('πασ

In [45]:
# transform into LDA space
# the result is a list of (topic id, weight) pairs; topics missing from the list
# were presumably below the model's reporting cutoff — TODO confirm
lda_vector = lda_model[bow_vector]
print(lda_vector)

[(2, 0.62834060088914145), (4, 0.075956794886348669), (5, 0.07265613621925443), (7, 0.036576776832934622), (9, 0.18233695136950517)]
0.001*"αγνοιαν" + 0.001*"ακουσας" + 0.001*"ακρατον" + 0.001*"κινδυνευοντι" + 0.001*"λαμπτηρα" + 0.001*"εγενετο" + 0.000*"στοιχους" + 0.000*"ερωτος" + 0.000*"λαβομενη" + 0.000*"εναγισαντες"


In [46]:
# print the document's single most prominent LDA topic
# max(..., key=item[1]) picks the (topic id, weight) pair with the largest weight; [0] is its topic id
print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

0.001*"αγνοιαν" + 0.001*"ακουσας" + 0.001*"ακρατον" + 0.001*"κινδυνευοντι" + 0.001*"λαμπτηρα" + 0.001*"εγενετο" + 0.000*"στοιχους" + 0.000*"ερωτος" + 0.000*"λαβομενη" + 0.000*"εναγισαντες"


In [47]:
# transform into LSI space
# TF-IDF weighting is applied first because lsi_model was trained on tfidf_model[mm_corpus]
lsi_vector = lsi_model[tfidf_model[bow_vector]]
print(lsi_vector)

[(0, 0.024566044502988167), (1, 0.015083771441513003), (2, -0.0053303162617581688), (3, -0.0049469294214897746), (4, -0.004304577822397548), (5, -0.0019232322736995517), (6, 0.0012844223351456006), (7, -0.014361304541287518), (8, 0.0029628230116745817), (9, -0.001991805827179696), (10, -0.0038961163189403162), (11, -0.0017520317514794631), (12, -0.003289379651920948), (13, 0.013984478154812633), (14, -0.0042467976096142246), (15, -0.001134755589540278), (16, -3.7748879427212047e-05), (17, 0.0026908339487799688), (18, 0.0008648525754869072), (19, 0.0041513971222094598), (20, -0.0089440015914239723), (21, -0.0039957917322863508), (22, -0.0039182934453001663), (23, -0.0016527724004994603), (24, -0.00052822506831432989), (25, 0.0080844104724639085), (26, -0.0062247602598571213), (27, -0.0044917328067377452), (28, -0.0037489807849336193), (29, 0.00082110267265825731), (30, -0.00062982415302895579), (31, 0.0012224945919592724), (32, 0.00034178180065024514), (33, -0.0035051594171728109), (34,

In [48]:
# print the document's single most prominent LSI topic (not interpretable like LDA!)
# abs() is needed because LSI scores are signed: the strongest topic may have a negative score
print(lsi_model.print_topic(max(lsi_vector, key=lambda item: abs(item[1]))[0]))

0.140*"περιαυχενιον" + -0.082*"λεγοιεν" + 0.072*"απονοηθεντων" + -0.065*"κριθαι" + -0.064*"κλαπεντας" + 0.063*"ζωνας" + -0.062*"περιπαθης" + -0.061*"αμμωνος" + 0.061*"λυσικλει" + 0.061*"λαμβανοντα"


## Save models

In [49]:
# NOTE: path_lda / path_tfidf / path_lsi are reused here; until this cell runs they
# hold the '.mm' corpus paths, afterwards they hold the '.model' paths.
path_lda = os.path.join(user_dir, 'gensim_tlg_lda.model')
path_tfidf = os.path.join(user_dir, 'gensim_tlg_tfidf.model')
path_lsi= os.path.join(user_dir, 'gensim_tlg_lsa.model')

# store all trained models to disk
lda_model.save(path_lda)
lsi_model.save(path_lsi)
tfidf_model.save(path_tfidf)

INFO : saving LdaState object under /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.state, separately None
INFO : storing np array 'sstats' to /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.state.sstats.npy
INFO : saved /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.state
INFO : saving LdaModel object under /home/kyle/cltk_data/user_data/gensim_tlg_lda.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.expElogbeta.npy
INFO : not storing attribute dispatcher
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : saved /home/kyle/cltk_data/user_data/gensim_tlg_lda.model
INFO : saving Projection object under /home/kyle/cltk_data/user_data/gensim_tlg_lsa.model.projection, separately None
INFO : storing np array 'u' to /home/kyle/cltk_data/user_data/gensim_tlg_lsa.model.projection.u.npy
INFO : saved /home/kyle/cltk_data/user_data/gensim_tlg_lsa.model.projection
INFO : 

# Evaluation

Read the insightful intro at http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html#Evaluation

# Word intrusion

> For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)

In [54]:
# select the top 50 words for each LDA topic (one list per topic, lda_model.num_topics lists in total)
top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
print(top_words)

[['διαφωνει', 'διακεκαυμενη', 'εναλλαγαι', 'θερινη', 'βουλομεθα', 'οικετων', 'καιροι', 'λυσαωνθειηδεση', 'ωργισμενους', 'δεομενω', 'απολαβουσα', 'αφοβοι', 'διεσφαλται', 'αντεστραμμενην', 'πεντακοσιων', 'ευτελων', 'αποτυφλωσιν', 'συμμαρτυρει', 'νεκρωσιν', 'αγχονη', 'δυνατωτερου', 'αγορητης', 'διεγγυηθεισα', 'εκουσιως', 'θα', 'μελικερτης', 'εγγραφηναι', 'απολλω', 'πολλοστον', 'συγκριτικον', 'οριζοντα', 'υπειπον', 'ελκτικη', 'ψαλτριων', 'αξιοπιστω', 'διαβολην', 'επανελαβεν', 'πολυφημην', 'εξισασουσι', 'εριζει', 'πυθαγορειων', 'μεμνησθε', 'κεγχρεων', 'δηλουντος', 'ολιγως', 'βασεσιν', 'αυριον', 'μηθενος', 'αρκτου', 'μανθανοντας'], ['νενευκασιν', 'αδωμος', 'μυουμενους', 'νοτιωτερα', 'διακειμενων', 'αιστωτηριον', 'προειρημενον', 'επιταξ', 'δηλουντος', 'ασπετον', 'ζωδιακος', 'αντεστραμμενην', 'νοτιωτεραν', 'απωλειαν', 'ειρημενας', 'ενθεντι', 'ανετελλεν', 'αλλοφυλω', 'συνουσιαν', 'μεταλλα', 'αμυριος', 'αφορωσης', 'κωπηλατας', 'τριακοντ', 'ευφοριαν', 'προσδοκωση', 'πηγη', 'ιστορει', 'εδιδαξαμεν'

In [59]:
# pool of candidate intruders: the top 50 words of every topic, as one large set
all_words = set(itertools.chain.from_iterable(top_words))

print("Can you spot the misplaced word in each topic?")

# for each topic, replace a word at a different index, to make it more interesting
replace_index = np.random.randint(0, 10, lda_model.num_topics)

replacements = []
for topicno, words in enumerate(top_words):
    # an intruder must not already belong to this topic's own top words
    other_words = all_words.difference(words)
    # sort the candidates so the pick does not depend on set iteration order
    replacement = np.random.choice(sorted(other_words))
    # Work on a copy of the first ten words: writing into `words` would alter
    # top_words itself, so re-running this cell would start from already
    # tampered lists and record intruders as the "original" words.
    shown_words = list(words[:10])
    replacements.append((shown_words[replace_index[topicno]], replacement))
    shown_words[replace_index[topicno]] = replacement
    print("%i: %s" % (topicno, ' '.join(shown_words)))

Can you spot the misplaced word in each topic?
0: προκομισαντας διακεκαυμενη εναλλαγαι θερινη βουλομεθα οικετων καιροι λυσαωνθειηδεση ωργισμενους δεομενω
1: νενευκασιν αδωμος μυουμενους νοτιωτερα διακειμενων διακεκαυμενη προειρημενον επιταξ δηλουντος ασπετον
2: αγνοιαν ακουσας ακρατον νωχος λαμπτηρα εγενετο στοιχους ερωτος λαβομενη εναγισαντες
3: ωργισμενους διαβοσκεσθαι απολυοντος ετοιμως σαρδονυχες παθεσιν συμβαινει διαφωνει εξηρτημενων πενεσται
4: νημερτεστατε φιλολογος απλανεσιν δηλαδη επιπλοκης θρασυλλωι διαβοσκεσθαι μανθανοντας κρειτ φερωνυμον
5: ρυπος περιτρεπομενα καταπελταισι διαφωνει σικελιωτας διακειμενων αποκτενουσι αποπλυσεως καθιεσαν εχομεν
6: διαβοσκεσθαι δηλαδη παρακαθηται τυχης σχηματιζομενου εμφαινουσιν ανισχη ελληνικη κεκληκας υποκριτη
7: κριθαι συντεμνε καθαυανει αναστησας καθισταντο εκκλητευεσθαι βαδιζουσιν πολυευκτον ταφον αρασκη
8: σαμαιου λακεδαιμονα ωρμημενους εμπεδοκλεους ακρατον αναπεπεικως διηπορησθαι καθηστο βοηθησειαν βαινειν
9: διαφωνει οικετων θερινη δια

In [60]:
# each entry is (topic number, (original word, intruder that replaced it))
print("Actual replacements were:")
print(list(enumerate(replacements)))

Actual replacements were:
[(0, ('διαφωνει', 'προκομισαντας')), (1, ('αιστωτηριον', 'διακεκαυμενη')), (2, ('κινδυνευοντι', 'νωχος')), (3, ('νεα', 'παθεσιν')), (4, ('επωνομαζετο', 'μανθανοντας')), (5, ('νοτιωτερα', 'διαφωνει')), (6, ('πετευρον', 'ελληνικη')), (7, ('κραυαλλιδαι', 'αναστησας')), (8, ('αγομενης', 'ακρατον')), (9, ('καιροι', 'πυλωροι'))]


## Split doc

> We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar

In [68]:
# this function first defined in pt. 1
def iter_tlg(tlg_dir):
    """Yield ``(file_name, tokens)`` for every sufficiently long file in ``tlg_dir``.

    Args:
        tlg_dir: Directory whose entries are each read as a text file.

    Yields:
        tuple: The file name and the list returned by ``tokenize`` for that
        file's contents. Files that produce fewer than 50 tokens are skipped.
    """
    # Entries are visited in os.listdir() order. That order is kept as-is because
    # callers slice this stream by position.
    for file_name in os.listdir(tlg_dir):
        file_path = os.path.join(tlg_dir, file_name)
        # Decode explicitly instead of relying on the platform's default encoding
        # (assumes the plaintext files are UTF-8 — TODO confirm for your corpus).
        with open(file_path, encoding='utf-8') as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read)
        # ignore short docs
        if len(tokens) < 50:
            continue
        yield file_name, tokens

# evaluate on 100 documents (positions 100-199 of the stream), meant to be documents **not** used in LDA training
# NOTE(review): that only holds if iter_tlg yields documents in the same order as mm_corpus,
# from which the 100 training documents of clipped_corpus were taken — confirm
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [93]:
def intra_inter(model, test_docs, num_pairs=10000):
    """Print within-document versus cross-document topic similarity.

    Every token list in ``test_docs`` is cut at its midpoint; each half is turned
    into a bag-of-words vector with ``id2word_tlg`` and mapped through ``model``.
    Two averages of ``gensim.matutils.cossim`` are printed: one over the two
    halves of the same document, one over ``num_pairs`` randomly drawn
    (first half, second half) index pairs. Both indices of a pair are drawn
    independently, so a pair may point at the same document twice.

    Args:
        model: Object that can be indexed with a bag-of-words vector.
        test_docs: Sequence of token lists (its length is taken).
        num_pairs: Number of random index pairs to draw.

    Returns:
        None. The two averages are printed.
    """
    def to_topics(token_chunks):
        # bag-of-words first, then through the model
        return [model[id2word_tlg.doc2bow(chunk)] for chunk in token_chunks]

    # Every first half is transformed before any second half.
    first_halves = to_topics(tokens[: len(tokens) // 2] for tokens in test_docs)
    second_halves = to_topics(tokens[len(tokens) // 2 :] for tokens in test_docs)

    # halves of the same document
    print("average cosine similarity between corresponding parts (higher is better):")
    same_doc_scores = []
    for head, tail in zip(first_halves, second_halves):
        same_doc_scores.append(gensim.matutils.cossim(head, tail))
    print(np.mean(same_doc_scores))

    # halves of randomly paired documents
    pair_indices = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))
    cross_doc_scores = []
    for head_idx, tail_idx in pair_indices:
        cross_doc_scores.append(gensim.matutils.cossim(first_halves[head_idx], second_halves[tail_idx]))
    print(np.mean(cross_doc_scores))

In [94]:
# intra_inter hands the model plain bag-of-words vectors, which is also what lda_model was trained on
print("LDA results:")
intra_inter(lda_model, test_docs)

LDA results:
average cosine similarity between corresponding parts (higher is better):
0.979140673252
average cosine similarity between 10000 random parts (lower is better):
0.967707618176


In [95]:
# NOTE(review): intra_inter hands the model plain bag-of-words vectors, whereas lsi_model
# was trained on tfidf_model[mm_corpus]; these scores may therefore not reflect how
# the model is meant to be used — confirm
print("LSI results:")
intra_inter(lsi_model, test_docs)

LSI results:
average cosine similarity between corresponding parts (higher is better):
0.759159191756
average cosine similarity between 10000 random parts (lower is better):
0.556656437511


# How to use on corpus

Just convert the list of tokens to a BoW vector, then pass it to the model: `print(lda_model[id2word_tlg.doc2bow(tokens)])`

In [110]:
# Print the LDA topic mixture of every file that iter_tlg yields (files under 50 tokens are skipped there)
for title, tokens in iter_tlg(tlg_preprocessed):
    #print(title, tokens[:10])  # print the article title and its first ten tokens
    print(title)
    # list of (topic id, weight) pairs for this file
    print(lda_model[id2word_tlg.doc2bow(tokens)])
    print('')


TLG2346.TXT
[(2, 0.77020063255871529), (4, 0.050162662423303819), (7, 0.17759554457537929)]

TLG1389.TXT
[(1, 0.077333330171680217), (2, 0.81018572424497037), (4, 0.029298653619360696), (7, 0.021474921514088502), (9, 0.050474519337949199)]

TLG0404.TXT
[(1, 0.02048927219191567), (2, 0.94907945485028755), (4, 0.029755487376673027)]

TLG0235.TXT
[(2, 0.9770261992926299), (4, 0.015441547453474994)]

TLG0535.TXT
[(1, 0.034813206783092389), (2, 0.71772563652669896), (3, 0.014290175511093661), (4, 0.090466392452987279), (5, 0.0652139733662397), (7, 0.018721921830686009), (9, 0.055341464745294021)]

TLG0507.TXT
[(1, 0.10259625305824292), (2, 0.41373140862637803), (4, 0.14912418250002712), (5, 0.12201859250883705), (7, 0.16176346421143439), (9, 0.050147784476549384)]

TLG1816.TXT
[(1, 0.024000415375272359), (2, 0.86327207953334895), (4, 0.028950050907302946), (5, 0.02272453752338216), (7, 0.035543720007879025), (9, 0.013945883813084248)]

TLG0476.TXT
[(1, 0.038183750715045969), (2, 0.927640218

[(1, 0.069888217070998571), (2, 0.64448915701583642), (4, 0.089020211952983888), (5, 0.042890057823670724), (6, 0.011129856079415947), (7, 0.011270471246184608), (9, 0.12631930064772245)]

TLG1981.TXT
[(1, 0.045691126629257443), (2, 0.58258777952129892), (4, 0.14980354368691123), (5, 0.053443544905473178), (6, 0.013665414301995341), (7, 0.011206202183390277), (9, 0.14122732778841451)]

TLG0536.TXT
[(1, 0.06322154078436043), (2, 0.63305787536916736), (4, 0.091796696406017286), (5, 0.041752574137819749), (6, 0.010607335927409351), (7, 0.022514360540822131), (9, 0.12916746112920538)]

TLG2892.TXT
[(1, 0.072246425111774648), (2, 0.60421214549573854), (4, 0.099253071342821977), (5, 0.042318682870410296), (6, 0.01113069321787143), (7, 0.013314380492660937), (9, 0.14904826039828561)]

TLG0708.TXT
[(1, 0.065907188974519595), (2, 0.59699961161037252), (4, 0.068045913347469303), (5, 0.062391041627117928), (7, 0.014126885586319825), (9, 0.17621434088462185)]

TLG0560.TXT
[(1, 0.07556054987674983)

TLG2053.TXT
[(1, 0.075322130943838703), (2, 0.57416739529996885), (4, 0.079105519699793861), (5, 0.12076588969837167), (7, 0.018327324544550362), (9, 0.11942905780024501)]

TLG2724.TXT
[(1, 0.066612060759416974), (2, 0.64550208869589509), (4, 0.09093727284058635), (5, 0.029332802477220511), (6, 0.012021685633571071), (7, 0.011136100141568907), (9, 0.1412276041289014)]

TLG2409.TXT
[(1, 0.091872512730068079), (2, 0.56785348476337216), (4, 0.1299139998037778), (5, 0.061409036023643081), (7, 0.013516054805162887), (9, 0.12319559413957686)]

TLG3047.TXT
[(1, 0.079553014250171167), (2, 0.59392037446965384), (4, 0.10982638598638292), (5, 0.043900471948448018), (7, 0.016207462016938727), (9, 0.13993572191847267)]

TLG2016.TXT
[(1, 0.088889005475262003), (2, 0.59856603324982582), (4, 0.080842742480600749), (5, 0.05782779229088654), (6, 0.014043343433780258), (7, 0.014009185171581534), (9, 0.13855264820115626)]

TLG0492.TXT
[(0, 0.21817828478628989), (1, 0.098637978497196327), (2, 0.38927102897

TLG0385.TXT
[(0, 0.011783909500877545), (1, 0.094537420973154526), (2, 0.56578916988255934), (4, 0.13664390544132948), (5, 0.053483113175003746), (6, 0.011900993497750027), (7, 0.014868179279539264), (9, 0.10272253131762678)]

TLG2233.TXT
[(1, 0.087460841809573406), (2, 0.6304066905054706), (4, 0.081859773592593596), (5, 0.038095833920206261), (9, 0.14426869280952115)]

TLG0022.TXT
[(0, 0.020878742329852568), (1, 0.066927606483027452), (2, 0.44280879235894194), (4, 0.1526767665840684), (5, 0.076975935032917844), (6, 0.043812223072308303), (7, 0.053093038987204653), (9, 0.1367715911057765)]

TLG4153.TXT
[(1, 0.091842361799974459), (2, 0.55949424311748308), (4, 0.11929343462492914), (5, 0.060113225152617661), (6, 0.013414500585467412), (7, 0.019228331060384955), (9, 0.12051721789336259)]

TLG0488.TXT
[(2, 0.8201098688626911), (4, 0.042496904249366911), (5, 0.082420878494971719), (9, 0.049363633289034745)]

TLG2031.TXT
[(1, 0.071003031247748219), (2, 0.60713013203693689), (4, 0.0874124457

TLG2702.TXT
[(1, 0.074124290873043813), (2, 0.59312994396837859), (4, 0.099812944286737287), (5, 0.043377626477459449), (6, 0.012754476217933941), (7, 0.01429182964013773), (9, 0.1527847171922124)]

TLG1139.TXT
[(1, 0.073226133939135449), (2, 0.59336044004681276), (4, 0.095858356184563967), (5, 0.05776580405459366), (6, 0.017698095929933357), (7, 0.025448469677961091), (9, 0.12442170994044105)]

TLG0517.TXT
[(2, 0.68490652663200779), (4, 0.054463269137564288), (7, 0.051727295894395349), (9, 0.19818633379606684)]

TLG3069.TXT
[(1, 0.11039553073632791), (2, 0.49625162715738441), (4, 0.15268845009546997), (5, 0.072184647618962927), (6, 0.01343123501523541), (7, 0.022443669517204775), (9, 0.11356906575345808)]

TLG0554.TXT
[(1, 0.08484572710824527), (2, 0.57500356252226126), (4, 0.11294186043169888), (5, 0.069448748660001933), (6, 0.010244220563705822), (7, 0.013487442720421888), (9, 0.12130340076716016)]

TLG3128.TXT
[(0, 0.010029053551449389), (1, 0.066802861521995077), (2, 0.54284556620

TLG4024.TXT
[(1, 0.087159543418776492), (2, 0.52705914936193532), (4, 0.14900149669730428), (5, 0.065637857038279124), (6, 0.016543556751597865), (7, 0.014409471056134247), (9, 0.12875387331712501)]

TLG0327.TXT
[(1, 0.044566448411104465), (2, 0.64668727617775446), (4, 0.13299477840392401), (5, 0.074653008702251181), (8, 0.011440991907323625), (9, 0.086917182201838389)]

TLG0609.TXT
[(1, 0.059590003456916896), (2, 0.60403174323407494), (4, 0.06459967071816812), (5, 0.024579799655638321), (9, 0.22121012005837076)]

TLG2029.TXT
[(1, 0.06199034610521513), (2, 0.56075485481357079), (4, 0.062310419228080442), (5, 0.11466201489932226), (9, 0.1911241230866266)]

TLG4138.TXT
[(1, 0.085321340718352401), (2, 0.58219457762628712), (4, 0.11368640152248784), (5, 0.049987441932337506), (9, 0.14044776913229087)]

TLG4027.TXT
[(1, 0.047991745728778928), (2, 0.65757580595659126), (4, 0.073505329161283064), (5, 0.018827627174530268), (7, 0.022401496486165528), (9, 0.1676865622726933)]

TLG0580.TXT
[(1, 

TLG0718.TXT
[(0, 0.012088533024942767), (1, 0.090236766274352245), (2, 0.4621107156460672), (4, 0.15933093815472568), (5, 0.051600708412672558), (6, 0.018923224601594728), (7, 0.085293713783068437), (9, 0.11488694852410158)]

TLG0752.TXT
[(1, 0.084042704561501508), (2, 0.55317132668201574), (4, 0.13490030262495351), (5, 0.058952306609900446), (7, 0.013547075492958882), (9, 0.13782428617987452)]

TLG0276.TXT
[(2, 0.71357494737590454), (5, 0.082617702531398551), (9, 0.19619650491836668)]

TLG9018.TXT
[(1, 0.074132460696395716), (2, 0.62268921866048121), (4, 0.089952966751028804), (5, 0.050484583506734855), (6, 0.015272064847591063), (7, 0.011566529988661475), (9, 0.12923384483224287)]

TLG1275.TXT
[(2, 0.72839140440601824), (4, 0.029788825294566178), (5, 0.074624535461745148), (6, 0.035400093180171247), (9, 0.1277289754189449)]

TLG1308.TXT
[(1, 0.056604773667240316), (2, 0.63692590931472171), (4, 0.11396165019788992), (5, 0.074443023339470657), (7, 0.010932193912753164), (9, 0.105941698

[(1, 0.087991065942080268), (2, 0.60785544819294579), (4, 0.1152163312279256), (5, 0.052202737750834845), (6, 0.011416671235217144), (7, 0.010539785502362503), (9, 0.10586078241648228)]

TLG0293.TXT
[(1, 0.061730394046453474), (2, 0.5773850188880999), (4, 0.11598585025995174), (5, 0.041295791394977355), (7, 0.040528185511340999), (9, 0.15993172916682979)]

TLG1602.TXT
[(1, 0.06352149466693538), (2, 0.59754215307446445), (4, 0.078420789419493583), (5, 0.054880580550264436), (9, 0.17556332254537085)]

TLG1524.TXT
[(1, 0.088392928977959057), (2, 0.56062338967391623), (4, 0.11005082834685459), (5, 0.067349832107226157), (7, 0.021485536057381006), (9, 0.15108202778592791)]

TLG1553.TXT
[(1, 0.096428884347784322), (2, 0.60922302573032949), (4, 0.090587476441938625), (5, 0.041462395144755818), (7, 0.021114881620234605), (9, 0.12685243577440569)]

TLG1804.TXT
[(1, 0.063652584313013558), (2, 0.55696390262385576), (4, 0.10584856243523855), (5, 0.077648194143423749), (6, 0.020859086352183302), (7

TLG3020.TXT
[(1, 0.094147063023858973), (2, 0.54754858319633704), (4, 0.15092487111844011), (5, 0.062235792736091783), (6, 0.010879813088351151), (7, 0.017814183334578106), (9, 0.10597652591912081)]

TLG2631.TXT
[(0, 0.027670289613462833), (2, 0.63171361414801652), (4, 0.10679719125669855), (5, 0.11811593470168338), (6, 0.030158876011984186), (9, 0.080664866867688742)]

TLG1591.TXT
[(1, 0.088514620548272499), (2, 0.66066370910169292), (4, 0.098579454094778737), (5, 0.044343261001273143), (9, 0.0920026757085977)]

TLG3158.TXT
[(1, 0.075158045554883174), (2, 0.5879648869753844), (4, 0.097401075278275656), (5, 0.060468354235857329), (6, 0.014306088157045269), (9, 0.14691290419859221)]

TLG0044.TXT
[(1, 0.066042894668739249), (2, 0.50980800680993033), (4, 0.11301195301791755), (5, 0.2500928389996992), (9, 0.053792911641337531)]

TLG1216.TXT
[(1, 0.068860521419040116), (2, 0.61897510709635917), (4, 0.10211443941967856), (5, 0.047212405069623944), (6, 0.021381485246103008), (9, 0.12610059116

TLG3014.TXT
[(1, 0.058917286174026015), (2, 0.59829457645449013), (4, 0.0847394829474412), (5, 0.084776700527778123), (7, 0.018948430866195572), (9, 0.14414418937129392)]

TLG1230.TXT
[(2, 0.6243097298182928), (4, 0.11551220664332042), (7, 0.049414003835667707), (9, 0.20701324520834741)]

TLG2934.TXT
[(1, 0.070210197603245858), (2, 0.62231242689213062), (4, 0.093094666284804894), (5, 0.041698117369685345), (6, 0.010291495761120931), (7, 0.010281052139466635), (9, 0.14439286670849613)]

TLG4291.TXT
[(0, 0.021433913289981253), (1, 0.1109211596813839), (2, 0.44645385314672709), (4, 0.17604332975780848), (5, 0.032217880712383795), (6, 0.013158357515326615), (7, 0.014960995672103882), (9, 0.1748161042876078)]

TLG2218.TXT
[(1, 0.075060118048149146), (2, 0.54521091109215936), (4, 0.11626775378689175), (5, 0.059216464700161564), (7, 0.030725135360692307), (9, 0.17267196632761742)]

TLG3115.TXT
[(1, 0.068600238637797489), (2, 0.65395145100655649), (4, 0.065585666168628584), (5, 0.0532998364506

TLG2200.TXT
[(1, 0.084582924200197362), (2, 0.60348312948568095), (4, 0.1168749943858621), (5, 0.050266852700469669), (7, 0.011441813062648534), (9, 0.11597806884873374)]

TLG0605.TXT
[(0, 0.021286603575783929), (1, 0.058340747975476066), (2, 0.65597286418135858), (4, 0.04500963249260817), (5, 0.016897145744552922), (6, 0.015500991058272597), (9, 0.17759834183745027)]

TLG2289.TXT
[(1, 0.091823397078308527), (2, 0.50843597014730346), (4, 0.11614528422653705), (9, 0.27227144146629162)]

TLG3168.TXT
[(0, 0.010272241980831889), (1, 0.074018424004524602), (2, 0.59363894359875513), (4, 0.089044731927792345), (5, 0.055737294086628905), (6, 0.010359391135054278), (7, 0.032503404019954871), (9, 0.1292636219314518)]

TLG1105.TXT
[(1, 0.097690620022044677), (2, 0.51441564638047887), (4, 0.11250947018794158), (5, 0.12081062028939751), (7, 0.015896860102462083), (9, 0.12931286240977655)]

TLG0736.TXT
[(1, 0.063560033278813766), (2, 0.56238760339694505), (4, 0.11810525692121994), (5, 0.055933297485

TLG4149.TXT
[(0, 0.031716896387725881), (1, 0.061302656752810991), (2, 0.60075554788486574), (4, 0.064953772431267612), (5, 0.024537581151926689), (6, 0.010214287168225169), (7, 0.013007461530928676), (9, 0.19085157051854479)]

TLG5022.TXT
[(1, 0.052496105021844153), (2, 0.60721560005660768), (4, 0.033188150864124952), (5, 0.052274308500994807), (6, 0.014012676055197097), (9, 0.22587494259292229)]

TLG0002.TXT
[(1, 0.065292037008709794), (2, 0.61077107779593787), (4, 0.096393731356821305), (5, 0.046330549836571283), (7, 0.023057171167978865), (9, 0.14572029318332608)]

TLG1167.TXT
[(0, 0.011383222522274722), (1, 0.078247608685345235), (2, 0.6870419336328828), (4, 0.12900837994732611), (7, 0.029191074263715297), (9, 0.056910864617936714)]

TLG2412.TXT
[(0, 0.18298436796431866), (1, 0.12020693724925918), (2, 0.57007525282067883), (5, 0.082003863083803447), (8, 0.034923547635686468)]

TLG1128.TXT
[(1, 0.082919820488552343), (2, 0.62172630911429738), (4, 0.053170609610307958), (5, 0.031649

TLG4028.TXT
[(1, 0.049758280934543965), (2, 0.59546341318762164), (4, 0.093743959200195376), (5, 0.1097646182536324), (6, 0.010255494558855455), (7, 0.01628964729134981), (9, 0.11829331348763013)]

TLG0438.TXT
[(1, 0.083488186449003382), (2, 0.64016236172453067), (4, 0.052920291440590107), (5, 0.034564400734148806), (7, 0.022918889635170463), (9, 0.16539556392934829)]

TLG2160.TXT
[(1, 0.17900619603188322), (2, 0.55316578735542865), (4, 0.077082088697897039), (5, 0.058803998663447217), (9, 0.12784306512082197)]

TLG2946.TXT
[(1, 0.10497837979649599), (2, 0.50511611725078953), (4, 0.16513423588922488), (5, 0.079481780610345693), (6, 0.010283895929327104), (7, 0.022747385906353758), (9, 0.10315251953881641)]

TLG1702.TXT
[(1, 0.062103858491509401), (2, 0.61452218479231291), (4, 0.11345479191177961), (5, 0.041992540141349573), (7, 0.017870086901035104), (9, 0.13767465097940559)]

TLG0338.TXT
[(1, 0.1244784707118067), (2, 0.60334389542383182), (4, 0.10772795423775654), (5, 0.10122654217372

TLG2043.TXT
[(1, 0.070622422166903578), (2, 0.54957915065882723), (4, 0.10541032090351668), (5, 0.10192559556453812), (6, 0.017140135220715277), (7, 0.023000158353317781), (9, 0.12665213553023205)]

TLG2322.TXT
[(1, 0.12152408871685759), (2, 0.52835360317843139), (4, 0.086945498417413489), (5, 0.11798316256564009), (7, 0.079987033878560751), (9, 0.059323341654297861)]

TLG1992.TXT
[(1, 0.12376711637684168), (2, 0.57245271977720713), (4, 0.074902816130350283), (5, 0.090894392978029662), (7, 0.033680450096952003), (8, 0.014631790389385129), (9, 0.087448134284149256)]

TLG5037.TXT
[(1, 0.074223516230030789), (2, 0.6009328320694749), (4, 0.10218730220163647), (5, 0.05112036443772143), (6, 0.010209270281957925), (7, 0.014558245293517494), (9, 0.14036261737051797)]

TLG0079.TXT
[(0, 0.042596461710152515), (1, 0.061754620279163348), (2, 0.68459423503761618), (4, 0.10497004463965499), (9, 0.10226708903657925)]

TLG2030.TXT
[(1, 0.049048723923492389), (2, 0.39050554876700239), (4, 0.26529772755

TLG5029.TXT
[(0, 0.010154897046525899), (1, 0.063961511813910812), (2, 0.61319843760278969), (4, 0.083401782867752433), (5, 0.04520742699058547), (6, 0.012203917147427686), (7, 0.015364886479305104), (9, 0.15415797832828645)]

TLG0011.TXT
[(1, 0.082027127563419483), (2, 0.56288836461257929), (4, 0.12230949532245738), (5, 0.050735311365446886), (7, 0.014857886158690931), (9, 0.15022811441450065)]

TLG5009.TXT
[(1, 0.066494641708476951), (2, 0.67260967513083436), (4, 0.085488817669719025), (5, 0.034062437127510913), (7, 0.014235258740234555), (9, 0.10978988801924035)]

TLG1224.TXT
[(1, 0.099937496761936528), (2, 0.62478579502215459), (4, 0.10711971888519951), (5, 0.042122771416096008), (6, 0.012108293299360941), (9, 0.098791044416019713)]

TLG1917.TXT
[(2, 0.5804340902337618), (4, 0.18252453195880147), (5, 0.051229281511589474), (6, 0.021300559756333506), (7, 0.012940623533174319), (9, 0.15012136022491276)]

TLG0644.TXT
[(1, 0.081612847192987517), (2, 0.59451752948355385), (4, 0.10191292

TLG4089.TXT
[(1, 0.081261813916922124), (2, 0.59495717729942688), (4, 0.11572331102187523), (5, 0.050678508226769128), (7, 0.010777940631280269), (9, 0.12963952030824974)]

TLG0690.TXT
[(1, 0.06121774289156439), (2, 0.64861983115671629), (4, 0.09196274703940395), (5, 0.055139475776170656), (7, 0.019719812143922863), (9, 0.10587111264445662)]

TLG1627.TXT
[(1, 0.10507273137495465), (2, 0.60204420064776187), (4, 0.15114951548878688), (5, 0.046581060057070696), (6, 0.041049654464996636), (9, 0.05100159306395733)]

TLG2020.TXT
[(0, 0.021800116313322605), (1, 0.06193449033521977), (2, 0.57207305644879647), (4, 0.067950975501318289), (5, 0.028058053306655638), (6, 0.018790563299343324), (7, 0.01475324303659996), (9, 0.20997762028894409)]

TLG2417.TXT
[(1, 0.045983123100400869), (2, 0.69783473629105219), (4, 0.14911973178114446), (5, 0.014456304604684738), (7, 0.02450851905905985), (9, 0.065198347582191132)]

TLG4081.TXT
[(1, 0.071581041956865307), (2, 0.557897157848534), (4, 0.12907812831938

TLG1158.TXT
[(1, 0.069393306759658921), (2, 0.6381266722815212), (4, 0.079185578172893997), (5, 0.023108487384620194), (7, 0.036581804087764014), (9, 0.14461604757331184)]

TLG1125.TXT
[(1, 0.070299508065626695), (2, 0.66641190358853575), (4, 0.076788716942588597), (5, 0.017219269541536887), (7, 0.017716633488108736), (9, 0.14376217884936449)]

TLG2189.TXT
[(0, 0.02629313868643899), (1, 0.079367592347603377), (2, 0.59913128401627702), (4, 0.14657577591113821), (5, 0.035129865186887092), (7, 0.018012076774062278), (9, 0.094068204525613514)]

TLG1969.TXT
[(1, 0.081605361753670849), (2, 0.61478717319810283), (4, 0.10432930149360017), (5, 0.029682896650507256), (7, 0.013462409725351463), (9, 0.13629561821744618)]

TLG0569.TXT
[(1, 0.085902788286888548), (2, 0.62878488308428659), (4, 0.092355797294482192), (5, 0.046561994205897628), (7, 0.032501055323735747), (9, 0.095931140080355096)]

TLG2945.TXT
[(1, 0.082222317701248548), (2, 0.62957410999573127), (4, 0.10202676641039173), (5, 0.0371046

TLG0656.TXT
[(0, 0.024478628646008135), (1, 0.083432733605879286), (2, 0.45624802287613603), (4, 0.17800262824754393), (5, 0.065985199821249549), (6, 0.026294150023466367), (7, 0.039356095200512725), (9, 0.12147375306599453)]

TLG0440.TXT
[(1, 0.095822760055846723), (2, 0.57610702667641611), (4, 0.14217802934959806), (7, 0.11686805971850411), (9, 0.063341188874831339)]

TLG1397.TXT
[(1, 0.097880625294724827), (2, 0.51438649392340818), (4, 0.14262541976438184), (5, 0.091065182534145667), (7, 0.02118575048595555), (9, 0.1131453829201082)]

TLG0539.TXT
[(1, 0.058811900524500332), (2, 0.63945057483659151), (4, 0.097048917292239217), (5, 0.055169769471353082), (7, 0.023332414246648229), (9, 0.11022224409489886)]

TLG0660.TXT
[(1, 0.0877579808341752), (2, 0.57613062384043634), (4, 0.10589234258494487), (5, 0.045676280809934147), (7, 0.032460126022760862), (9, 0.1350699285822079)]

TLG1121.TXT
[(1, 0.12477331939802171), (2, 0.47316313208301447), (6, 0.10207799703830572), (7, 0.043448169041310

TLG0658.TXT
[(1, 0.10630415898516844), (2, 0.53417811081097144), (4, 0.13052382443170241), (5, 0.059804511398862344), (6, 0.016576868976298752), (7, 0.013355915999564451), (9, 0.12894221322962302)]

TLG0334.TXT
[(1, 0.040338085342692391), (2, 0.69481305014642358), (4, 0.083285080220061564), (7, 0.017020963652882831), (9, 0.16040967478078086)]

TLG1475.TXT
[(2, 0.5859777336020523), (4, 0.07617020012638713), (5, 0.088762961612210409), (7, 0.05545694001930019), (9, 0.19218248919590383)]

TLG1687.TXT
[(1, 0.068351514507567782), (2, 0.64071757043865318), (4, 0.11328513939814698), (5, 0.022103766501376555), (6, 0.018079745710927971), (7, 0.012521108890620958), (9, 0.12171796027966782)]

TLG2202.TXT
[(2, 0.56947596749931717), (4, 0.16286539459512508), (5, 0.1633161769534798), (9, 0.097197759095725686)]

TLG1488.TXT
[(1, 0.047191623115833331), (2, 0.70713107059494196), (4, 0.039942853903384798), (5, 0.031421876633694792), (7, 0.023231865327030688), (9, 0.14625740932729206)]

TLG2571.TXT
[(1, 0

TLG0067.TXT
[(1, 0.085183505029267748), (2, 0.57473716848417111), (4, 0.12315781601880539), (5, 0.057887735198168387), (7, 0.015834179736531055), (9, 0.13065353281342862)]

TLG0541.TXT
[(1, 0.073853572490249347), (2, 0.63718551303456705), (4, 0.090423602691035998), (5, 0.047105490777617359), (7, 0.018237773173208925), (9, 0.11760542791795801)]

TLG2948.TXT
[(1, 0.070348927811744066), (2, 0.64901465142490466), (4, 0.11077477115956436), (5, 0.042843570967894568), (7, 0.01050226251391021), (9, 0.10078502879600575)]

TLG2742.TXT
[(1, 0.073639661386727354), (2, 0.64502778626036361), (4, 0.11058222326190366), (5, 0.035087825783973681), (7, 0.012271134908766419), (9, 0.10449080877670892)]

TLG1183.TXT
[(1, 0.072608479940852372), (2, 0.58149769548894525), (4, 0.13563388540965154), (5, 0.076006210683269543), (7, 0.011471695115498681), (9, 0.10733686051355674)]

TLG1544.TXT
[(1, 0.086439593380960425), (2, 0.57876432015755408), (4, 0.13584549845162019), (5, 0.061953167081299935), (6, 0.0203950848

TLG0086.TXT
[(1, 0.062209576064715805), (2, 0.68112806212668009), (4, 0.084666067189141769), (5, 0.02930169953817862), (7, 0.016516761290493603), (9, 0.11291273540094096)]

TLG0505.TXT
[(1, 0.040256888148590554), (2, 0.62159976946510309), (4, 0.20974447680123029), (9, 0.12668417256994602)]

TLG2255.TXT
[(1, 0.049686286720545335), (2, 0.62855450603978968), (5, 0.031335174188029009), (6, 0.022838712841580233), (7, 0.10599781751696706), (8, 0.024124383024580368), (9, 0.1347597331949445)]

TLG2333.TXT
[(2, 0.4841774302945408), (4, 0.15116415130866023), (5, 0.099266353930247006), (9, 0.2542783473837858)]

TLG2798.TXT
[(1, 0.055176998729997893), (2, 0.60319439354423565), (4, 0.10438347265041682), (5, 0.051281472734927162), (7, 0.017428631356310492), (9, 0.15011801979063982)]

TLG0888.TXT
[(2, 0.45171416310973783), (4, 0.1552197090814291), (5, 0.13475998805396333), (7, 0.053561462261933909), (9, 0.19849300666551006)]

TLG4083.TXT
[(1, 0.068291217133155446), (2, 0.62067022435525177), (4, 0.087

TLG1765.TXT
[(1, 0.127499721939968), (2, 0.55339290835568644), (4, 0.13275963618693337), (5, 0.046003844459157048), (7, 0.016902623760077878), (9, 0.11252019284289162)]

TLG2632.TXT
[(1, 0.038470386563353887), (2, 0.62157267502347591), (4, 0.1058577651060606), (5, 0.045806948371120752), (7, 0.020193894479892786), (9, 0.15836729284445222)]

TLG1814.TXT
[(2, 0.6916439572983083), (4, 0.1151630812488046), (5, 0.036957675240625737), (7, 0.017574085147002479), (9, 0.11554166415976053)]

TLG1649.TXT
[(1, 0.12595199643992658), (2, 0.70590813700051702), (4, 0.026633952899899973), (9, 0.13654608613370983)]

TLG0012.TXT
[(1, 0.072073869294930593), (2, 0.59136451943658663), (4, 0.087571253081263997), (5, 0.042351656567488405), (6, 0.014858238342566887), (7, 0.031968557910000625), (9, 0.15263374708305527)]

TLG0363.TXT
[(1, 0.077623317291998534), (2, 0.58196424721379092), (4, 0.047870560302019541), (5, 0.089487330182497368), (6, 0.023700072717389359), (7, 0.01125579015297859), (9, 0.157165981221766

TLG9019.TXT
[(1, 0.052253890086007754), (2, 0.67884835514625474), (4, 0.046259755574672982), (5, 0.021718931239235583), (9, 0.18662590988659256)]

TLG2646.TXT
[(0, 0.037675043362549267), (1, 0.061756377131123273), (2, 0.52831804405804872), (4, 0.11136776134180845), (5, 0.06960599040069905), (9, 0.17154819781890396)]

TLG1414.TXT
[(1, 0.082238176546327063), (2, 0.61788419202829337), (4, 0.092691374431194754), (5, 0.044715882002334617), (7, 0.020628418189878834), (9, 0.12880466823533768)]

TLG0722.TXT
[(1, 0.068968206774302668), (2, 0.55347418196936848), (4, 0.13439105710579791), (5, 0.051849957089202772), (6, 0.013364097002592685), (7, 0.023616373374318385), (9, 0.14375776252151085)]

TLG0594.TXT
[(1, 0.065874169184602649), (2, 0.65845564925292643), (4, 0.061998380199827972), (5, 0.040816271641465819), (9, 0.1488856343709816)]

TLG0099.TXT
[(1, 0.078546428205501936), (2, 0.57961004039204544), (4, 0.13130505740475662), (5, 0.046319612997155667), (6, 0.013981722792211536), (7, 0.015806422

TLG5017.TXT
[(1, 0.070779570225845495), (2, 0.62698026150313613), (4, 0.086984747190457748), (5, 0.037565258969609654), (7, 0.013842219633920695), (9, 0.14420110225943836)]

TLG2354.TXT
[(1, 0.072195234828159066), (2, 0.60228310339358848), (4, 0.11336448476364562), (5, 0.079262263481587242), (7, 0.01848145179109743), (9, 0.10978282246484293)]

TLG0040.TXT
[(1, 0.025471126494602106), (2, 0.66964424686718926), (5, 0.048448449762189261), (6, 0.067958241257104438), (7, 0.028312621750926595), (9, 0.15436724342587088)]

TLG4032.TXT
[(1, 0.060266685365184397), (2, 0.6498422437015362), (4, 0.056706037336601935), (5, 0.034884751728333177), (9, 0.17983141895350685)]

TLG0243.TXT
[(2, 0.78486395766914141), (4, 0.065091717311257541), (5, 0.094039429107600631), (9, 0.051045019261350758)]

TLG1252.TXT
[(1, 0.075127992857088349), (2, 0.6544199756285769), (4, 0.055168384662593939), (5, 0.04266599961320125), (7, 0.023688674097776016), (9, 0.13621530328459186)]

TLG2115.TXT
[(1, 0.06709020997434792), (2

TLG2877.TXT
[(1, 0.083781778513978125), (2, 0.55527870224200571), (4, 0.088689820502305838), (5, 0.065555453255476695), (7, 0.031753353770517585), (9, 0.14999842420727061)]

TLG2112.TXT
[(1, 0.087757625285335705), (2, 0.58980475305726987), (4, 0.10713067279124511), (5, 0.044051343522783212), (7, 0.014391223017093018), (9, 0.13942554405092705)]

TLG0337.TXT
[(1, 0.056744553601037917), (2, 0.69699293945446628), (3, 0.022988886524060063), (4, 0.10838632885725663), (5, 0.024375886154490791), (7, 0.060526647740355835), (8, 0.026186545452684555)]

TLG0267.TXT
[(1, 0.052703325036608274), (2, 0.7049989297421505), (4, 0.049602956000240339), (5, 0.035852987945909999), (6, 0.013306304795297265), (7, 0.021502746923197136), (9, 0.12012696119890025)]

TLG1305.TXT
[(1, 0.033489629551728531), (2, 0.61205867176800932), (4, 0.16684554778035179), (6, 0.067894996594548743), (8, 0.048201889397730062), (9, 0.067734671579466679)]

TLG0417.TXT
[(1, 0.18191188400517205), (2, 0.57870010658487248), (4, 0.0932951

TLG3127.TXT
[(1, 0.072250814360547602), (2, 0.60085081122064765), (4, 0.11604686377282375), (5, 0.044216271154256657), (7, 0.014167791581972303), (9, 0.13769651440395542)]

TLG1506.TXT
[(0, 0.016469797760557084), (1, 0.11064102717119098), (2, 0.53822420329494403), (4, 0.1203893395838479), (5, 0.051007107226804037), (6, 0.014831565406747889), (7, 0.025349668014377193), (9, 0.1227505433852724)]

TLG2051.TXT
[(1, 0.074312791318145166), (2, 0.59574195206535741), (4, 0.11509253651938467), (5, 0.048683667730523107), (6, 0.013467700797886712), (7, 0.012269826573404195), (9, 0.13281389298960353)]

TLG4332.TXT
[(1, 0.099457496746052504), (2, 0.58466743710553981), (4, 0.10584064710039097), (5, 0.048574384778755662), (7, 0.013402235913480736), (9, 0.14136923285662217)]

TLG1813.TXT
[(1, 0.02985149563257547), (2, 0.62293637222019571), (7, 0.12285339961158788), (9, 0.2192292067181299)]

TLG1447.TXT
[(1, 0.072686780997325742), (2, 0.61991989486327403), (4, 0.090253587405358077), (5, 0.04417068402397

TLG2800.TXT
[(1, 0.089626288587359576), (2, 0.55143451412445332), (4, 0.12416130685899024), (5, 0.05904812711850238), (6, 0.010034078614709107), (7, 0.012188161328228345), (9, 0.14525348297754678)]

TLG4000.TXT
[(1, 0.082322304974555144), (2, 0.5651112959518968), (4, 0.098981806207071879), (5, 0.060573345423419855), (7, 0.013523102540927586), (9, 0.16046301781085892)]

TLG9021.TXT
[(1, 0.060808239981321725), (2, 0.63858371887095855), (4, 0.081861853422160902), (5, 0.03837692686405135), (7, 0.016045084273885901), (9, 0.14939204912371418)]

TLG3130.TXT
[(1, 0.10205693431557011), (2, 0.50456662450025536), (4, 0.15023742925893488), (5, 0.072487603654309313), (6, 0.020188723693626048), (7, 0.021697851852905934), (9, 0.1112679748875387)]

TLG1588.TXT
[(0, 0.021779381085833733), (1, 0.10152560310881353), (2, 0.43762449357168798), (4, 0.14981108719117084), (5, 0.052760938531821198), (6, 0.020975436870609623), (7, 0.055184046551904037), (9, 0.15957542105609621)]

TLG0502.TXT
[(1, 0.039471420308

[(1, 0.053135780450873207), (2, 0.67169222441252063), (4, 0.077074050563865529), (5, 0.070252092005890998), (7, 0.02018425174079426), (9, 0.1008239086570525)]

TLG2298.TXT
[(1, 0.13399062370687953), (2, 0.48534192992296321), (4, 0.17035042881382584), (9, 0.20317272192137156)]

TLG1801.TXT
[(1, 0.07429456549556411), (2, 0.51962134956832129), (4, 0.14578375710234584), (5, 0.071939195583001936), (6, 0.021276119553921068), (7, 0.030696553450857107), (9, 0.13530905376522884)]

TLG1737.TXT
[(2, 0.49883521309673146), (4, 0.3934949690773677), (5, 0.099131022266088623)]

TLG0540.TXT
[(1, 0.076731805735002448), (2, 0.65827369399848568), (4, 0.098931455732204132), (5, 0.043223887054170067), (7, 0.016823476227627598), (9, 0.089397217062593806)]

TLG2460.TXT
[(1, 0.32351123164574919), (2, 0.46500747305694679), (4, 0.16992638374625649), (5, 0.017019697697845265), (9, 0.020502164303066062)]

TLG4110.TXT
[(1, 0.068343070877419879), (2, 0.59537764639334878), (4, 0.10085577060343756), (5, 0.053560081318

TLG0748.TXT
[(1, 0.070811747849655729), (2, 0.56847992740322539), (4, 0.10359278568818188), (5, 0.059694978637492145), (6, 0.011361102048722167), (7, 0.024423491038550228), (9, 0.15185911553469994)]

TLG2012.TXT
[(0, 0.018936402941099967), (1, 0.067820510083906771), (2, 0.65115845847230036), (4, 0.15297936575553536), (5, 0.071465570037818421), (9, 0.016772018396264937)]

TLG1632.TXT
[(1, 0.074638132669363672), (2, 0.5414014685892089), (4, 0.12196103116907622), (5, 0.048613477603846836), (6, 0.011996227601220108), (7, 0.016998774310317939), (9, 0.16892844488852712)]

TLG1747.TXT
[(1, 0.046107476520064458), (2, 0.6560226449628962), (4, 0.13369262651416747), (5, 0.027146723425140878), (9, 0.12513671395909215)]

TLG0063.TXT
[(1, 0.078502683031564324), (2, 0.55438277409502812), (4, 0.12666502504429), (5, 0.028834055019344797), (6, 0.012397110154501786), (7, 0.03274838764048648), (9, 0.15912101755316163)]

TLG2034.TXT
[(1, 0.068273091180197712), (2, 0.62997149647449846), (4, 0.08710606088808

TLG7051.TXT
[(1, 0.05932988043034338), (2, 0.63516326980574322), (4, 0.063178161178252104), (5, 0.028884362465224522), (6, 0.012108927972110915), (7, 0.0102571794939931), (9, 0.18462881803275175)]

TLG4338.TXT
[(1, 0.087500793637327268), (2, 0.59928926595003573), (4, 0.10019781441453457), (5, 0.08137655186663989), (9, 0.12777161579021934)]

TLG2168.TXT
[(1, 0.066127963121050598), (2, 0.64664249260445594), (5, 0.049139050542971789), (9, 0.22791878550065403)]

TLG0380.TXT
[(1, 0.072975687879785853), (2, 0.58308844162238083), (4, 0.082359468612125453), (5, 0.077416451032025033), (6, 0.02472596919966729), (7, 0.052928094238478232), (8, 0.011389277737998903), (9, 0.09467597285362471)]

TLG0066.TXT
[(1, 0.070926759881074219), (2, 0.62233609085542696), (4, 0.10217816938306894), (5, 0.051874477453048795), (6, 0.012100012768809676), (7, 0.019524990447554239), (9, 0.11401660424980495)]

TLG0059.TXT
[(1, 0.075790106688360448), (2, 0.65948977646423912), (4, 0.090263533709317556), (5, 0.03485848828

TLG2023.TXT
[(1, 0.059913382847830786), (2, 0.62176741291080373), (4, 0.092582038946801531), (5, 0.040220682420336662), (6, 0.010833881689896536), (7, 0.015363510157864628), (9, 0.15330120814558604)]

TLG2506.TXT
[(1, 0.13023901175386599), (2, 0.37525000416894827), (4, 0.084523205047392622), (5, 0.35547150150978779), (9, 0.051429182400569991)]

TLG1127.TXT
[(1, 0.072944521435799284), (2, 0.56993100556749587), (4, 0.034782337426070599), (5, 0.03492663204818381), (6, 0.033670854086981464), (7, 0.020608134726592696), (9, 0.2330145871700316)]

TLG1736.TXT
[(1, 0.03044925772921752), (2, 0.586298357591303), (4, 0.1453162719668486), (7, 0.046253071192684514), (9, 0.1849243996256078)]

TLG3002.TXT
[(1, 0.062662232750699418), (2, 0.59669790829531633), (4, 0.072784786312179134), (5, 0.038745265938923953), (9, 0.21860384474020761)]

TLG4139.TXT
[(1, 0.074700155265828586), (2, 0.61731319376938032), (4, 0.10174765415046903), (5, 0.043470707346181151), (7, 0.010429989251275539), (9, 0.13576910670161

# Visualization

This section follows the pyLDAvis overview notebook: http://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb

In [112]:
# Inspect the topics before visualizing them. `lda_model` comes from an earlier cell.
# The displayed value is a list of (topic id, weighted-term string) pairs.
lda_model.show_topics()

[(0,
  '0.001*"διαφωνει" + 0.001*"διακεκαυμενη" + 0.000*"εναλλαγαι" + 0.000*"θερινη" + 0.000*"βουλομεθα" + 0.000*"οικετων" + 0.000*"καιροι" + 0.000*"λυσαωνθειηδεση" + 0.000*"ωργισμενους" + 0.000*"δεομενω"'),
 (1,
  '0.002*"νενευκασιν" + 0.001*"αδωμος" + 0.001*"μυουμενους" + 0.001*"νοτιωτερα" + 0.001*"διακειμενων" + 0.001*"αιστωτηριον" + 0.001*"προειρημενον" + 0.001*"επιταξ" + 0.001*"δηλουντος" + 0.001*"ασπετον"'),
 (2,
  '0.001*"αγνοιαν" + 0.001*"ακουσας" + 0.001*"ακρατον" + 0.001*"κινδυνευοντι" + 0.001*"λαμπτηρα" + 0.001*"εγενετο" + 0.000*"στοιχους" + 0.000*"ερωτος" + 0.000*"λαβομενη" + 0.000*"εναγισαντες"'),
 (3,
  '0.000*"ωργισμενους" + 0.000*"διαβοσκεσθαι" + 0.000*"απολυοντος" + 0.000*"ετοιμως" + 0.000*"σαρδονυχες" + 0.000*"νεα" + 0.000*"συμβαινει" + 0.000*"διαφωνει" + 0.000*"εξηρτημενων" + 0.000*"πενεσται"'),
 (4,
  '0.000*"νημερτεστατε" + 0.000*"φιλολογος" + 0.000*"απλανεσιν" + 0.000*"δηλαδη" + 0.000*"επιπλοκης" + 0.000*"θρασυλλωι" + 0.000*"διαβοσκεσθαι" + 0.000*"επωνομαζετο" + 0

In [115]:
# Import the gensim adapter of pyLDAvis, used by the next cell to prepare the visualization.
# NOTE(review): newer pyLDAvis releases appear to have renamed this module to
# `pyLDAvis.gensim_models` -- confirm against the installed version.
import pyLDAvis.gensim

# Turn on notebook display so prepared pyLDAvis visualizations render inline in cell output.
pyLDAvis.enable_notebook()

In [116]:
# Build the pyLDAvis visualization from the LDA model, the bag-of-words corpus
# (`mm_corpus`) and the id-to-word mapping (`id2word_tlg`). As the last expression
# in the cell, the prepared result is what the notebook displays.
# NOTE(review): the pandas `.ix` deprecation warning printed by this cell seems to
# originate inside the library call rather than in notebook code -- confirm.
pyLDAvis.gensim.prepare(lda_model, mm_corpus, id2word_tlg)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
