In [1]:
import itertools
import logging
import os
import pickle

from cltk.stop.greek.stops import STOPS_LIST
import gensim
# from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.corpora.mmcorpus import MmCorpus
from gensim.utils import smart_open, simple_preprocess
import numpy as np

In [2]:
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [3]:
STOPS_LIST = [simple_preprocess(stop, deacc=True)[0] for stop in STOPS_LIST if len(simple_preprocess(stop, deacc=True)) > 0]
STOPS_LIST += ["τηϲ", "τοιϲ", "εϲτι", "προϲ", "ειϲ", "ταϲ", "ωϲ", "τουϲ"]

In [4]:
def tokenize(text):
    # https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
    #! todo: rm words shorter than 3
    tokens = [token for token in simple_preprocess(text, deacc=True)]
    return [token for token in tokens if token not in STOPS_LIST]
    

def iter_tlg(tlg_dir):
    file_names = os.listdir(tlg_dir)
    for file_name in file_names:
        file_path = os.path.join(tlg_dir, file_name)
        with open(file_path) as file_open:
            file_read = file_open.read()
        tokens = tokenize(file_read)
        # ignore short docs
        if len(tokens) < 50:
            continue
        yield file_name, tokens

In [5]:
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
stream = iter_tlg(tlg_preprocessed)

In [6]:
for title, tokens in itertools.islice(iter_tlg(tlg_preprocessed), 8):
    print(title, tokens[:10])  # print the article title and its first ten tokens

TLG2346.TXT ['πολυκριτης', 'ιστορια', 'αυτη', 'εληφθη', 'αʹ', 'ανδρισκου', 'ναξιακων', 'γραφει', 'αυτης', 'θεοφραστος']
TLG1389.TXT ['αβαρις', 'ονομα', 'κυριον', 'λοιμου', 'φασι', 'πασαν', 'οικου', 'μενην', 'γεγονοτος', 'ανειλεν']
TLG0404.TXT ['ειτ', 'γυναικος', 'εστιν', 'ευνοικωτερον', 'γαμετης', 'εταιρα', 'πολυ', 'μαλ', 'εικοτως', 'νομω']
TLG0235.TXT ['εκητι', 'συλοσωντος', 'ευρυχωριη', 'πολλα', 'μεταιχμιωι', 'νοτος', 'κυλινδει', 'κυματ', 'ευρειης', 'αλος']
TLG0535.TXT ['αναιτιον', 'αιτιον', 'οιον', 'αμα', 'τουτο', 'γεγονεναι', 'τουτο', 'λαμβανουσιν', 'μαλιστα', 'ταις']
TLG0507.TXT ['σφιγγ', 'αρρεν', 'μαγειρον', 'οικιαν', 'ειληφ', 'απλως', 'μα', 'θεους', 'ων', 'λεγη']
TLG1816.TXT ['φωσφορε', 'φωσφορεουσα', 'φιλων', 'φως', 'φως', 'φερε', 'λαμπας', 'μοι', 'τεον', 'αεισαι']
TLG0476.TXT ['πυθαγοριστι', 'θυομεν', 'λοξια', 'εμψυχον', 'ουδεν', 'εσθιοντες', 'παντελως', 'βοιωτιος', 'ολιγα', 'αλλων']


# Mk word dictionaries

In [7]:
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))

In [8]:
%time id2word_tlg = gensim.corpora.Dictionary(doc_stream)
print(id2word_tlg)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(1161917 unique tokens: ['πολυκριτης', 'ιστορια', 'αυτη', 'εληφθη', 'αʹ']...) from 1484 documents (total 44655183 corpus positions)


CPU times: user 4min 5s, sys: 2.17 s, total: 4min 7s
Wall time: 4min 7s
Dictionary(1161917 unique tokens: ['πολυκριτης', 'ιστορια', 'αυτη', 'εληφθη', 'αʹ']...)


In [9]:
# this cutoff might lose too much info, we'll see
# ignore words that appear in less than 20 documents or more than 10% documents
id2word_tlg.filter_extremes(no_below=20, no_above=0.1)
print(id2word_tlg)

INFO : discarding 1077805 tokens: [('πολυκριτης', 4), ('ιστορια', 280), ('αυτη', 876), ('εληφθη', 150), ('αʹ', 319), ('ανδρισκου', 7), ('ναξιακων', 4), ('γραφει', 417), ('αυτης', 873), ('θεοφραστος', 150)]...
INFO : keeping 84112 tokens which were in no less than 20 and no more than 148 (=10.0%) documents
INFO : resulting dictionary: Dictionary(84112 unique tokens: ['συνεβησαν', 'ετεμνον', 'εφρουρουν', 'δηλιω', 'ερυθραιων']...)


Dictionary(84112 unique tokens: ['συνεβησαν', 'ετεμνον', 'εφρουρουν', 'δηλιω', 'ερυθραιων']...)


# Mk vectors

Now start again with the corpus, turning the actual words into integers from our map.

In [10]:
doc = "περὶ ποιητικῆς αὐτῆς τε καὶ τῶν εἰδῶν αὐτῆς, ἥν τινα δύναμιν ἕκαστον ἔχει, καὶ πῶς δεῖ συνίστασθαι τοὺς μύθους [10] εἰ μέλλει καλῶς ἕξειν ἡ ποίησις, ἔτι δὲ ἐκ πόσων καὶ ποίων ἐστὶ μορίων, ὁμοίως δὲ καὶ περὶ τῶν ἄλλων ὅσα τῆς αὐτῆς ἐστι μεθόδου, λέγωμεν ἀρξάμενοι κατὰ φύσιν πρῶτον ἀπὸ τῶν πρώτων."
doc = ' '.join(simple_preprocess(doc))
bow = id2word_tlg.doc2bow(tokenize(doc))
print(bow)

[(6873, 1), (12316, 1)]


In [11]:
print(id2word_tlg[6880], id2word_tlg[12323])

ρητορικα εργασιων


In [12]:
class TLGCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """Yield each document in turn, as a list of tokens (unicode strings).
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs
    
    def __iter__(self):
        self.titles = []
        for title, tokens in itertools.islice(iter_tlg(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)
    
    def __len__(self):
        return self.clip_docs

In [13]:
# create a stream of bag-of-words vectors
tlg_corpus = TLGCorpus(tlg_preprocessed, id2word_tlg)
vector = next(iter(tlg_corpus))
print(vector)  # print the first vector in the stream

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1)]


In [14]:
# what is the most common word in that first article?
most_index, most_count = max(vector, key=lambda _tuple: _tuple[1])
print(id2word_tlg[most_index], most_count)

μιλησιοις 2


In [15]:
# Save BoW
user_dir = os.path.expanduser('~/cltk_data/user_data/')
try:
    os.makedirs(user_dir)
except FileExistsError:
    pass
bow_path = os.path.join(user_dir, 'bow_lda_gensim.mm')

%time gensim.corpora.MmCorpus.serialize(bow_path, tlg_corpus)

INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/bow_lda_gensim.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/bow_lda_gensim.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1484x84112 matrix, density=3.173% (3960894/124822208)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/bow_lda_gensim.mm.index


CPU times: user 3min 59s, sys: 2.37 s, total: 4min 1s
Wall time: 4min 1s


In [16]:
mm_corpus = gensim.corpora.MmCorpus(bow_path)
print(mm_corpus)

INFO : loaded corpus index from /home/kyle/cltk_data/user_data/bow_lda_gensim.mm.index
INFO : initializing corpus reader from /home/kyle/cltk_data/user_data/bow_lda_gensim.mm
INFO : accepted corpus with 1484 documents, 84112 features, 3960894 non-zero entries


MmCorpus(1484 documents, 84112 features, 3960894 non-zero entries)


In [17]:
print(next(iter(mm_corpus)))

[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 2.0), (8, 1.0), (9, 1.0), (10, 1.0), (11, 1.0), (12, 1.0), (13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0), (18, 1.0), (19, 1.0), (20, 1.0), (21, 1.0), (22, 1.0), (23, 1.0), (24, 1.0), (25, 1.0), (26, 1.0), (27, 1.0), (28, 1.0), (29, 1.0), (30, 1.0), (31, 1.0), (32, 1.0), (33, 1.0), (34, 1.0), (35, 1.0), (36, 1.0), (37, 1.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 1.0), (42, 1.0), (43, 1.0), (44, 1.0), (45, 1.0), (46, 1.0), (47, 1.0), (48, 2.0), (49, 1.0), (50, 1.0), (51, 1.0), (52, 1.0), (53, 1.0), (54, 1.0), (55, 1.0), (56, 1.0), (57, 1.0), (58, 1.0), (59, 1.0), (60, 1.0), (61, 1.0), (62, 1.0), (63, 1.0), (64, 1.0), (65, 1.0), (66, 1.0), (67, 1.0), (68, 1.0), (69, 1.0)]


# LDA transformation

In [18]:
# Save for reuse
with open(os.path.expanduser('~/cltk_data/user_data/tlg_bow_id2word.dict'), 'rb') as file_open:
    id2word_tlg = pickle.load(file_open)

In [37]:
# Quick testing using just a part of the corpus

# use fewer documents during training, LDA is slow
# clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, 100)
%time lda_model = gensim.models.LdaMulticore(mm_corpus, num_topics=50, id2word=id2word_tlg, passes=100)

INFO : using symmetric alpha at 0.02
INFO : using symmetric eta at 1.1887779362815025e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 50 topics, 100 passes over the supplied corpus of 1484 documents, updating every 14000 documents, evaluating every ~1484 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 7 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #48 (0.020): 0.001*"συνεργους" + 0.001*"στοχαστικως" + 0.001*"ρυθμου" + 0.000*"αλφα" + 0.000*"επισκοπος" + 0.000*"μο" + 0.000*"ϲε" + 0.000*"στον" + 0.000*"εγνωριζετο" + 0.000*"πεντεκαιδεκατον"
INFO : topic #13 (0.020): 0.001*"ροβοαμ" + 0.001*"ρυθμου" + 0.001*"αναληψιν" + 0.001*"αττικοι" + 0.001*"προστεταγμενον" + 0.001*"ευτελεις" + 0.001*"κυριας" + 0.000*"συνεργους" + 0.000*"στον" + 0.000*"εγνωριζετο"
INFO : topic #27 (0.020): 0.001*"επισκοπος" + 0.001*"ροβοαμ" + 0.001*"στοχαστικω

INFO : topic diff=2.038873, rho=0.385128
INFO : -10.564 per-word bound, 1514.3 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 6, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #17 (0.020): 0.005*"ηπειροιο" + 0.004*"σιδηριτιν" + 0.003*"τειχεος" + 0.003*"σεο" + 0.002*"λευσσων" + 0.002*"διονυσω" + 0.002*"ικοντο" + 0.002*"αστεριας" + 0.002*"αφενος" + 0.002*"φερεν"
INFO : topic #26 (0.020): 0.002*"εκτεινεν" + 0.001*"νοηματων" + 0.001*"ϲε" + 0.001*"αλμυρον" + 0.001*"καταλεξω" + 0.001*"διοικει" + 0.001*"εωντων" + 0.001*"πολους" + 0.001*"πυργων" + 0.001*"φασκουσαν"
INFO : topic #41 (0.020): 0.047*"πεντεκαιδεκατον" + 0.006*"ασματων" + 0.005*"χηρα" + 0.004*"νισου" + 0.003*"ακρωτηριον" + 0.003*"ξʹ" + 0.003*"παρερχομενοι" + 0.002*"ρʹ" + 0.002*"εκφευγουσι" + 0.002*"μυσαραν"
INFO : topic #22 (0.020): 0.002*"ρητορικη" + 0.001*"ενυλα" + 0.001*"εκαυθη" + 0.001*"ναρθηκας" + 0.001*"τελεωτερας" + 0.001*"γα

INFO : topic #15 (0.020): 0.004*"διαλογος" + 0.003*"θαυμασται" + 0.002*"μεσοτης" + 0.002*"διατελουσι" + 0.002*"εγνωκα" + 0.002*"αντιστηναι" + 0.002*"διαλογοις" + 0.002*"προδικος" + 0.002*"μεταβαλλεται" + 0.002*"γοργια"
INFO : topic #46 (0.020): 0.003*"προσηκατο" + 0.003*"αττικοι" + 0.003*"συνιστανται" + 0.002*"διαχει" + 0.002*"ποταμιοι" + 0.002*"αλοχους" + 0.001*"πανσεληνω" + 0.001*"αρχετυπου" + 0.001*"εμψυχω" + 0.001*"εκλαμποντα"
INFO : topic #16 (0.020): 0.011*"αμφισβητησιν" + 0.004*"επιρρημα" + 0.003*"συγκοπην" + 0.003*"απορουμενοις" + 0.003*"επιλογου" + 0.003*"υπερβολων" + 0.003*"ρυθμοις" + 0.002*"ευειδεις" + 0.002*"καμπανοι" + 0.002*"ονομαστι"
INFO : topic #33 (0.020): 0.003*"εξηρημενης" + 0.002*"κερως" + 0.002*"θεοπομπος" + 0.002*"αλεξις" + 0.001*"ερμιππος" + 0.001*"αντιφανης" + 0.001*"πδ" + 0.001*"συκα" + 0.001*"λωστε" + 0.001*"τετυκται"
INFO : topic diff=2.194567, rho=0.269758
INFO : -10.260 per-word bound, 1226.1 perplexity estimate based on a held-out corpus of 1484 documents

INFO : topic #19 (0.020): 0.019*"λευκοτητα" + 0.013*"μιγμα" + 0.010*"πυελος" + 0.010*"ροιας" + 0.008*"απαγγειλαι" + 0.007*"αλισκηται" + 0.006*"αδηλα" + 0.005*"κατενεχθησεται" + 0.005*"κολακευων" + 0.005*"κολλη"
INFO : topic #25 (0.020): 0.006*"συμπληροι" + 0.005*"ερμηνεια" + 0.004*"προαγορευει" + 0.003*"ανοδον" + 0.002*"ταχυν" + 0.002*"κοιμωμενους" + 0.002*"αλοχω" + 0.002*"παρεχουσα" + 0.002*"ακολαστους" + 0.002*"παρεδιδουν"
INFO : topic diff=0.933939, rho=0.225063
INFO : -10.208 per-word bound, 1183.0 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 19, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #39 (0.020): 0.008*"κωλυοντι" + 0.007*"αιτησις" + 0.005*"δηεις" + 0.005*"ανερχομενον" + 0.005*"απονον" + 0.005*"συμπληρουσθαι" + 0.004*"υβρισαντες" + 0.004*"εισαγη" + 0.004*"μουνοι" + 0.004*"αποστασεις"
INFO : topic #29 (0.020): 0.011*"υπατον" + 0.004*"εκαταιος" + 0.003*"αζ" + 0.003*"κολοφων"

INFO : topic #40 (0.020): 0.020*"τρεπων" + 0.006*"μελετωμεν" + 0.004*"χαιρω" + 0.004*"εμφερεις" + 0.003*"διερχομενου" + 0.003*"θρηνουσαν" + 0.003*"νεστοριος" + 0.003*"βιαι" + 0.003*"παρισταμενον" + 0.002*"μαθωσιν"
INFO : topic diff=0.354359, rho=0.197096
INFO : -10.191 per-word bound, 1168.8 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 25, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #0 (0.020): 0.025*"μο" + 0.012*"μοιρων" + 0.010*"ζωας" + 0.010*"ιβ" + 0.009*"ιδ" + 0.008*"στομαχου" + 0.007*"γαληνου" + 0.007*"ιγ" + 0.006*"ισαριθμων" + 0.005*"εμφαινομενα"
INFO : topic #18 (0.020): 0.032*"συνεργους" + 0.019*"αλφα" + 0.017*"δελτα" + 0.017*"εξαιτεισθαι" + 0.014*"παραδεδομενας" + 0.012*"σχισθηναι" + 0.011*"λμ" + 0.011*"αγραφα" + 0.010*"βητα" + 0.008*"λευ"
INFO : topic #3 (0.020): 0.009*"ευειδεις" + 0.007*"κελυφος" + 0.006*"κογχυλιων" + 0.006*"γενικη" + 0.006*"σπληνος" + 0.005*"διισχυριζον

INFO : PROGRESS: pass 31, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #16 (0.020): 0.010*"αμφισβητησιν" + 0.004*"επιρρημα" + 0.003*"αττικοι" + 0.003*"συγκοπην" + 0.003*"απορουμενοις" + 0.003*"επιλογου" + 0.003*"υπερβολων" + 0.002*"ρυθμοις" + 0.002*"δακνουσι" + 0.002*"καμπανοι"
INFO : topic #39 (0.020): 0.009*"κωλυοντι" + 0.009*"αιτησις" + 0.005*"υβρισαντες" + 0.005*"συμπληρουσθαι" + 0.005*"ανερχομενον" + 0.005*"δηεις" + 0.005*"εισαγη" + 0.004*"απονον" + 0.004*"πεζων" + 0.004*"μουνοι"
INFO : topic #45 (0.020): 0.016*"κοσμεισθαι" + 0.013*"ιστανται" + 0.010*"ανακαθαραι" + 0.010*"εξαιτειται" + 0.008*"ανθρωπειαν" + 0.008*"παιω" + 0.007*"διεπεμψατο" + 0.006*"διαφθαρεντος" + 0.006*"μαργαρον" + 0.006*"προκριναντες"
INFO : topic #14 (0.020): 0.003*"συγκλητω" + 0.002*"ιππεις" + 0.001*"θελουσι" + 0.001*"κατορθοι" + 0.001*"παραλυσας" + 0.001*"επεσχον" + 0.001*"λεσβιων" + 0.001*"αρμενιας" + 0.001*"επιστολαι" + 0.001*"ετοιμοτερος"
INFO : topic #27 (0.020):

INFO : topic #1 (0.020): 0.002*"επιτασεις" + 0.002*"φευγοντος" + 0.002*"δημοσθενους" + 0.002*"ψηφισμα" + 0.002*"αισχινης" + 0.002*"αντιτιθεις" + 0.002*"σαφει" + 0.002*"αδικημα" + 0.001*"κατεχοντες" + 0.001*"συντελουντα"
INFO : topic #38 (0.020): 0.004*"ροβοαμ" + 0.003*"προστεταγμενον" + 0.003*"εννοω" + 0.003*"ρυθμου" + 0.002*"εμηχανησατο" + 0.002*"ταλαιπωροι" + 0.002*"παμφυλιαν" + 0.002*"λαμπραι" + 0.002*"εγγονοι" + 0.002*"επισκοπος"
INFO : topic #39 (0.020): 0.009*"κωλυοντι" + 0.009*"αιτησις" + 0.006*"ετητυμον" + 0.005*"υβρισαντες" + 0.005*"συμπληρουσθαι" + 0.005*"πεζων" + 0.005*"ανερχομενον" + 0.005*"εισαγη" + 0.005*"δηεις" + 0.004*"απονον"
INFO : topic #37 (0.020): 0.004*"προσαγωγην" + 0.003*"οριστου" + 0.003*"ακρους" + 0.002*"εποιησω" + 0.002*"θαυμασθησεται" + 0.002*"οισουσιν" + 0.002*"χιμαιρα" + 0.002*"πλατειαι" + 0.002*"οριστεον" + 0.002*"δηλης"
INFO : topic diff=0.076985, rho=0.160660
INFO : -10.174 per-word bound, 1155.3 perplexity estimate based on a held-out corpus of 1484 do

INFO : topic #23 (0.020): 0.028*"κατορθουντας" + 0.013*"συνεχοντες" + 0.012*"ασελγης" + 0.010*"αναξιμενης" + 0.007*"θεασοιτο" + 0.006*"εδουλευσαν" + 0.005*"αισθητου" + 0.004*"θαλεια" + 0.004*"μηνιειν" + 0.003*"παρει"
INFO : topic #2 (0.020): 0.004*"αλμυρον" + 0.003*"αρισταρχος" + 0.003*"ψιλην" + 0.002*"αιακιδην" + 0.002*"στρατιην" + 0.002*"ολεσθαι" + 0.002*"γελας" + 0.002*"σοφιης" + 0.002*"θαλιας" + 0.002*"επικαλουμαι"
INFO : topic diff=0.050599, rho=0.149500
INFO : -10.169 per-word bound, 1151.6 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 44, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #3 (0.020): 0.009*"ευειδεις" + 0.007*"κελυφος" + 0.006*"κογχυλιων" + 0.006*"γενικη" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"
INFO : topic #24 (0.020): 0.000*"τημερον" + 0.000*"δημοσθενους" + 0.000*"αινιττονται" + 0.00

INFO : topic #37 (0.020): 0.005*"προσαγωγην" + 0.003*"οριστου" + 0.003*"ακρους" + 0.002*"εποιησω" + 0.002*"θαυμασθησεται" + 0.002*"οισουσιν" + 0.002*"χιμαιρα" + 0.002*"πλατειαι" + 0.002*"οριστεον" + 0.002*"αναβησεται"
INFO : topic diff=0.037205, rho=0.140384
INFO : -10.166 per-word bound, 1148.7 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 50, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #8 (0.020): 0.003*"μακροτερος" + 0.003*"λεπτου" + 0.002*"φιλιου" + 0.002*"ινδικης" + 0.002*"ελαια" + 0.002*"ωνομαζε" + 0.001*"προσθεντος" + 0.001*"ετητυμον" + 0.001*"ατλαντιδος" + 0.001*"τειχισαι"
INFO : topic #15 (0.020): 0.011*"γοργια" + 0.008*"φαιδρω" + 0.007*"δηλουσαι" + 0.007*"μεταβαλλεται" + 0.007*"εισκρινεσθαι" + 0.006*"διαλογος" + 0.004*"διαλογοις" + 0.004*"θαυμασται" + 0.004*"μεσοτης" + 0.003*"προδικος"
INFO : topic #16 (0.020): 0.009*"αμφισβητησιν" + 0.004*"επιρρημα" + 0.003*"αττικοι" + 0.

INFO : topic diff=0.029551, rho=0.132754
INFO : -10.163 per-word bound, 1146.4 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 56, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #29 (0.020): 0.010*"υπατον" + 0.003*"εκαταιος" + 0.003*"αζ" + 0.003*"κολοφων" + 0.002*"ελλανικος" + 0.002*"κολαζεσθαι" + 0.002*"αναπαυσεως" + 0.002*"αρρυθμος" + 0.002*"γαυρος" + 0.002*"τερπανδρον"
INFO : topic #18 (0.020): 0.032*"συνεργους" + 0.019*"αλφα" + 0.018*"δελτα" + 0.017*"εξαιτεισθαι" + 0.014*"παραδεδομενας" + 0.012*"σχισθηναι" + 0.012*"λμ" + 0.011*"αγραφα" + 0.011*"βητα" + 0.008*"λευ"
INFO : topic #8 (0.020): 0.003*"μακροτερος" + 0.003*"λεπτου" + 0.002*"φιλιου" + 0.002*"ινδικης" + 0.002*"ελαια" + 0.002*"ωνομαζε" + 0.001*"προσθεντος" + 0.001*"ατλαντιδος" + 0.001*"κτησιας" + 0.001*"τειχισαι"
INFO : topic #23 (0.020): 0.025*"κατορθουντας" + 0.014*"συνεχοντες" + 0.013*"αναξιμενης" + 0.013*"ασελγης" + 0.010*"

INFO : topic #42 (0.020): 0.004*"ιλλυρικον" + 0.002*"κατεχουσιν" + 0.002*"γαιος" + 0.002*"κοσμεισθαι" + 0.002*"συναρμοσας" + 0.002*"ληιζομενος" + 0.002*"ημφιεστο" + 0.002*"αφορων" + 0.002*"αποδωσω" + 0.002*"πειν"
INFO : topic #20 (0.020): 0.011*"τηλικουτος" + 0.010*"υποθεμενος" + 0.009*"μοιρων" + 0.009*"πολον" + 0.009*"ελαχιστω" + 0.008*"υποκειμενος" + 0.008*"κεντρω" + 0.007*"λζʹ" + 0.007*"υπακουοντα" + 0.007*"γνωμων"
INFO : topic #23 (0.020): 0.023*"κατορθουντας" + 0.014*"συνεχοντες" + 0.014*"σφαλλονται" + 0.014*"αναξιμενης" + 0.013*"ασελγης" + 0.012*"ακραι" + 0.010*"αισθητου" + 0.008*"εδουλευσαν" + 0.008*"αυλως" + 0.008*"θεασοιτο"
INFO : topic #6 (0.020): 0.003*"διδαγματα" + 0.003*"δαμων" + 0.003*"δαμωνος" + 0.002*"ισχυε" + 0.002*"αρχαιης" + 0.002*"μεμφομενη" + 0.002*"πληρουντες" + 0.002*"πειριθους" + 0.002*"απολειπομενος" + 0.002*"διαλεξεσι"
INFO : topic diff=0.024284, rho=0.125253
INFO : -10.160 per-word bound, 1144.0 perplexity estimate based on a held-out corpus of 1484 documents

INFO : topic #3 (0.020): 0.009*"ευειδεις" + 0.007*"κελυφος" + 0.006*"γενικη" + 0.006*"κογχυλιων" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"
INFO : topic #37 (0.020): 0.005*"προσαγωγην" + 0.003*"οριστου" + 0.003*"ακρους" + 0.002*"εποιησω" + 0.002*"θαυμασθησεται" + 0.002*"οισουσιν" + 0.002*"χιμαιρα" + 0.002*"πλατειαι" + 0.002*"οριστεον" + 0.002*"αναβησεται"
INFO : topic diff=0.020400, rho=0.119744
INFO : -10.158 per-word bound, 1142.3 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 69, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #45 (0.020): 0.017*"ιστανται" + 0.014*"δεσποται" + 0.014*"κοσμεισθαι" + 0.013*"ακριτα" + 0.013*"πεμπεις" + 0.010*"αφορων" + 0.009*"πειν" + 0.009*"ανακαθαραι" + 0.009*"εξαιτειται" + 0.008*"ανθρωπειαν"
INFO : topic #49 (0.020): 0.017*"εξοριαν" + 0.009*"αγουση" + 0.009*"θεοδοσιος" + 0.0

INFO : topic #4 (0.020): 0.014*"ηρακλειας" + 0.006*"εξεκαυσε" + 0.005*"μηστωρ" + 0.005*"φονευσαντα" + 0.003*"ωντινων" + 0.003*"κρατητα" + 0.002*"κρατης" + 0.002*"κυπριοι" + 0.002*"νευοντας" + 0.002*"οπισθε"
INFO : topic diff=0.017401, rho=0.114903
INFO : -10.156 per-word bound, 1140.8 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 75, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #12 (0.020): 0.002*"κυριας" + 0.002*"θερμοτης" + 0.001*"ρυθμου" + 0.001*"εθελησας" + 0.001*"χυμος" + 0.001*"τελευταιαι" + 0.001*"εψευσθη" + 0.001*"απελπισας" + 0.001*"συσκευην" + 0.001*"συνταττειν"
INFO : topic #43 (0.020): 0.186*"στοχαστικως" + 0.071*"νεται" + 0.056*"αφιεσθαι" + 0.046*"πολιες" + 0.039*"επιμενη" + 0.017*"καταπετασματος" + 0.016*"σχισθηναι" + 0.014*"τεταγμενου" + 0.013*"προισταται" + 0.013*"νο"
INFO : topic #24 (0.020): 0.000*"τημερον" + 0.000*"αινιττονται" + 0.000*"δημοσθενους" + 0.000*"αναλισ

INFO : topic diff=0.015096, rho=0.110606
INFO : -10.154 per-word bound, 1139.6 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 81, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #6 (0.020): 0.004*"διδαγματα" + 0.003*"δαμων" + 0.003*"δαμωνος" + 0.002*"ισχυε" + 0.002*"αρχαιης" + 0.002*"απολειπομενος" + 0.002*"μεμφομενη" + 0.002*"πληρουντες" + 0.002*"διαλεξεσι" + 0.002*"μαθησομεθα"
INFO : topic #12 (0.020): 0.002*"κυριας" + 0.002*"θερμοτης" + 0.001*"ρυθμου" + 0.001*"εθελησας" + 0.001*"χυμος" + 0.001*"τελευταιαι" + 0.001*"εψευσθη" + 0.001*"απελπισας" + 0.001*"συσκευην" + 0.001*"συνταττειν"
INFO : topic #3 (0.020): 0.010*"ευειδεις" + 0.007*"κελυφος" + 0.006*"γενικη" + 0.006*"κογχυλιων" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"
INFO : topic #37 (0.020): 0.005*"προσαγωγην" + 0.003*"οριστου" + 0.003*"ακρους" + 0.003

INFO : topic #21 (0.020): 0.012*"καλλισθενης" + 0.011*"ζωιον" + 0.011*"θεραποντι" + 0.011*"τιτυον" + 0.010*"καθαρσιοις" + 0.009*"εδουλευον" + 0.008*"υδρευσασθαι" + 0.005*"ιστορουσιν" + 0.005*"μαξιμιανος" + 0.005*"εμακαριζον"
INFO : topic #2 (0.020): 0.004*"αλμυρον" + 0.004*"αρισταρχος" + 0.003*"ψιλην" + 0.002*"αιακιδην" + 0.002*"στρατιην" + 0.002*"ολεσθαι" + 0.002*"γελας" + 0.002*"σοφιης" + 0.002*"θαλιας" + 0.002*"επικαλουμαι"
INFO : topic #26 (0.020): 0.006*"εκτεινεν" + 0.004*"νοηματων" + 0.003*"εωντων" + 0.003*"καταλεξω" + 0.002*"διοικει" + 0.002*"πολους" + 0.002*"φασκουσαν" + 0.002*"πυργων" + 0.002*"ολι" + 0.002*"φιλτατοις"
INFO : topic #18 (0.020): 0.032*"συνεργους" + 0.019*"αλφα" + 0.018*"δελτα" + 0.017*"εξαιτεισθαι" + 0.014*"παραδεδομενας" + 0.012*"σχισθηναι" + 0.012*"λμ" + 0.011*"αγραφα" + 0.011*"βητα" + 0.008*"λευ"
INFO : topic #38 (0.020): 0.005*"ροβοαμ" + 0.003*"προστεταγμενον" + 0.003*"εννοω" + 0.003*"ταλαιπωροι" + 0.003*"εμηχανησατο" + 0.002*"ρυθμου" + 0.002*"ιωσηπος" + 0.0

INFO : topic #46 (0.020): 0.004*"προσηκατο" + 0.004*"συνιστανται" + 0.002*"αλοχους" + 0.002*"ποταμιοι" + 0.002*"διαχει" + 0.002*"καταλιμπανει" + 0.002*"αρχετυπου" + 0.002*"ενυλα" + 0.002*"πανσεληνω" + 0.002*"ομοιοτης"
INFO : topic #40 (0.020): 0.017*"τρεπων" + 0.006*"μελετωμεν" + 0.004*"χαιρω" + 0.003*"θρηνουσαν" + 0.003*"εμφερεις" + 0.003*"νεστοριος" + 0.003*"διερχομενου" + 0.003*"βιαι" + 0.003*"παρισταμενον" + 0.002*"μαθωσιν"
INFO : topic #4 (0.020): 0.014*"ηρακλειας" + 0.006*"εξεκαυσε" + 0.005*"φονευσαντα" + 0.005*"μηστωρ" + 0.003*"ωντινων" + 0.003*"κρατητα" + 0.002*"κρατης" + 0.002*"κυπριοι" + 0.002*"νευοντας" + 0.002*"οπισθε"
INFO : topic diff=0.011580, rho=0.102737
INFO : -10.152 per-word bound, 1137.5 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words
INFO : PROGRESS: pass 94, dispatched chunk #0 = documents up to #1484/1484, outstanding queue size 1
INFO : topic #5 (0.020): 0.003*"αναληψιν" + 0.003*"ρυθμου" + 0.003*"ροβοαμ" + 0.002*"ευτελεις" +

INFO : topic #25 (0.020): 0.005*"συμπληροι" + 0.004*"εξηρημενης" + 0.003*"ερμηνεια" + 0.003*"προαγορευει" + 0.002*"αγνοησαι" + 0.002*"ανοδον" + 0.002*"πετεσθαι" + 0.002*"αιγες" + 0.002*"κομψοι" + 0.002*"φασκοντος"
INFO : topic diff=0.010413, rho=0.099631
INFO : -10.151 per-word bound, 1136.7 perplexity estimate based on a held-out corpus of 1484 documents with 12049996 words


CPU times: user 5h 36min 30s, sys: 6h 57min 23s, total: 12h 33min 53s
Wall time: 4h 9min 35s


In [38]:
lda_model.print_topics(-1)  # print a few most important words for each LDA topic

INFO : topic #0 (0.020): 0.024*"μο" + 0.014*"μοιρων" + 0.011*"ιβ" + 0.011*"ζωας" + 0.009*"ιδ" + 0.008*"γαληνου" + 0.008*"στομαχου" + 0.007*"ιγ" + 0.007*"ισαριθμων" + 0.006*"εμφαινομενα"
INFO : topic #1 (0.020): 0.002*"επιτασεις" + 0.002*"φευγοντος" + 0.002*"δημοσθενους" + 0.002*"ψηφισμα" + 0.002*"αισχινης" + 0.002*"αντιτιθεις" + 0.002*"σαφει" + 0.002*"αδικημα" + 0.002*"συντελουντα" + 0.002*"κατεχοντες"
INFO : topic #2 (0.020): 0.004*"αλμυρον" + 0.004*"αρισταρχος" + 0.003*"ψιλην" + 0.002*"αιακιδην" + 0.002*"στρατιην" + 0.002*"ολεσθαι" + 0.002*"γελας" + 0.002*"σοφιης" + 0.002*"θαλιας" + 0.002*"επικαλουμαι"
INFO : topic #3 (0.020): 0.010*"ευειδεις" + 0.007*"κελυφος" + 0.006*"γενικη" + 0.006*"κογχυλιων" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"
INFO : topic #4 (0.020): 0.014*"ηρακλειας" + 0.006*"εξεκαυσε" + 0.006*"φονευσαντα" + 0.005*"μηστωρ" + 0.003*"ωντινων" + 0.003*"κρατητα" + 0.002*"κρατης" + 0.002*"κυπριο

INFO : topic #39 (0.020): 0.025*"ετητυμον" + 0.011*"κωλυοντι" + 0.009*"αιτησις" + 0.006*"πεζων" + 0.006*"υβρισαντες" + 0.006*"συμπληρουσθαι" + 0.005*"ιππεας" + 0.005*"εισαγη" + 0.004*"ανερχομενον" + 0.004*"δηεις"
INFO : topic #40 (0.020): 0.017*"τρεπων" + 0.006*"μελετωμεν" + 0.004*"χαιρω" + 0.003*"θρηνουσαν" + 0.003*"εμφερεις" + 0.003*"νεστοριος" + 0.003*"διερχομενου" + 0.003*"βιαι" + 0.003*"παρισταμενον" + 0.002*"μαθωσιν"
INFO : topic #41 (0.020): 0.135*"πεντεκαιδεκατον" + 0.013*"ακριβεστερος" + 0.012*"ακρωτηριον" + 0.010*"εαρινης" + 0.010*"ξʹ" + 0.010*"παρερχομενοι" + 0.010*"σκαμανδρον" + 0.010*"ασματων" + 0.010*"ρʹ" + 0.009*"χηρα"
INFO : topic #42 (0.020): 0.004*"ιλλυρικον" + 0.002*"κατεχουσιν" + 0.002*"γαιος" + 0.002*"κοσμεισθαι" + 0.002*"συναρμοσας" + 0.002*"ληιζομενος" + 0.002*"ημφιεστο" + 0.002*"αφορων" + 0.002*"αποδωσω" + 0.002*"πειν"
INFO : topic #43 (0.020): 0.187*"στοχαστικως" + 0.072*"νεται" + 0.057*"αφιεσθαι" + 0.046*"πολιες" + 0.039*"επιμενη" + 0.017*"καταπετασματος" + 0.

[(0,
  '0.024*"μο" + 0.014*"μοιρων" + 0.011*"ιβ" + 0.011*"ζωας" + 0.009*"ιδ" + 0.008*"γαληνου" + 0.008*"στομαχου" + 0.007*"ιγ" + 0.007*"ισαριθμων" + 0.006*"εμφαινομενα"'),
 (1,
  '0.002*"επιτασεις" + 0.002*"φευγοντος" + 0.002*"δημοσθενους" + 0.002*"ψηφισμα" + 0.002*"αισχινης" + 0.002*"αντιτιθεις" + 0.002*"σαφει" + 0.002*"αδικημα" + 0.002*"συντελουντα" + 0.002*"κατεχοντες"'),
 (2,
  '0.004*"αλμυρον" + 0.004*"αρισταρχος" + 0.003*"ψιλην" + 0.002*"αιακιδην" + 0.002*"στρατιην" + 0.002*"ολεσθαι" + 0.002*"γελας" + 0.002*"σοφιης" + 0.002*"θαλιας" + 0.002*"επικαλουμαι"'),
 (3,
  '0.010*"ευειδεις" + 0.007*"κελυφος" + 0.006*"γενικη" + 0.006*"κογχυλιων" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"'),
 (4,
  '0.014*"ηρακλειας" + 0.006*"εξεκαυσε" + 0.006*"φονευσαντα" + 0.005*"μηστωρ" + 0.003*"ωντινων" + 0.003*"κρατητα" + 0.002*"κρατης" + 0.002*"κυπριοι" + 0.002*"νευοντας" + 0.002*"οπισθε"'),
 (5,
  '0.003*"αναληψιν" + 0.00

In [39]:
# cache the transformed corpora to disk, for use in later notebooks
path_lda = os.path.join(user_dir, 'gensim_tlg_lda.mm')
%time gensim.corpora.MmCorpus.serialize(path_lda, lda_model[mm_corpus])

INFO : storing corpus in Matrix Market format to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm
INFO : saving sparse matrix to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm
INFO : PROGRESS: saving document #0
INFO : PROGRESS: saving document #1000
INFO : saved 1484x50 matrix, density=10.020% (7435/74200)
INFO : saving MmCorpus index to /home/kyle/cltk_data/user_data/gensim_tlg_lda.mm.index


CPU times: user 47.4 s, sys: 1min 32s, total: 2min 19s
Wall time: 18.4 s


In [40]:
# transform text into the bag-of-words space
bow_vector = id2word_tlg.doc2bow(tokenize(doc))
print([(id2word_tlg[id], count) for id, count in bow_vector])

[('ποιητικης', 1), ('μεθοδου', 1)]


In [41]:
# transform into LDA space
lda_vector = lda_model[bow_vector]
print(lda_vector)

[(22, 0.67333333333333334)]


In [42]:
# print the document's single most prominent LDA topic
print(lda_model.print_topic(max(lda_vector, key=lambda item: item[1])[0]))

0.002*"εκαυθη" + 0.002*"γαυριων" + 0.002*"ρητορικη" + 0.001*"επισκεψις" + 0.001*"συντελειται" + 0.001*"ναρθηκας" + 0.001*"κλεανθης" + 0.001*"συμβουλευτικον" + 0.001*"εδωκαμεν" + 0.001*"συνθηκη"


In [43]:
path_lda = os.path.join(user_dir, 'gensim_tlg_lda.model')

# store all trained models to disk
lda_model.save(path_lda)

INFO : saving LdaState object under /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.state, separately None
INFO : saved /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.state
INFO : saving LdaMulticore object under /home/kyle/cltk_data/user_data/gensim_tlg_lda.model, separately ['expElogbeta', 'sstats']
INFO : storing np array 'expElogbeta' to /home/kyle/cltk_data/user_data/gensim_tlg_lda.model.expElogbeta.npy
INFO : not storing attribute id2word
INFO : not storing attribute state
INFO : not storing attribute dispatcher
INFO : saved /home/kyle/cltk_data/user_data/gensim_tlg_lda.model


# Evaluation

# Word intrusion

> For each trained topic, they take its first ten words, then substitute one of them with another, randomly chosen word (intruder!) and see whether a human can reliably tell which one it was. If so, the trained topic is topically coherent (good); if not, the topic has no discernible theme (bad)

In [44]:
# select top 50 words for each of the 20 LDA topics
top_words = [[word for word, _ in lda_model.show_topic(topicno, topn=50)] for topicno in range(lda_model.num_topics)]
print(top_words)

[['μο', 'μοιρων', 'ιβ', 'ζωας', 'ιδ', 'γαληνου', 'στομαχου', 'ιγ', 'ισαριθμων', 'εμφαινομενα', 'συρακουσιους', 'τραπεζουντα', 'αγραφα', 'ειδαρ', 'προεχοντων', 'θινα', 'απτομεθα', 'ποιμαινοντες', 'εσθησι', 'ελαχιστω', 'σισυφος', 'νο', 'κγ', 'δυμη', 'σικυωνιας', 'καθευδει', 'αρχινος', 'πασασθαι', 'αφηρησθω', 'αρχι', 'καλυμμα', 'πολιτευομενην', 'αλεξανδρειας', 'υπεροχη', 'υποθεμενος', 'ελκομενους', 'πολον', 'στιβαδας', 'ευσεβη', 'αχθησονται', 'υπακουοντα', 'εσβεννυτο', 'μεροης', 'αντολαι', 'συντελουμενων', 'οικησαντας', 'υποτιθεσθαι', 'αξ', 'λϛʹ', 'παρηχθη'], ['επιτασεις', 'φευγοντος', 'δημοσθενους', 'ψηφισμα', 'αισχινης', 'αντιτιθεις', 'σαφει', 'αδικημα', 'συντελουντα', 'κατεχοντες', 'τελεωτερας', 'αντιθεσις', 'ρητορικη', 'θηβαιων', 'υπηρξε', 'αισχινου', 'προβολη', 'ρητορος', 'γερως', 'φιλιππω', 'οποιονουν', 'θηβαιους', 'μειδιου', 'συμφωνη', 'δημοσθενει', 'συμβουλευων', 'ναρθηκας', 'οριστου', 'δημοσθενην', 'κρινομενον', 'λυσεων', 'ασεβη', 'μοιχον', 'ψευδων', 'εκβασεως', 'τριηρεις', 'ισοκ

In [45]:
# get all top 50 words in all 20 topics, as one large set
all_words = set(itertools.chain.from_iterable(top_words))

print("Can you spot the misplaced word in each topic?")

# for each topic, replace a word at a different index, to make it more interesting
replace_index = np.random.randint(0, 10, lda_model.num_topics)

replacements = []
for topicno, words in enumerate(top_words):
    other_words = all_words.difference(words)
    replacement = np.random.choice(list(other_words))
    replacements.append((words[replace_index[topicno]], replacement))
    words[replace_index[topicno]] = replacement
    print("%i: %s" % (topicno, ' '.join(words[:10])))

Can you spot the misplaced word in each topic?
0: μο μοιρων ιβ ζωας ιδ γαληνου στομαχου ιγ ισαριθμων κομψως
1: επιτασεις φευγοντος γαυριων ψηφισμα αισχινης αντιτιθεις σαφει αδικημα συντελουντα κατεχοντες
2: αλμυρον σπευδε ψιλην αιακιδην στρατιην ολεσθαι γελας σοφιης θαλιας επικαλουμαι
3: ευειδεις κελυφος γενικη κογχυλιων κακο διισχυριζοντο ανδρομεδας δυναστευων νικωσης επαγομενην
4: ηρακλειας εξεκαυσε φονευσαντα ανωνυμοι ωντινων κρατητα κρατης κυπριοι νευοντας οπισθε
5: αναληψιν ρυθμου ροβοαμ ευτελεις απατηθεις προστεταγμενον στον εννοω ερωτωμενοι γινωσκονται
6: διδαγματα δαμων δαμωνος ισχυε απολειπομενος αρχαιης μεμφομενη βητα διαλεξεσι μαθησομεθα
7: ημαρ ταπητες γενικη νεφελαις χρονικον κατεστρεψεν φυλαξι εγνωριζετο θουκυδιδη υπεριδης
8: λεπτου μακροτερος μο ινδικης ωνομαζε ελαια ατλαντιδος προσθεντος κτησιας τειχισαι
9: καμνεις αυτιχ αισθητικη βδελυττεται ακτιον νοσερον κατεπληξε ειρξαι δραχμας τιθεμενης
10: πηδαν ιλλυρικον κολπων πελοποννησον κοσμεισθαι οχυρωματι τομον κατεχουσιν ο

In [46]:
print("Actual replacements were:")
print(list(enumerate(replacements)))

Actual replacements were:
[(0, ('εμφαινομενα', 'κομψως')), (1, ('δημοσθενους', 'γαυριων')), (2, ('αρισταρχος', 'σπευδε')), (3, ('σπληνος', 'κακο')), (4, ('μηστωρ', 'ανωνυμοι')), (5, ('εγνωριζετο', 'απατηθεις')), (6, ('πληρουντες', 'βητα')), (7, ('παιονιας', 'ημαρ')), (8, ('φιλιου', 'μο')), (9, ('βραχυτατης', 'καμνεις')), (10, ('επανερχομενος', 'πελοποννησον')), (11, ('ζοφω', 'εσθλων')), (12, ('εθελησας', 'κοινωνησειν')), (13, ('πεποιθοτες', 'ορφανους')), (14, ('επεσχον', 'λωστε')), (15, ('θαυμασται', 'κινουνται')), (16, ('ρυθμοις', 'μοιραις')), (17, ('ικοντο', 'ζωιον')), (18, ('βητα', 'λαμπραι')), (19, ('λευκοτητα', 'συμβεβηκοτος')), (20, ('υποκειμενος', 'ανεχονται')), (21, ('εμακαριζον', 'ηρωικα')), (22, ('εδωκαμεν', 'ξεινων')), (23, ('ευκαιριαν', 'πδ')), (24, ('σχοινων', 'γεννωμενος')), (25, ('εξηρημενης', 'κειμενω')), (26, ('πυργων', 'ροιας')), (27, ('καταβαινον', 'αυλως')), (28, ('υπακουοντα', 'δικαι')), (29, ('εκαταιος', 'συντιθεναι')), (30, ('οριστεον', 'ερπει')), (31, ('ερηται',

## Split doc

> We'll split each document into two parts, and check that 1) topics of the first half are similar to topics of the second 2) halves of different documents are mostly dissimilar

In [47]:
# evaluate on 1k documents **not** used in LDA training
tlg_preprocessed = os.path.expanduser('~/cltk_data/greek/text/tlg/plaintext/')
doc_stream = (tokens for _, tokens in iter_tlg(tlg_preprocessed))  # generator
test_docs = list(itertools.islice(doc_stream, 100, 200))  # ['πανυ', 'καλως', ...], [...], ...]

In [48]:
def intra_inter(model, test_docs, num_pairs=10000):
    # split each test document into two halves and compute topics for each half
    part1 = [model[id2word_tlg.doc2bow(tokens[: len(tokens) // 2])] for tokens in test_docs]
    part2 = [model[id2word_tlg.doc2bow(tokens[len(tokens) // 2 :])] for tokens in test_docs]
    
    # print computed similarities (uses cossim)
    print("average cosine similarity between corresponding parts (higher is better):")
    print(np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)]))

    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    print("average cosine similarity between {} random parts (lower is better):".format(num_pairs))    
    print(np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs]))

In [49]:
print("LDA results:")
intra_inter(lda_model, test_docs)

LDA results:
average cosine similarity between corresponding parts (higher is better):
0.750338842199
average cosine similarity between 10000 random parts (lower is better):
0.433126791694


# Transform all docs

In [50]:
for title, tokens in iter_tlg(tlg_preprocessed):
    #print(title, tokens[:10])  # print the article title and its first ten tokens
    print(title)
    print(lda_model[id2word_tlg.doc2bow(tokens)])
    print('')

TLG2346.TXT
[(14, 0.39389740707735388), (24, 0.21067455806178731), (29, 0.1136918037101812), (33, 0.17409762749558913), (40, 0.095309836531801279)]

TLG1389.TXT
[(1, 0.26942331363740823), (8, 0.015139339272878569), (14, 0.01973173879480002), (16, 0.089583904671040562), (24, 0.038989333443285451), (29, 0.12612947608578889), (33, 0.17598846769899817), (43, 0.2649070781529877)]

TLG0404.TXT
[(24, 0.13841483006409908), (31, 0.069124241336902051), (33, 0.76205471720457707), (44, 0.019940601431003298)]

TLG0235.TXT
[(8, 0.037968259054561616), (16, 0.18147176836703305), (21, 0.01545021771768587), (43, 0.061998877321973234), (44, 0.39413830205480471), (48, 0.2960313990133534)]

TLG0535.TXT
[(1, 0.45536139471199766), (2, 0.018797458921507026), (14, 0.095276860187092), (24, 0.30209088967983838), (33, 0.080068597297550878), (47, 0.020818225064305491)]

TLG0507.TXT
[(39, 0.97930838894220751), (44, 0.011719648440970329)]

TLG1816.TXT
[(10, 0.020210517185290437), (16, 0.031779850265826096), (17, 0.4

TLG3070.TXT
[(2, 0.011238929416186804), (5, 0.058678063663210392), (8, 0.01874374857278896), (9, 0.019893755165342857), (14, 0.105431140055585), (22, 0.019875942495514894), (24, 0.14942021334885897), (27, 0.010278129048642766), (38, 0.074411937666920197), (44, 0.018549755497763405), (46, 0.012100053827103042), (47, 0.43223233753932988)]

TLG1426.TXT
[(1, 0.18771139346164889), (5, 0.044499867349516949), (8, 0.037794828084757079), (11, 0.010359989551775509), (12, 0.049897588077488751), (14, 0.17377642530681947), (22, 0.022368234945693524), (24, 0.12943069742963384), (31, 0.18554864412178637), (40, 0.063030594068426454), (44, 0.027077761350131468), (46, 0.065401935435994291)]

TLG1641.TXT
[(5, 0.35524257201396847), (8, 0.010423069027581014), (9, 0.043430657873092235), (14, 0.11387843939602446), (22, 0.10858343140200308), (24, 0.15825087792073758), (27, 0.056081666831608472), (37, 0.063301117724056347), (38, 0.060655940116705033), (46, 0.022758200262082279)]

TLG5040.TXT
[(1, 0.02997200137

TLG3045.TXT
[(5, 0.10034237870964828), (8, 0.035410094239831415), (12, 0.010694490484758562), (14, 0.14473834226663421), (22, 0.035516257095493636), (23, 0.019430566578952813), (24, 0.11966486884312534), (27, 0.016571943393430461), (29, 0.042867618464515825), (33, 0.020595280449640395), (37, 0.020703163827741611), (38, 0.2127538118394203), (44, 0.02086517182206923), (46, 0.032050672089183614), (47, 0.09124409549738742)]

TLG0632.TXT
[(1, 0.014130143426407343), (5, 0.052620346119288328), (8, 0.035960882952038392), (9, 0.019073464097406737), (12, 0.014068559658593828), (14, 0.086916130558796725), (16, 0.019983492291534632), (17, 0.021343116696380515), (20, 0.014391127433832671), (22, 0.079257571324369766), (24, 0.2154066757104463), (28, 0.015905927298272986), (29, 0.021980928040708798), (31, 0.010594551371093981), (33, 0.12379397013237944), (35, 0.014673254034304794), (37, 0.016620379692754182), (38, 0.021679046911669688), (44, 0.066435824211923081), (45, 0.013819562217004434), (46, 0.03

[(0, 0.010128282340030008), (1, 0.075496276292893821), (5, 0.045225741039651719), (8, 0.048241840951194694), (14, 0.1143001925041815), (22, 0.029246582204396989), (24, 0.18932328687805128), (29, 0.11447521920031263), (33, 0.2731133428833924), (46, 0.011251641638823423), (47, 0.023903369958303462)]

TLG4236.TXT
[(1, 0.17800270347342562), (14, 0.011519585145488556), (16, 0.012813753437479179), (22, 0.32401834389518108), (24, 0.14273783080857241), (28, 0.018785553399724497), (30, 0.085038758075142482), (31, 0.036739575687546017), (33, 0.051418960980841463), (37, 0.033303070340390838), (38, 0.039526633141425051), (46, 0.026920723764002043), (47, 0.017494492097684392)]

TLG1923.TXT
[(1, 0.11919680218193027), (5, 0.098840579819661045), (8, 0.08222548547078172), (9, 0.01914502784607796), (14, 0.071157600377433003), (16, 0.02193064190711648), (20, 0.011819622293464168), (22, 0.033492424824580883), (24, 0.30455936989495136), (33, 0.11253552206862423), (38, 0.042171742679576629), (42, 0.03433284

TLG4080.TXT
[(2, 0.023493113203618844), (5, 0.08066961314457767), (8, 0.032384051920073854), (9, 0.16213954353008678), (12, 0.015987874695790869), (14, 0.043368139598694094), (16, 0.043929528000721904), (17, 0.010880707731188384), (19, 0.023160678559875986), (22, 0.049891138082270053), (24, 0.21938324413718488), (25, 0.011541326966846004), (28, 0.02856433672422877), (29, 0.011998806945893999), (33, 0.065756936254298931), (37, 0.021822852602863946), (38, 0.015465747182400867), (44, 0.030251821300416468), (46, 0.022684103503956363), (47, 0.041080514908380175)]

TLG0064.TXT
[(8, 0.23968660306601924), (9, 0.058499182831260099), (14, 0.043828426203045841), (19, 0.02339145724137584), (22, 0.16062568455418275), (24, 0.19264328073413708), (29, 0.018983959687474516), (31, 0.045991052511224602), (35, 0.027354543290871253), (40, 0.054264901022988407), (47, 0.12891001333503255)]

TLG1320.TXT
[(1, 0.04446603607485268), (2, 0.012718494232913685), (5, 0.07942768536734221), (8, 0.019432688275271278), 

TLG0552.TXT
[(1, 0.044108652421864424), (5, 0.049306502596402214), (8, 0.094431527755082845), (9, 0.046065050992896206), (14, 0.011072049592804053), (16, 0.032362223401538617), (18, 0.238892385718524), (20, 0.11987491277413562), (22, 0.031653447788399923), (24, 0.063747691798412096), (31, 0.011642263899676694), (35, 0.010382175522590529), (37, 0.027756082277646258), (42, 0.011274196612783366), (44, 0.015156676222069742), (46, 0.064227076929371588), (48, 0.052936995241207922)]

TLG1463.TXT
[(5, 0.3026300228364226), (8, 0.015512390362428175), (12, 0.037399496067977135), (13, 0.010637882600686769), (14, 0.076929087959166434), (16, 0.026087734410814061), (17, 0.025158575941981461), (22, 0.02769284278796728), (24, 0.18402058948353148), (28, 0.020438041889646479), (33, 0.025884497065376343), (38, 0.068693533909173377), (44, 0.02215204698108619), (46, 0.029881481203797453), (47, 0.068786454022222027), (48, 0.021837625966549569)]

TLG5052.TXT
[(1, 0.030657921740826537), (2, 0.02197227886898371

TLG7000.TXT
[(2, 0.022517275505854941), (5, 0.06405775385325653), (8, 0.026700925441955046), (9, 0.031509896434797409), (12, 0.012615892894098778), (14, 0.021406041005034914), (16, 0.034653731725775391), (17, 0.13187719680533244), (22, 0.021781360669481765), (24, 0.17737229450270189), (29, 0.032313612669879206), (33, 0.06950228782758322), (37, 0.016798764422667277), (38, 0.013374072834825836), (44, 0.12825799335148849), (47, 0.019535066946692211), (48, 0.11865654923108822)]

TLG4391.TXT
[(5, 0.067138365144926854), (8, 0.069251376742259854), (17, 0.065042984564572998), (22, 0.075042727885035979), (24, 0.20016512228490457), (29, 0.33981426850644492), (30, 0.037177883090818514), (33, 0.11631461759797043), (43, 0.011294354161061429)]

TLG0610.TXT
[(1, 0.019286340212161492), (2, 0.059074721822575747), (5, 0.11056879242058337), (9, 0.037922619317012282), (14, 0.12714460729429372), (22, 0.081668881693812756), (24, 0.35905811455803188), (29, 0.040539726137842937), (31, 0.04786082545647416), (3

TLG9020.TXT
[(1, 0.094430864882579055), (2, 0.034033436920033175), (5, 0.075858568358040571), (8, 0.015174247890813555), (9, 0.049638824178870576), (14, 0.055273523275660083), (16, 0.041808420071971684), (22, 0.15760952693919769), (24, 0.17945067038154691), (29, 0.035155421716618541), (33, 0.043153762420852185), (37, 0.027939390486982948), (38, 0.018068846015442289), (42, 0.025388224938554294), (44, 0.035142591372255755), (46, 0.027763674023561391), (47, 0.012139235989810373)]

TLG2702.TXT
[(1, 0.019446824484305118), (2, 0.024079512868643817), (5, 0.13395397142862803), (8, 0.018246170118596944), (9, 0.042953754804762982), (12, 0.023750131624252234), (14, 0.057723327157297215), (16, 0.027671107161158826), (22, 0.096563516437084509), (24, 0.23177227821426488), (33, 0.026949853461717105), (37, 0.047202964241882417), (38, 0.018743361030844612), (42, 0.028260990927018677), (44, 0.028397096619383166), (46, 0.051000316548558092), (47, 0.03935622181584994)]

TLG1139.TXT
[(1, 0.0132540488822803

TLG1416.TXT
[(2, 0.078286482625159901), (5, 0.062360740199043259), (8, 0.044590685921488141), (9, 0.022719524520350105), (12, 0.017290548150739937), (14, 0.065308720971672007), (16, 0.083164131413442713), (17, 0.011065140936986064), (22, 0.061417315370870265), (24, 0.22060403501053327), (29, 0.065415540551690771), (33, 0.076080180066788683), (35, 0.025201849525296204), (37, 0.018615772113520574), (38, 0.025838120224765736), (44, 0.022947821557789863), (46, 0.013417290438573426), (47, 0.014260959955548026), (48, 0.010032414158375958)]

TLG1153.TXT
[(5, 0.082500813005544307), (8, 0.093228817003768283), (9, 0.019202698776978722), (12, 0.15913537412011583), (25, 0.019526410009582194), (26, 0.070569350355143737), (27, 0.047866317368585314), (28, 0.028904726417540265), (33, 0.11324204287091645), (42, 0.073546429105968078), (44, 0.035767058445249279), (46, 0.1446910417683567), (47, 0.10208207864698744)]

TLG4088.TXT
[(1, 0.03759062340415855), (2, 0.025074826441452657), (5, 0.1071727840112274)

TLG4024.TXT
[(2, 0.014917714074445117), (5, 0.080772435965192735), (8, 0.025649685926483177), (9, 0.02362022675585404), (12, 0.023599258128407605), (14, 0.16066166326701081), (16, 0.013255045550882508), (17, 0.014329079744679594), (22, 0.03084984275929898), (24, 0.32380324839477242), (31, 0.018339250266961746), (33, 0.019623000465727344), (37, 0.01708507575309889), (38, 0.023043978796449576), (40, 0.036300065901248409), (42, 0.02708901401906607), (44, 0.016791542620172847), (46, 0.010966640498289075), (47, 0.069530277941312738), (48, 0.012720628796781311)]

TLG0327.TXT
[(1, 0.073663953498358983), (12, 0.085100458222822847), (17, 0.12625086449699238), (19, 0.017241852789090444), (24, 0.34184900169852694), (35, 0.094569575810345308), (37, 0.029370460449000269), (42, 0.059584791879153748), (44, 0.15492223264507032)]

TLG0609.TXT
[(1, 0.013724415139097113), (2, 0.08923096267212112), (3, 0.11244001839722793), (5, 0.078346241156895841), (9, 0.012228384765412038), (12, 0.040883924441936587), 

TLG2046.TXT
[(2, 0.040968394907360907), (5, 0.06652738918217653), (8, 0.030839916514825049), (9, 0.022739510516460764), (13, 0.012816088005949402), (14, 0.012048539760200481), (16, 0.040091542064637158), (17, 0.14164915021334984), (22, 0.010926734894664718), (24, 0.098398941730560935), (29, 0.023646566186889775), (33, 0.034756005820756851), (40, 0.011037539077256334), (44, 0.14282999642921709), (48, 0.2631312766509048)]

TLG0527.TXT
[(5, 0.25654422691597001), (8, 0.014750790885711613), (9, 0.022364616677825942), (12, 0.032147347996248479), (14, 0.11913859900676527), (16, 0.042991567426283406), (17, 0.018155298608118254), (22, 0.047048528756113546), (24, 0.18540013405137407), (33, 0.021788855007571037), (38, 0.057074587707266831), (44, 0.021557789332287997), (46, 0.041839808774636902), (47, 0.055784574106043712)]

TLG2966.TXT
[(2, 0.10530909819825672), (5, 0.048445277577609899), (12, 0.090154484186220546), (14, 0.087806338893344271), (22, 0.039109838450900679), (24, 0.46871500243052228)

[(5, 0.11711630315154614), (8, 0.011077701104051764), (9, 0.022725829690964088), (10, 0.034742260988352737), (12, 0.038801214716184765), (14, 0.10286231617905489), (16, 0.03288070944311041), (17, 0.017435235685052491), (22, 0.03133567261663818), (24, 0.22506970418506919), (29, 0.03446520973990453), (31, 0.027042337855664011), (33, 0.038600859432575074), (37, 0.023623444277102171), (38, 0.032526423859028188), (44, 0.09817029982855241), (46, 0.028005315534872656), (47, 0.048665087935672695), (48, 0.019051319194853363)]

TLG0718.TXT
[(0, 0.012702903527550756), (2, 0.01281899221917448), (5, 0.10188532480915224), (8, 0.019022799221642642), (9, 0.23106954431151805), (12, 0.033007327858348277), (14, 0.035953661833162724), (16, 0.033890778218408378), (19, 0.020776470572688496), (22, 0.059780353420261272), (24, 0.20994537781341724), (33, 0.056446223650112946), (37, 0.031323885112600723), (38, 0.014497368975727763), (44, 0.037699744714387327), (47, 0.011990459832029329)]

TLG0752.TXT
[(2, 0.0243

TLG0616.TXT
[(1, 0.01574786675560012), (5, 0.054798092207687346), (8, 0.027413139827458591), (9, 0.010623504405338711), (12, 0.024884677346667661), (14, 0.3334136868571439), (16, 0.025390590106538886), (22, 0.020284635872781641), (24, 0.27416179054747247), (29, 0.025046032219452612), (33, 0.03488006419038276), (37, 0.010626561843802181), (38, 0.02613789666753882), (42, 0.011960570783812633), (44, 0.015844311595577275), (47, 0.030314805531954533)]

TLG0648.TXT
[(2, 0.024723361663405018), (5, 0.10853886881902196), (8, 0.028478291085749233), (9, 0.029114558164376404), (12, 0.030030801080842943), (14, 0.18249617526210812), (22, 0.067469038291171193), (24, 0.25159478104478128), (29, 0.019006267226199684), (31, 0.021867076872680604), (33, 0.014648327411429542), (40, 0.010460916313953465), (42, 0.049800369536183116), (44, 0.033170679182068369), (46, 0.022281495141925491), (47, 0.033622753866793607), (48, 0.011084028467733565)]

TLG2237.TXT
[(5, 0.13836110286212111), (10, 0.025612586875646617)

TLG3018.TXT
[(2, 0.01552954725826906), (5, 0.10428296960103209), (8, 0.01524554424159803), (9, 0.019248104010146715), (12, 0.018525810730826209), (14, 0.12617213792816664), (16, 0.016704436323178765), (22, 0.031720287721483505), (24, 0.20527323968989364), (27, 0.011731971728986894), (29, 0.012971350141865871), (33, 0.012764769978182756), (37, 0.011748891867651675), (38, 0.057320277548827972), (42, 0.027812794038051342), (44, 0.015952645217911648), (46, 0.023086136289251331), (47, 0.2178566149902362)]

TLG2391.TXT
[(14, 0.10647093821079388), (16, 0.050511864743459985), (29, 0.30913437151445611), (31, 0.037417717123257997), (33, 0.268791256576285), (38, 0.16329561399434447), (46, 0.046461571170735093)]

TLG1276.TXT
[(2, 0.10777981083474583), (5, 0.034146870599638446), (9, 0.023787365935936344), (14, 0.14332251994293968), (17, 0.028718325269354444), (24, 0.21330595031845401), (27, 0.01057725435134497), (29, 0.17239044225003441), (33, 0.19844367005436395), (44, 0.014350598502329493), (47, 

TLG1194.TXT
[(2, 0.15009768773060411), (5, 0.078696731239785142), (8, 0.020942366223828821), (9, 0.020887579336607575), (12, 0.010358437720807901), (14, 0.029112645042051192), (16, 0.035225153653283561), (17, 0.073006662043558337), (22, 0.051788480793612583), (24, 0.17181184537774166), (29, 0.042551246362173285), (33, 0.038723330058524789), (37, 0.014998175888858566), (44, 0.071779286879772156), (48, 0.13036932437505885)]

TLG4302.TXT
[(1, 0.16866483500263924), (2, 0.012249317707419346), (5, 0.079385213194884494), (8, 0.01879185120213794), (14, 0.062805089164940542), (16, 0.13889118664258807), (22, 0.025241484484930105), (24, 0.25956478461583543), (29, 0.062804661689282143), (33, 0.095194610769694071), (38, 0.018845517128814694), (47, 0.021767552744869904)]

TLG0703.TXT
[(2, 0.069765252432073446), (8, 0.02441571767757153), (9, 0.18854264543345747), (13, 0.07531804459398414), (17, 0.14130948730756379), (24, 0.33890458997151346), (37, 0.051334498722429525), (45, 0.019793231028330978), (4

TLG4034.TXT
[(1, 0.019906541392030724), (2, 0.041719831671027746), (5, 0.10253446233916258), (8, 0.012714954735894377), (9, 0.070884227497450006), (12, 0.035808617702108157), (14, 0.040895739098835017), (16, 0.02745989047394665), (22, 0.12193964832456884), (24, 0.19475396407996373), (28, 0.012833984029766281), (29, 0.012177661049057164), (30, 0.010912224113382874), (31, 0.037132899469880812), (33, 0.024760931176000277), (37, 0.087186056500415984), (42, 0.016294530733709386), (44, 0.018165590822158569), (46, 0.038412825664766219), (47, 0.020626141587554204)]

TLG0306.TXT
[(0, 0.070655886397276532), (2, 0.28232482780056795), (13, 0.19654796358980259), (14, 0.088550401225767614), (17, 0.11286895561618565), (24, 0.12942772814283374), (37, 0.058195665798995766)]

TLG5046.TXT
[(1, 0.082067503762209476), (5, 0.16045022543040136), (14, 0.072174287019148312), (16, 0.058448654385551059), (20, 0.01261711451556892), (22, 0.13667966125741393), (24, 0.35236633505156284), (27, 0.03178775451612275), (

TLG5032.TXT
[(2, 0.043639020954868282), (3, 0.015407254306574306), (5, 0.10541714576012005), (8, 0.023202579207874732), (9, 0.040394032933054581), (12, 0.014129402765423769), (14, 0.044715475124514271), (16, 0.075751387064054904), (17, 0.047635622914441435), (22, 0.045741661625662108), (24, 0.21853643115041185), (25, 0.010682138167212418), (29, 0.025516062173921935), (33, 0.051281888517236883), (37, 0.026460622519447845), (42, 0.011672975909653348), (44, 0.071814572453355457), (46, 0.01923055461110371), (47, 0.017216422074739106), (48, 0.04664580697102249)]

TLG0544.TXT
[(1, 0.018932955998003605), (2, 0.031923493804232761), (3, 0.022993249708080008), (5, 0.092522098633030389), (8, 0.019803710148631734), (9, 0.039889952545557987), (12, 0.032160438143852201), (14, 0.056559175894142159), (16, 0.017578428350582157), (17, 0.014364478310170941), (20, 0.012144566751657012), (22, 0.18815155910074322), (24, 0.16271024134014392), (27, 0.011693958866420778), (31, 0.017831001467449491), (33, 0.026

TLG3014.TXT
[(2, 0.028701433067639486), (5, 0.10129438999986289), (8, 0.041032192075865198), (9, 0.034462912109307374), (12, 0.02286541860645374), (14, 0.033129062503517062), (16, 0.020987757418440483), (17, 0.032579534564530221), (20, 0.067679091989809384), (22, 0.059890347847046513), (24, 0.19059600981870969), (28, 0.13116436378383547), (33, 0.019639975481804836), (37, 0.03046491588857346), (38, 0.015959960979156605), (42, 0.023114515937325882), (44, 0.035045896520277127), (46, 0.023357372155642722), (47, 0.031750138751888188), (48, 0.011699484121643362)]

TLG1230.TXT
[(4, 0.049268292682926824), (10, 0.049462121197686504), (11, 0.05094893388553292), (12, 0.12708510476978238), (18, 0.065207231974004395), (30, 0.059553704162982182), (33, 0.13183561608901123), (38, 0.13446522763144855), (40, 0.062027619079037095), (46, 0.17796580443437088), (48, 0.073155953849314739)]

TLG2934.TXT
[(1, 0.019354695008077256), (2, 0.019562090522790662), (5, 0.21172929333315019), (8, 0.011634919354763257),

TLG2770.TXT
[(5, 0.19989985794786028), (8, 0.010119873972628531), (9, 0.034409845387427807), (12, 0.028824071164852364), (14, 0.10775363437321006), (16, 0.024157218304528539), (22, 0.038315411047911883), (24, 0.25368990943855435), (27, 0.02737610389561005), (33, 0.020909225347192972), (37, 0.029514660396817195), (38, 0.017864633107188069), (40, 0.013199966418985585), (42, 0.012469387063939597), (44, 0.011736828630905987), (46, 0.021934031484970775), (47, 0.1173878513385329)]

TLG0045.TXT
[(5, 0.12149855345998663), (10, 0.080226056402861706), (13, 0.10218540724322286), (14, 0.30711691611736697), (24, 0.28536315358172004), (31, 0.042062948367140529), (47, 0.053032113342553269)]

TLG4030.TXT
[(1, 0.016851020834620856), (2, 0.037597111165588955), (5, 0.14428069931037812), (8, 0.017985532861071893), (9, 0.069141355942791999), (14, 0.037163694011140461), (16, 0.032147921325928926), (22, 0.14785774627066881), (24, 0.15821886149945899), (28, 0.018185621465772909), (33, 0.018584140742811811), (

TLG3168.TXT
[(2, 0.051359015743273201), (5, 0.11731057315455659), (8, 0.012335813904913987), (9, 0.034084696504487116), (14, 0.07039193746586872), (16, 0.015125738488435909), (17, 0.012351679832284922), (22, 0.034982917503146624), (24, 0.15238856591655603), (27, 0.02219106717654639), (28, 0.010837166506364369), (30, 0.016272164266991823), (31, 0.010012537735640051), (33, 0.012574246679826404), (37, 0.011207432532893394), (38, 0.019023287236301131), (42, 0.016940879963047627), (44, 0.05048755884622063), (45, 0.054932057032671053), (46, 0.033331294855210053), (47, 0.22865722506894012)]

TLG1105.TXT
[(5, 0.10347338119688343), (14, 0.069856743417523182), (17, 0.088468911055226029), (20, 0.059020899975290524), (24, 0.24255545073191681), (28, 0.15270736765606546), (31, 0.090382154234034687), (37, 0.11182123313461854), (44, 0.040717320194594994), (48, 0.033978994544197762)]

TLG0736.TXT
[(2, 0.02166501893032767), (5, 0.15501576379912652), (8, 0.011860245020575954), (9, 0.24460204970925914), (

TLG1419.TXT
[(1, 0.012375634723039078), (3, 0.010355819706293682), (5, 0.22313274522727375), (8, 0.010264742617812673), (9, 0.042638432222242374), (11, 0.01075775814230406), (12, 0.073332000513054552), (14, 0.092241262835356275), (16, 0.023825437838981042), (17, 0.047429513024982314), (22, 0.058909105879130182), (24, 0.22302735825676906), (28, 0.018628625824573437), (33, 0.025649554633295626), (37, 0.015934373001060053), (38, 0.015432404519026348), (44, 0.029103166955563153), (47, 0.045813157460970998)]

TLG0301.TXT
[(5, 0.38234019288684928), (42, 0.091384687754217944), (44, 0.45913226221607678)]

TLG1181.TXT
[(0, 0.058405360519849722), (1, 0.1483352698567281), (5, 0.022525686099889204), (8, 0.04041743587469511), (9, 0.091356383906706723), (13, 0.058992248371242562), (16, 0.018120404791560137), (18, 0.19224001700375459), (20, 0.18431029588475648), (24, 0.070461199369321381), (28, 0.013474706959984916), (33, 0.046018704339324472), (35, 0.024530864713621099), (46, 0.013307984194307731)]


TLG4149.TXT
[(1, 0.030106797970469026), (2, 0.071986502973613212), (3, 0.16056861044285234), (5, 0.086073660380775449), (8, 0.010928621300559301), (9, 0.026173780312628674), (12, 0.024966218804299101), (14, 0.013960880846690681), (16, 0.032572901162474382), (22, 0.11252060845261355), (24, 0.16188811585281912), (29, 0.025874151293250602), (33, 0.014549714900497402), (37, 0.032445248317545514), (38, 0.0441206687602178), (39, 0.012385745106667819), (42, 0.015496462811218132), (44, 0.030086233240840834), (46, 0.033647415077470079)]

TLG5022.TXT
[(0, 0.045440092932715628), (1, 0.10737124876177219), (3, 0.022425612077262774), (5, 0.089014735553388319), (8, 0.054481023062125197), (9, 0.034074006796286929), (13, 0.017271543526753235), (14, 0.042155135377490653), (16, 0.011683568526108997), (18, 0.15561998658960044), (20, 0.061337913167792593), (22, 0.049049600183438007), (24, 0.098833855299682832), (30, 0.015860928901326905), (31, 0.013512740163253607), (33, 0.01026820637293774), (37, 0.058077

TLG0082.TXT
[(1, 0.030420305065444558), (2, 0.060832960432546379), (3, 0.085426597983903826), (5, 0.11888008134400996), (9, 0.024993034797240878), (12, 0.019608359974692617), (14, 0.018562902051329676), (16, 0.042188448705491155), (17, 0.020082409754422888), (22, 0.14096976165314173), (24, 0.14741457767276844), (27, 0.010569759214823046), (29, 0.015247374612198107), (30, 0.017139471223443805), (33, 0.016594959137425164), (37, 0.050939324029606783), (38, 0.01770367732458681), (42, 0.013881660778692875), (44, 0.023181078616913375), (46, 0.048908727844522666), (47, 0.011713935245980776), (48, 0.021496855427376466)]

TLG0495.TXT
[(0, 0.075595192196475688), (1, 0.034617158671200993), (8, 0.10110365058651076), (9, 0.058875760474063572), (12, 0.15967795935901261), (14, 0.12450551925641377), (16, 0.031019321455316808), (17, 0.08490799008468472), (23, 0.12610710913449721), (24, 0.1425300473127154), (46, 0.050660291469109532)]

TLG2444.TXT
[(13, 0.067653419941430926), (14, 0.22144809519624201), 

TLG9010.TXT
[(1, 0.028218086553766009), (2, 0.023963486447528038), (5, 0.085255408435621197), (8, 0.022304078670160654), (9, 0.026233122509831897), (12, 0.011733037510283699), (14, 0.069174141563994887), (16, 0.074162980233644843), (17, 0.015033726353138712), (22, 0.048402200225050797), (24, 0.25881127923997355), (29, 0.038931056739130533), (33, 0.097551862203701531), (37, 0.020214235278776825), (38, 0.025824529465859097), (42, 0.01106752665683642), (44, 0.026332191341840137), (46, 0.015251615124278991), (47, 0.039441500135827853), (48, 0.015235358640553741)]

TLG0273.TXT
[(9, 0.074686074679527079), (17, 0.03828686373639912), (19, 0.02488714829932507), (20, 0.030779338878901105), (23, 0.19933309820418785), (24, 0.32642990450974074), (29, 0.12911593712122343), (33, 0.063926458674581202), (37, 0.042282125676743221), (44, 0.06063449600250434)]

TLG2587.TXT
[(19, 0.1623841001271154), (29, 0.17298683703403628), (31, 0.39311490750696854), (35, 0.098913883065189431), (39, 0.097600272266690893

TLG0548.TXT
[(2, 0.05022823734186857), (5, 0.083585889179699002), (8, 0.044146312971013929), (9, 0.017158529296545474), (12, 0.010378823521033954), (14, 0.11030871022043591), (16, 0.01621549999697644), (17, 0.024573916587696198), (22, 0.027024044769586777), (24, 0.24736016334095348), (29, 0.17136006474918092), (33, 0.037737483573522045), (37, 0.020009983607102964), (38, 0.018223232118805416), (44, 0.038068004436121683), (46, 0.014997656173472582), (47, 0.027862203243862026), (48, 0.011280577629242478)]

TLG2577.TXT
[(0, 0.033350674465514457), (3, 0.063597632700942069), (5, 0.091067092490656712), (8, 0.036979545437005171), (9, 0.087720994192949597), (14, 0.023449156721024107), (16, 0.019569136240865259), (17, 0.015233203764054995), (18, 0.031788187489415275), (20, 0.027886885512492181), (22, 0.088592487136501086), (24, 0.11353208215742205), (27, 0.010814217910405573), (28, 0.013445491449066744), (33, 0.043328860966466549), (37, 0.063200347835004042), (44, 0.052117465929086657), (46, 0.1

TLG2043.TXT
[(1, 0.015765408872621626), (2, 0.018579413583488504), (5, 0.089578451044965174), (8, 0.01489850204980803), (9, 0.034030785760461119), (12, 0.020677371944055591), (14, 0.05262039569488465), (16, 0.026097646857868324), (17, 0.014337652237856893), (20, 0.04163154020727123), (22, 0.070742163475012873), (24, 0.15891257003980502), (28, 0.21009441644002461), (29, 0.01467938779089981), (31, 0.014079480269968241), (33, 0.015447551949603091), (37, 0.036736242269673723), (38, 0.018839676273991416), (44, 0.013705636180423073), (46, 0.024945029450923587), (47, 0.029754588226657823), (48, 0.018164077866071566)]

TLG2322.TXT
[(1, 0.052609344743324876), (8, 0.13988930833259167), (10, 0.060453160374185133), (24, 0.50785923689960533), (25, 0.05649988988736579), (46, 0.13637327028924365)]

TLG1992.TXT
[(5, 0.12327271127530523), (9, 0.051741284552205072), (14, 0.13199985777949744), (22, 0.16997549576404819), (24, 0.24310228537671352), (35, 0.09941492826477899), (38, 0.1191752237246238), (47, 

TLG2084.TXT
[(1, 0.012108322701057973), (2, 0.031032822852269892), (5, 0.16454199990409599), (8, 0.01765892413569508), (9, 0.042067594215043344), (12, 0.055275151072043831), (14, 0.051678755459556962), (16, 0.01386657836906471), (17, 0.013637877302917992), (22, 0.099040594107099339), (24, 0.24573779560191486), (37, 0.046697093269651983), (38, 0.021199129587533377), (42, 0.025270245419874375), (44, 0.039361787122837916), (46, 0.040927256476649007), (47, 0.026804822535656078)]

TLG2244.TXT
[(1, 0.083654612242517398), (9, 0.043877688324978306), (11, 0.052146489345318479), (14, 0.092149912220753361), (15, 0.1041940199097578), (19, 0.031630687471813558), (24, 0.23650407846021188), (33, 0.051170602994590003), (42, 0.18658698896723611), (47, 0.093084920062823567)]

TLG1670.TXT
[(1, 0.062572681626340665), (12, 0.049573150613229672), (14, 0.16326745601915732), (24, 0.33503784702808687), (27, 0.099108128786497537), (38, 0.1731864188436423), (47, 0.10071585554458289)]

TLG1304.TXT
[(1, 0.03111986

TLG5029.TXT
[(1, 0.016213374551626639), (2, 0.025069389013009646), (5, 0.076347790814294131), (8, 0.037988920787451697), (9, 0.022713469257678982), (14, 0.044650161744097749), (16, 0.051122283411190614), (17, 0.012063243403438274), (22, 0.060649971501753344), (24, 0.26929812749966003), (29, 0.046147341825150456), (31, 0.011277697814619451), (33, 0.10800568186210932), (37, 0.072178147182281133), (38, 0.012952045106144007), (42, 0.012454174632673603), (44, 0.023583835534726358), (46, 0.014792708153542304), (47, 0.02743472197984834), (48, 0.01193944172835818)]

TLG0011.TXT
[(2, 0.024756194497258806), (5, 0.067582220217386704), (8, 0.029560722217446191), (9, 0.02518322208524678), (12, 0.017316503813235858), (13, 0.012740236947165948), (14, 0.055586228975812384), (16, 0.031150781040381506), (17, 0.057637362028099534), (22, 0.022778300735990072), (24, 0.18811334134725688), (29, 0.031626708173532553), (33, 0.05829793224399208), (37, 0.029877820774438537), (42, 0.010657404897994731), (44, 0.19

TLG0017.TXT
[(1, 0.23065809033003051), (2, 0.017095422845724902), (5, 0.079670863773915343), (8, 0.01894497264567151), (9, 0.011588003262962126), (12, 0.029849432202051905), (14, 0.12363130067910671), (16, 0.015603842762276115), (22, 0.033598609738391717), (24, 0.26546867761315368), (31, 0.015313991730817381), (33, 0.045150648752674438), (38, 0.019745847363482064), (42, 0.011057864383687482), (44, 0.026095150229438192), (46, 0.013480819000894307), (47, 0.012144468018738466)]

TLG4324.TXT
[(2, 0.050295107386447983), (5, 0.088864529646397711), (9, 0.05701280049505307), (12, 0.12358397889330675), (19, 0.044136475602523415), (22, 0.17926614741259808), (24, 0.17297658578297279), (29, 0.051198569485962568), (33, 0.059105185836006061), (37, 0.04457562847629902), (38, 0.042061504250740157), (39, 0.012786714888344904), (41, 0.013949882514500387), (46, 0.047285426474242789)]

TLG1512.TXT
[(5, 0.12520387369253808), (8, 0.027398869633254264), (9, 0.05353419922545443), (17, 0.092965636467985363), (

TLG4089.TXT
[(1, 0.017985127649798541), (2, 0.017017964384315909), (5, 0.19565976407104518), (8, 0.014617721530956317), (9, 0.023460304481497628), (12, 0.033678079644253664), (13, 0.01815254122354177), (14, 0.091202224146473651), (16, 0.020465319551430793), (22, 0.06084148173358632), (24, 0.22343745123014727), (27, 0.013886957702810196), (33, 0.023390232198037028), (37, 0.025073998382285877), (38, 0.051861181748783673), (42, 0.021972597620931063), (44, 0.018197093354630645), (46, 0.025038254517518663), (47, 0.050320499815877842)]

TLG0690.TXT
[(2, 0.017358551745042347), (3, 0.017325446375787378), (5, 0.083582159419581262), (9, 0.19992876101490262), (12, 0.067686067820813933), (14, 0.04622545585828574), (16, 0.018700123527468703), (22, 0.055251325235939008), (24, 0.23381858012059087), (29, 0.055916623073522007), (33, 0.03840812088872838), (35, 0.010410383120605304), (37, 0.030006911191149215), (46, 0.033365549617394311), (47, 0.013789032794866729), (48, 0.021548682037587825)]

TLG1627.T

TLG3088.TXT
[(2, 0.026616876230331994), (5, 0.069596482336921811), (8, 0.017624333776906871), (9, 0.020361617256281286), (12, 0.055380481190842087), (14, 0.16321392017996375), (22, 0.032840657820345627), (24, 0.25584214675899836), (31, 0.012484327091275938), (33, 0.010499283390245879), (37, 0.015650892163871354), (38, 0.016354382789859248), (42, 0.04678318936068248), (44, 0.019723935924202237), (46, 0.013806953784006593), (47, 0.17463271406332329), (48, 0.011103599776177553)]

TLG0284.TXT
[(1, 0.099000976974848054), (2, 0.018963246800429403), (5, 0.080449084321803005), (8, 0.023142365279802055), (9, 0.018314532494004908), (12, 0.013202422780346676), (14, 0.12810085287901871), (16, 0.018020262392944721), (22, 0.06405358464665914), (24, 0.30276351803726897), (29, 0.019232865353148464), (31, 0.025446161309969448), (33, 0.031072426267570799), (37, 0.017953397269136678), (38, 0.02043157323574404), (42, 0.011918700265224062), (44, 0.028644222456914756), (46, 0.018270281163683271), (47, 0.017

[(3, 0.030965855528566628), (5, 0.061397427601117723), (8, 0.014973906672956313), (9, 0.072073866471572448), (12, 0.054626145752967675), (14, 0.057084259285417556), (16, 0.012753240051148786), (17, 0.016232058260904594), (20, 0.10146664079138458), (22, 0.055828532088957578), (24, 0.17666912425952128), (28, 0.16860072858584324), (29, 0.018149940708573106), (37, 0.055780856008238761), (38, 0.018923830347159203), (44, 0.011259530153984957), (46, 0.027659027361415334), (47, 0.019680131388263754), (48, 0.010063799703000482)]

TLG1158.TXT
[(2, 0.013534608565545935), (5, 0.19109959452077993), (9, 0.024313441775519137), (12, 0.065744754536724404), (14, 0.077838913318574565), (16, 0.030719303960826332), (22, 0.076923206683682674), (24, 0.2054296210005308), (33, 0.055211853365838587), (37, 0.021824311669675765), (42, 0.013632257873604564), (44, 0.057486096454693145), (47, 0.13816629396969379), (48, 0.016560692476482734)]

TLG1125.TXT
[(1, 0.10996673642570805), (5, 0.048784491200731528), (8, 0.03

TLG4097.TXT
[(2, 0.098031077866613137), (3, 0.025833608675070133), (5, 0.087297936480819391), (8, 0.019710274976520295), (9, 0.025456205059442575), (12, 0.01582201526709056), (14, 0.031436684222766659), (16, 0.13150521316218136), (17, 0.035372586158560232), (22, 0.069453158807154899), (24, 0.19629772721001829), (29, 0.035826464389751456), (33, 0.042505590097444527), (37, 0.032343737297427289), (44, 0.052105246217734914), (46, 0.018688102892430079), (47, 0.016899473792080247), (48, 0.026009846984918292)]

TLG1375.TXT
[(12, 0.48264800467532704), (23, 0.11375467176997696), (33, 0.3364544664118399)]

TLG1638.TXT
[(12, 0.082759437864883337), (24, 0.37437298425036031), (29, 0.3185974707686125), (33, 0.043824598674253971), (37, 0.054753846619696123), (38, 0.072840840953417113), (40, 0.03897985312684045)]

TLG4303.TXT
[(2, 0.036141488772601621), (5, 0.11899631389805151), (8, 0.022569016095016863), (9, 0.035668133275994079), (12, 0.025613707230538352), (14, 0.037373998759095653), (16, 0.0437697

TLG0656.TXT
[(2, 0.016175655569571842), (5, 0.10724036203707693), (8, 0.032882051354288247), (9, 0.17557116149640348), (12, 0.011061471915398071), (14, 0.033216003876092637), (16, 0.034346115875203968), (17, 0.021506059588455181), (19, 0.036931970334143659), (22, 0.067865892405364461), (24, 0.18607254491233244), (25, 0.018909851806031688), (29, 0.012888543115064394), (33, 0.092568187725229678), (37, 0.022683836556959918), (42, 0.010471409900704146), (44, 0.03439536122159538), (46, 0.020559693215715956), (47, 0.017031040469227661)]

TLG0440.TXT
[(5, 0.17670374814937359), (12, 0.12018634716135851), (17, 0.085353779730484802), (33, 0.42046005961513999), (37, 0.16396273201031047)]

TLG1397.TXT
[(2, 0.12649969493418936), (5, 0.074633509541018439), (9, 0.034988902611129309), (14, 0.087527237267038538), (17, 0.023985403813518776), (22, 0.085821253795723446), (24, 0.28719701868630809), (29, 0.1398370377866813), (38, 0.13284327489772679)]

TLG0539.TXT
[(2, 0.028348545422888383), (5, 0.060296077

TLG0525.TXT
[(1, 0.011340351774201684), (2, 0.023007203988352046), (5, 0.051053406522306312), (8, 0.041909658678939851), (9, 0.018119155497333162), (14, 0.14513946328436481), (16, 0.025000230218278956), (17, 0.018853984980933325), (22, 0.02402732163850857), (24, 0.27044912723940617), (29, 0.19145109333058064), (33, 0.037869205205640799), (38, 0.024234684486112643), (44, 0.023642321352849459), (46, 0.013626364944567154), (47, 0.018496599431307222), (48, 0.014187500213205232)]

TLG2008.TXT
[(1, 0.13598194861425039), (5, 0.12711119387130734), (14, 0.30518138195435091), (16, 0.050132053911295993), (17, 0.030200157960068173), (22, 0.026056415513146369), (24, 0.2154573390196414), (33, 0.045733608516801499), (37, 0.022377587011349637), (45, 0.023568959875195996)]

TLG3064.TXT
[(2, 0.01946219031753341), (5, 0.098680893292621211), (8, 0.016043498342886667), (9, 0.016861192646894455), (12, 0.029052500956270495), (14, 0.16052184088012153), (16, 0.022371189180558457), (22, 0.031200685356503423), (

TLG2047.TXT
[(1, 0.29179280841086547), (2, 0.022107999705121405), (5, 0.060810620645242175), (8, 0.021878288403377768), (9, 0.015114247765019585), (12, 0.024066394367472092), (14, 0.044916548529587916), (16, 0.022082076274233059), (22, 0.10211969476194369), (24, 0.1940113962394456), (37, 0.057766541135102968), (42, 0.024052141480557194), (46, 0.022783049600505943), (47, 0.028315426081711378)]

TLG1222.TXT
[(5, 0.039513985933465993), (8, 0.091158294117174896), (9, 0.039798142646973865), (14, 0.25540692959830341), (22, 0.026495586900676538), (24, 0.18350989126006023), (29, 0.035159788221415016), (33, 0.022495060630824133), (38, 0.22721953051550817), (46, 0.04196899014606971), (47, 0.03211199821876249)]

TLG0309.TXT
[(8, 0.023536822435909991), (12, 0.07204410621736923), (16, 0.20296617702659678), (24, 0.40894508295399823), (33, 0.10297094827073527), (44, 0.13971780940535855), (48, 0.041387681141013218)]

TLG0570.TXT
[(0, 0.030681648931803165), (10, 0.095458650255392172), (13, 0.1290014252

TLG4046.TXT
[(5, 0.072533099815404387), (8, 0.015085572271732002), (9, 0.01918769877718355), (14, 0.11214821225964898), (16, 0.015518103517638318), (22, 0.03224035531808464), (24, 0.16118242169485264), (27, 0.084378158815015286), (29, 0.014662119073669761), (33, 0.01067522939369453), (38, 0.059849841795107085), (42, 0.020633321580469894), (44, 0.016267850928166662), (46, 0.018636831124076554), (47, 0.28398446395155663)]

TLG4311.TXT
[(2, 0.067238473109703836), (3, 0.037300810921121905), (5, 0.1008003776740469), (8, 0.014801386068591571), (9, 0.025734050265766487), (12, 0.018015453140344192), (14, 0.025993447887038137), (16, 0.12965788498306216), (17, 0.025768776666046996), (22, 0.069283634805161712), (24, 0.19271724967361345), (29, 0.048200085676059656), (33, 0.034579131878340261), (37, 0.042817348482742067), (44, 0.056334286596986367), (46, 0.0238627112317513), (47, 0.011573326440941617), (48, 0.03143847037366692)]

TLG4039.TXT
[(5, 0.10373087456732401), (8, 0.030687262855223895), (9,

TLG1764.TXT
[(0, 0.017711959515322236), (1, 0.020815343619408677), (2, 0.014709134690266003), (3, 0.020261349963993303), (5, 0.11731225606876186), (8, 0.013721532021288136), (9, 0.028119389969383901), (14, 0.055612038238754172), (16, 0.011015085168437272), (17, 0.01305460814322347), (20, 0.040564450884153364), (22, 0.069722611496069717), (24, 0.19427789770652659), (28, 0.18323188387556569), (29, 0.021492181832420774), (33, 0.017299003492229594), (37, 0.034837886556762655), (38, 0.018929404043844199), (44, 0.01060395509636214), (46, 0.033259026792471225), (47, 0.017047737544031633)]

TLG2557.TXT
[(5, 0.072980180030665587), (12, 0.24958734062205534), (14, 0.27077120032560259), (24, 0.20094965099607648), (28, 0.03363877616043319), (43, 0.032128763520882196), (47, 0.11220215286041405)]

TLG0298.TXT
[(5, 0.03745671882447104), (8, 0.11773892035013586), (12, 0.019729236550560841), (14, 0.056328265673321991), (16, 0.044476374069053221), (17, 0.016042744863401064), (22, 0.048548137811319961), (

TLG2049.TXT
[(1, 0.068636232108574752), (2, 0.052937972789188846), (5, 0.12856172249790335), (9, 0.027138636052913594), (13, 0.017923303011346169), (14, 0.06539900272982882), (17, 0.013721689319829642), (22, 0.080159290082177551), (24, 0.23979304355631462), (27, 0.011347088244270732), (28, 0.057167505724137931), (29, 0.043735468788187973), (31, 0.02045301431646428), (33, 0.016572812743187938), (37, 0.057172621690527808), (46, 0.041023945264352851), (47, 0.040205026626956547)]

TLG2655.TXT
[(8, 0.065050327750988235), (12, 0.10588635343827819), (13, 0.071358928851964334), (16, 0.066014532674472978), (19, 0.050526854474309643), (20, 0.033868949039201082), (22, 0.067285287209418315), (24, 0.30945317363848485), (29, 0.026250764978754188), (33, 0.083107431295248252), (37, 0.023210206126055188), (38, 0.02737671717490844), (44, 0.020167624550917074), (45, 0.03771414399589211)]

TLG0490.TXT
[(2, 0.029250474270516705), (5, 0.13206140009946246), (9, 0.048956089146035588), (11, 0.02546175091033758

TLG2022.TXT
[(1, 0.016231487034233313), (2, 0.024409704810231723), (5, 0.13135924854515901), (8, 0.012147776134509942), (9, 0.027284349457365902), (12, 0.036288473485952877), (13, 0.010503941113402599), (14, 0.054451123805694647), (16, 0.028439492832676171), (17, 0.040476024681333822), (22, 0.057440339040479398), (24, 0.22624378446986515), (27, 0.011648808411574792), (31, 0.013045341888078694), (33, 0.025795885695346833), (37, 0.030301685505582453), (38, 0.02188426376837611), (42, 0.015072777451482982), (44, 0.073330178750915964), (46, 0.027504489912116299), (47, 0.03167200288262681), (48, 0.048010176691617022)]

TLG1406.TXT
[(2, 0.11725003484975911), (5, 0.1225744510645259), (9, 0.070793331268189805), (12, 0.13114123837443903), (14, 0.082970059157470424), (15, 0.066983811109986052), (18, 0.01936467081389236), (24, 0.096184237183747578), (33, 0.10935041983423956), (35, 0.019618969546533391), (38, 0.030461496116967614), (40, 0.015716089380207771), (44, 0.064061499883632633), (46, 0.0483

[(1, 0.019532917785437757), (2, 0.12866047990301815), (3, 0.010926217532899614), (5, 0.075737741339119563), (8, 0.022476776745981624), (9, 0.023322459560139563), (12, 0.021441451065533395), (14, 0.038183827089847196), (16, 0.039531187662935244), (17, 0.036900660590998648), (22, 0.064702094143618244), (24, 0.21670858451545053), (29, 0.03100727354179008), (33, 0.059267380655309974), (37, 0.025246627571990525), (44, 0.049794885654470801), (46, 0.016862141188530098), (47, 0.021490995867519926), (48, 0.050781367221433532)]

TLG0547.TXT
[(1, 0.14308988572487003), (2, 0.010068291692105567), (5, 0.10062948034890552), (8, 0.021364304320147638), (9, 0.017611909230463275), (12, 0.015108930915371348), (14, 0.094452097198696058), (16, 0.019796407402361504), (22, 0.09487865637210699), (23, 0.017761267937340855), (24, 0.21832050066126538), (29, 0.018284325787912947), (31, 0.0158820908849912), (33, 0.054159760296420688), (37, 0.020591038645460902), (38, 0.010655124110995727), (42, 0.022524417926498855

TLG1494.TXT
[(3, 0.018571957294128225), (5, 0.072346073435549635), (9, 0.019497748179002825), (14, 0.064237415974267409), (16, 0.011664369270268799), (20, 0.025855735118393574), (22, 0.16048563372670988), (23, 0.015068042566005737), (24, 0.22791526656843508), (29, 0.043736016477680009), (31, 0.023570344491857971), (33, 0.096005890826154447), (37, 0.12092674493013093), (46, 0.015511286506920746), (48, 0.069760722934572097)]

TLG1584.TXT
[(2, 0.23241878203806543), (5, 0.051649529886431705), (8, 0.041656727784962627), (9, 0.020193103390262982), (14, 0.073994998114662772), (17, 0.021249437540533735), (24, 0.18017068334943409), (27, 0.011325666991272649), (29, 0.17643394454579606), (31, 0.019612680932471867), (33, 0.032005641330400673), (37, 0.020696521689136223), (38, 0.020820120270148158), (44, 0.03655914249203171), (47, 0.012985327576518612)]

TLG1729.TXT
[(1, 0.102575762525368), (3, 0.051985458346085436), (5, 0.16293710233385988), (14, 0.030369807842041251), (17, 0.015833089408110613), 

TLG0012.TXT
[(2, 0.074025897226690673), (5, 0.058020397716476588), (8, 0.023947174695085619), (9, 0.015869147731890638), (12, 0.015095018010510769), (14, 0.01401228652053013), (16, 0.031371651268788062), (17, 0.11099765899465347), (22, 0.026717230644800104), (24, 0.13618874440295195), (29, 0.023780677118415392), (33, 0.0434758183065432), (44, 0.10965885942760799), (46, 0.012150439200227247), (48, 0.23655851737932609)]

TLG0363.TXT
[(0, 0.16335048457029056), (1, 0.026758311818169199), (3, 0.010285596081472243), (5, 0.080482778578749803), (8, 0.03616838131312123), (9, 0.022541979068341558), (14, 0.047944398660765219), (16, 0.022276534928203078), (17, 0.017764315508560068), (18, 0.048322521227751637), (20, 0.098523653262173841), (22, 0.059871636728428303), (24, 0.14215676269412428), (28, 0.040884506484389495), (33, 0.028967189893790889), (37, 0.038261794754975591), (46, 0.034354043601008272), (48, 0.012889734364711119)]

TLG0264.TXT
[(10, 0.39507217433370689), (15, 0.12819368032667572), (

TLG2050.TXT
[(0, 0.010759577422873801), (1, 0.018611590141262978), (2, 0.014517334030264476), (5, 0.094786979987017531), (8, 0.018639770972346557), (9, 0.015475125412894549), (12, 0.030442670938294638), (14, 0.10904149592004504), (17, 0.012090059479041591), (22, 0.049793745214018727), (24, 0.33325373625131743), (29, 0.014201525765052276), (33, 0.039434987571114792), (37, 0.01781927348811508), (38, 0.037770751830557926), (42, 0.029584498576881495), (44, 0.020174030526211927), (46, 0.02441872110669047), (47, 0.057942650623707903), (48, 0.015234860218577151)]

TLG1407.TXT
[(2, 0.042223269801002473), (5, 0.05207651996313796), (8, 0.052715147663294409), (9, 0.02894316298468403), (12, 0.055582464265250292), (14, 0.14609195139383169), (16, 0.03333691476398995), (23, 0.011416585523676163), (24, 0.28208873895876535), (29, 0.074460916322810489), (31, 0.025404845260872555), (33, 0.11336718799064191), (38, 0.054322082792739673)]

TLG0607.TXT
[(1, 0.15721307248622765), (2, 0.023350807939891399), (5

TLG9019.TXT
[(1, 0.036465306243647022), (2, 0.036692866356058201), (5, 0.072041357722388816), (9, 0.038504248358721412), (12, 0.014819326712042577), (16, 0.01263813679584117), (20, 0.013585470297273469), (22, 0.21332987122617164), (24, 0.15883568424862313), (28, 0.019632736113713713), (29, 0.018751316083506608), (30, 0.13675330300719146), (31, 0.02743995622182924), (37, 0.053236987130182725), (38, 0.038425975679747766), (44, 0.042458736751979781), (46, 0.023660701207770723), (47, 0.021695430193230402)]

TLG2646.TXT
[(8, 0.078406228987259421), (9, 0.071233276165605749), (13, 0.049201423743079127), (16, 0.037276702808541881), (17, 0.091751590226563248), (24, 0.17312120846073797), (29, 0.077424066589499241), (38, 0.024883602535800915), (39, 0.011845724156536178), (44, 0.16501293764790975), (46, 0.041022439244662971), (47, 0.040962754127181263), (48, 0.13426581229691381)]

TLG1414.TXT
[(2, 0.085548289349047762), (5, 0.10264542165530212), (8, 0.024671761420103123), (9, 0.033753072186836125)

TLG3094.TXT
[(2, 0.022994940319616589), (5, 0.11284823571044868), (8, 0.018770696293414695), (9, 0.032766267868462297), (12, 0.020936276275875947), (14, 0.12690693930858132), (16, 0.019019168775787871), (17, 0.011718348971860552), (22, 0.038195812226996549), (24, 0.25807299880310158), (31, 0.011346718596686751), (33, 0.024271202904477724), (37, 0.018342677421797182), (38, 0.014660692100143471), (42, 0.042498286575698679), (44, 0.028080630303886765), (46, 0.022028449714971726), (47, 0.10616700609578052), (48, 0.013692823544574588)]

TLG0750.TXT
[(1, 0.049091133285691126), (17, 0.20366543956671282), (24, 0.219789386607117), (26, 0.033904955479015071), (27, 0.032958238745017049), (35, 0.038951641763174262), (38, 0.056979234064056861), (41, 0.019917671900295267), (44, 0.19771462584347838), (48, 0.13591656163433022)]

TLG0058.TXT
[(1, 0.016235275401290591), (2, 0.015922042651882528), (5, 0.11670305245544105), (8, 0.02083381728400055), (9, 0.056037228370684047), (12, 0.024572570985420067), (

TLG1482.TXT
[(2, 0.0166795080838474), (5, 0.14852658890943082), (8, 0.02458756774524452), (9, 0.11135982389628635), (12, 0.017770286437853316), (13, 0.011086025064504826), (16, 0.054629949493013169), (17, 0.014011295517276314), (19, 0.040795443606238026), (22, 0.039394401225268535), (24, 0.18796312243046187), (25, 0.021278634791560121), (29, 0.027090537429320979), (33, 0.052119429203764958), (37, 0.029531270631543149), (42, 0.013114809396637096), (44, 0.021003482839895067), (46, 0.022759218416135058), (47, 0.090907082464726702)]

TLG5017.TXT
[(1, 0.30176116299701894), (2, 0.025176878069976379), (5, 0.084834994668457134), (8, 0.023327138724125583), (9, 0.01765917266549746), (14, 0.088908595989503528), (16, 0.01913245082163061), (22, 0.084990842062544189), (24, 0.18778695902343426), (33, 0.020918041036749758), (37, 0.020505279152253343), (38, 0.020091846600691524), (42, 0.019699598507919389), (44, 0.010629164946106903), (46, 0.01593119237136589), (47, 0.018596149048251256)]

TLG2354.TXT


TLG5023.TXT
[(2, 0.050458589026961688), (5, 0.095820645224035605), (8, 0.02215664543870649), (9, 0.032003190080403164), (12, 0.012590615896652701), (14, 0.066061573321618106), (16, 0.038877502974824285), (17, 0.023874428960851391), (22, 0.042412441845891796), (24, 0.25502553576259673), (29, 0.088958292597446822), (33, 0.042299803003477181), (37, 0.017135256850430425), (44, 0.083725861984942668), (46, 0.013555691691940997), (47, 0.024193146267547514), (48, 0.021719605262831131)]

TLG1459.TXT
[(14, 0.086702043147238506), (17, 0.17714848691259716), (24, 0.27912701647988708), (29, 0.12547364176456041), (33, 0.14345589724845303), (37, 0.10042487909017027), (48, 0.059926099873222997)]

TLG1152.TXT
[(2, 0.10449311271750891), (5, 0.073504043432851579), (8, 0.036641747369480747), (9, 0.042937995518900388), (14, 0.044970894887760494), (16, 0.10321821099871895), (17, 0.023324225040425139), (22, 0.039971455143101596), (24, 0.22110533323255946), (28, 0.013030576546625509), (29, 0.023357097569775779

TLG0010.TXT
[(1, 0.10917742272260174), (2, 0.021976434050102921), (5, 0.083392191628273118), (8, 0.016717618128920541), (9, 0.019264831674378511), (12, 0.031110519737525398), (14, 0.17668559344130744), (16, 0.014899619683679137), (22, 0.036713075816318698), (24, 0.25552962213502994), (29, 0.012346266392662482), (31, 0.046273572652865198), (33, 0.028317924916983483), (37, 0.01286043546968099), (38, 0.029357865346183738), (42, 0.014594502587765918), (44, 0.032820944454589253), (46, 0.020138051452502907), (47, 0.011205001412235663)]

TLG0691.TXT
[(2, 0.047658224080046684), (5, 0.12585104874268432), (8, 0.034219709154627839), (9, 0.078212227523367525), (10, 0.020620316448374143), (13, 0.022738248574990759), (14, 0.043868616104723535), (20, 0.02528092504994095), (22, 0.094772145090427584), (24, 0.31610147212227191), (28, 0.026040525898084317), (31, 0.03519204237159073), (33, 0.045968029510674573), (38, 0.050709224884104148), (47, 0.028744255938343021)]

TLG1435.TXT
[(0, 0.015943011165472566

[(2, 0.058558367074309274), (5, 0.069861277080990464), (8, 0.084359447142175858), (10, 0.028687557021436553), (14, 0.055949511130187334), (20, 0.013802654805086065), (24, 0.20902182464203184), (28, 0.071935658530155266), (29, 0.19197513325991197), (38, 0.13762567267801021), (47, 0.032365377160577732), (48, 0.010727790580833299)]

TLG1342.TXT
[(2, 0.012123317803487636), (3, 0.023712660819761429), (5, 0.092398893980770835), (8, 0.037841326122930112), (9, 0.040775397293281133), (14, 0.10240702792592647), (17, 0.065356194636708859), (20, 0.014392429089332855), (22, 0.072184590978709648), (24, 0.1871448558668804), (28, 0.01170498842394951), (29, 0.01592955005440766), (33, 0.070542232062065668), (37, 0.057308141381117908), (44, 0.036362248973533878), (46, 0.049809129154570898), (48, 0.055690360960423159)]

TLG1791.TXT
[(1, 0.041414676381903909), (5, 0.057350355942183011), (8, 0.014781105042140308), (9, 0.062389565450804943), (12, 0.042722594762936834), (13, 0.065673365002152331), (14, 0.0451

TLG5004.TXT
[(2, 0.076911200845364736), (3, 0.080546787155144212), (5, 0.074546998020205438), (8, 0.016345535375284433), (9, 0.0170159912270238), (12, 0.016918883238407682), (14, 0.0161664584885917), (16, 0.104743208404052), (17, 0.038065661418986992), (22, 0.096354080245146662), (24, 0.11958135070953972), (29, 0.018992747013366989), (30, 0.023993042035660064), (33, 0.016557688213480647), (37, 0.037236720715934681), (38, 0.020867752356830922), (44, 0.087466883090332737), (46, 0.034762237535221702), (47, 0.018584802525040779), (48, 0.051594688091272441)]

TLG4227.TXT
[(1, 0.056721534077446699), (2, 0.015646018501042788), (5, 0.059591752456977932), (8, 0.044712947198228659), (9, 0.015862281914487613), (14, 0.042905822867759541), (15, 0.071812700742256791), (17, 0.017196031185875673), (22, 0.13113423840984526), (24, 0.20426344745858996), (27, 0.019595423173107022), (29, 0.028626807661224381), (31, 0.049001523317061112), (33, 0.063170152444414401), (37, 0.084064368143436888), (44, 0.012330

TLG0551.TXT
[(2, 0.025997908000222596), (5, 0.057491873125265471), (8, 0.030197626772657998), (9, 0.015939382597609925), (12, 0.01537794802026706), (14, 0.3315014102372153), (16, 0.021915580705532848), (22, 0.022045719028328345), (24, 0.25882400598178212), (29, 0.012115932207938124), (33, 0.012215145748842911), (38, 0.052040185089698439), (40, 0.011512865051378406), (42, 0.019800425952343288), (47, 0.04287424728556017)]

TLG4102.TXT
[(1, 0.01874771546016768), (2, 0.01672407457728076), (5, 0.21002544980306903), (8, 0.013136601071315694), (9, 0.020154189575951345), (12, 0.061742903030300486), (13, 0.021763127563446335), (14, 0.091325854374204332), (16, 0.019797029187217631), (22, 0.073949033124655922), (24, 0.21223828306643408), (27, 0.012625995031560869), (33, 0.013472048634952178), (37, 0.034723062788504282), (38, 0.026507968214827097), (42, 0.016451127176020066), (44, 0.016840781637328661), (46, 0.030546083479269633), (47, 0.044717835963597234)]

TLG2033.TXT
[(0, 0.089230203247250228)

TLG4084.TXT
[(1, 0.01248053731551259), (2, 0.022525052243391293), (5, 0.096740580843492199), (8, 0.029683244466209902), (9, 0.020244926396058648), (12, 0.028233207672119103), (14, 0.22144252932562158), (16, 0.016035569546242908), (22, 0.030068946845983265), (24, 0.24766909098245382), (33, 0.011997352469884965), (38, 0.044385452358097426), (40, 0.016745713201484505), (42, 0.031343008232814829), (44, 0.01580447903644551), (46, 0.012903245892870096), (47, 0.085197738349143043)]

TLG4098.TXT
[(2, 0.062428415053859286), (3, 0.048683322326233291), (5, 0.10776790194890618), (8, 0.017319203862845516), (9, 0.029825392030884905), (12, 0.02693503488186097), (14, 0.024878489845436686), (16, 0.1203868087909493), (17, 0.024523671547218358), (22, 0.085114649322276345), (24, 0.17354187238271471), (29, 0.019227124943598631), (33, 0.037348285491710174), (37, 0.027863693518586877), (44, 0.054434490201504385), (46, 0.028029836680939012), (47, 0.017199693575573775), (48, 0.035269935542030804)]

TLG0351.TXT

TLG2806.TXT
[(5, 0.16483427436707615), (9, 0.033400061631944762), (12, 0.033684835907607243), (14, 0.11488377184255895), (16, 0.014150829760804721), (17, 0.013507770543496399), (22, 0.051789360619184419), (24, 0.22194388118569125), (27, 0.051040401208780575), (33, 0.015912943311631716), (37, 0.012116353032665382), (38, 0.040214167509753838), (44, 0.014707633802913936), (46, 0.029885566043254302), (47, 0.11824815663563353), (48, 0.015641844410003984)]

TLG3086.TXT
[(1, 0.020942895675030389), (2, 0.011176871218330702), (3, 0.010391339253396945), (5, 0.13119516354869368), (8, 0.015278709939831882), (9, 0.028245743927577026), (12, 0.026048917869964758), (14, 0.085738124953915612), (16, 0.011595915578750325), (22, 0.063830912474702045), (24, 0.2202129901861129), (27, 0.020206878796007251), (33, 0.012232301423240565), (37, 0.026653864874220102), (38, 0.064733089217410583), (42, 0.022893404266150087), (44, 0.018570886496060313), (46, 0.032580818456199999), (47, 0.10925628689176824), (48, 0.01

[(2, 0.036722609748769834), (5, 0.16813042024560726), (8, 0.055269615974741171), (10, 0.02717658477642055), (14, 0.13283181816384107), (17, 0.050399828408992056), (20, 0.077175728055872861), (22, 0.055035449790629506), (24, 0.16296603010155067), (31, 0.014288629750474588), (35, 0.010881477730559124), (37, 0.064588169633557721), (38, 0.065597841537547524), (47, 0.016787007135565322), (48, 0.059947531084235776)]

TLG2298.TXT
[(1, 0.079914941774122061), (14, 0.21170423853347822), (17, 0.043339261562647106), (24, 0.1423806069742258), (33, 0.20445987990809053), (37, 0.049595563740904153), (38, 0.13148349709423313), (44, 0.11378867707896539)]

TLG1801.TXT
[(5, 0.10192916635918997), (8, 0.041871624081433655), (9, 0.04843752029990929), (12, 0.026400503750717442), (16, 0.091929022742582037), (17, 0.13911634567514686), (24, 0.1159533765389496), (30, 0.012562784577526749), (33, 0.1285696746438442), (35, 0.039085790336623613), (46, 0.070606184148623088), (48, 0.1760870264532968)]

TLG1737.TXT
[(8,

TLG2580.TXT
[(2, 0.028497538315386005), (5, 0.078566000450863344), (8, 0.038447265502148276), (9, 0.041799275426499499), (12, 0.026712753138350861), (14, 0.089231599557869801), (16, 0.024391237299281769), (17, 0.011771452239290624), (20, 0.019576977856127411), (22, 0.07836373930633142), (24, 0.2282393444190059), (28, 0.030216830193192139), (29, 0.025301976139176507), (33, 0.02791107342719467), (37, 0.022442442461972564), (38, 0.049865243548135089), (42, 0.014259649000247064), (44, 0.018248791456801309), (46, 0.042351650274274816), (47, 0.062140129130113589), (48, 0.014152528794114718)]

TLG0005.TXT
[(2, 0.021482177115927269), (5, 0.069550610620569525), (8, 0.036741873442657579), (9, 0.024089355837578118), (12, 0.015828671255551818), (14, 0.016448442987369742), (16, 0.033685431416012775), (17, 0.10676706046034329), (22, 0.019980531247674359), (24, 0.16911195094455778), (25, 0.01888715741159842), (29, 0.027896942250189471), (33, 0.070548148424612336), (37, 0.020312622239608009), (44, 0.1

TLG4193.TXT
[(1, 0.044075509346151877), (2, 0.044476163080499953), (5, 0.092637659936651698), (9, 0.04727569317484627), (12, 0.023481165110077105), (14, 0.042436880911782979), (16, 0.015112323189897338), (17, 0.010544665447395405), (22, 0.16048397873559908), (24, 0.14045684121216417), (28, 0.021968855516743678), (29, 0.013401200356360672), (30, 0.027525889942964663), (31, 0.051367004917798074), (33, 0.023875530596037947), (37, 0.1021359279992113), (38, 0.01484916311982961), (42, 0.013922082389668999), (44, 0.019251214677830517), (46, 0.033915788934908951), (47, 0.020565052430290179)]

TLG0007.TXT
[(1, 0.030804554203538344), (2, 0.015902596999201921), (5, 0.075781326805539978), (8, 0.022213046308382247), (9, 0.026855036548383327), (12, 0.017376016843769555), (14, 0.17470320577356141), (16, 0.018672216243520421), (22, 0.049517405112500266), (24, 0.31180419329242115), (29, 0.025787890173297821), (33, 0.050764982161751147), (37, 0.016746308558203474), (38, 0.021075979842894006), (42, 0.010

TLG2061.TXT
[(5, 0.24407577288979931), (8, 0.016990620399599185), (9, 0.013752641192622483), (12, 0.062013645177982725), (14, 0.085056406710476345), (16, 0.026408839018320994), (17, 0.016434246917973711), (22, 0.060630249073061401), (24, 0.18578000728354882), (27, 0.011711639492129824), (31, 0.012957314188995941), (33, 0.022530134332144007), (37, 0.017262060932209777), (38, 0.015527861978839581), (42, 0.021580059628667381), (44, 0.044097570614626463), (46, 0.021758859225098525), (47, 0.068294501715959521), (48, 0.012045717044014064)]

TLG0340.TXT
[(2, 0.18941952032050072), (8, 0.14453513971996682), (12, 0.15541637183397225), (16, 0.26790690912479903), (29, 0.19716561325289314), (34, 0.022992343183764155)]

TLG0384.TXT
[(5, 0.12216144641422845), (10, 0.011962462520151516), (12, 0.043908268856233862), (19, 0.012713094367238626), (22, 0.092897781749529743), (24, 0.18396614919518126), (29, 0.028753437150236417), (33, 0.03219631121017992), (37, 0.010739593566794697), (44, 0.0386053314581702

TLG0614.TXT
[(2, 0.025826887308763532), (5, 0.086332452621855155), (9, 0.025107048307930903), (12, 0.039219912304550221), (14, 0.071825144862140991), (16, 0.041296378057337491), (17, 0.045513280801422029), (22, 0.041480029955482396), (24, 0.28970838592930009), (25, 0.034541540678807457), (29, 0.038892638626201778), (33, 0.055424661948557753), (37, 0.013923537288561584), (44, 0.075812184297162494), (46, 0.011324088644207539), (47, 0.025775467636507822), (48, 0.041653035568258015)]

TLG0014.TXT
[(1, 0.27032481229946043), (2, 0.013789486729349166), (5, 0.071476277001895877), (8, 0.014596947498172447), (9, 0.016954739527207708), (12, 0.016601296148103278), (14, 0.11592318713909743), (16, 0.012925198192457804), (22, 0.031799566390153848), (24, 0.22997964893317116), (31, 0.021865230076376151), (33, 0.033178878008457396), (38, 0.025185105877304294), (44, 0.036400908834527249), (46, 0.013337627131579356), (47, 0.018117657205959255)]

TLG3181.TXT
[(5, 0.1275832031877713), (8, 0.0173831767772616

TLG4031.TXT
[(1, 0.041754658538487144), (2, 0.023929113630109196), (3, 0.010868247045780887), (5, 0.12467362698659688), (8, 0.017337603746144965), (9, 0.044690851734846194), (14, 0.029059333396407625), (16, 0.015782856254760565), (22, 0.18502352134212224), (24, 0.15985888879295498), (28, 0.018649236036107606), (30, 0.016739336108005209), (31, 0.026310330396136564), (37, 0.10796301831293934), (38, 0.020606738164776366), (42, 0.013466169838302303), (44, 0.016546685798059693), (46, 0.052515322019799027), (47, 0.019706204090753354)]

TLG2236.TXT
[(2, 0.032908598009874647), (5, 0.074831670298675962), (9, 0.014034812923507207), (12, 0.012003986674798873), (14, 0.26059497351944044), (22, 0.027005541829956178), (24, 0.26526603563917273), (29, 0.01797058296923695), (38, 0.07284013831441162), (40, 0.021576342686329665), (42, 0.02415906485229586), (44, 0.012946241669026595), (46, 0.013054972475719084), (47, 0.10454186832555447)]

TLG0645.TXT
[(1, 0.010115682501988006), (2, 0.024983124571434812), 

TLG2023.TXT
[(1, 0.015504952217742082), (3, 0.015981752933624197), (5, 0.11843611747068887), (8, 0.017160786877246766), (9, 0.037467939579104319), (12, 0.019502094443590082), (14, 0.061157746007637674), (16, 0.019761014669459516), (20, 0.011977824755023014), (22, 0.11523181686731272), (24, 0.18427902673266885), (28, 0.011725011230469233), (29, 0.012290285122297247), (31, 0.029389963379409297), (33, 0.048222335195405107), (37, 0.06661502490147414), (38, 0.013310188673714224), (42, 0.018259513952955327), (44, 0.01502969119038031), (46, 0.082757999937136117), (47, 0.017211045120405825), (48, 0.019644260511213898)]

TLG2506.TXT
[(0, 0.14557247151516051), (3, 0.05699168014170955), (19, 0.017857678828058908), (27, 0.15255301908480695), (31, 0.21869492872773549), (35, 0.19678571428571415), (37, 0.17424579679511171), (39, 0.022298710621702012)]

TLG1127.TXT
[(0, 0.013108139605964048), (1, 0.017576546213180281), (2, 0.045810106528874378), (3, 0.014698332337778737), (5, 0.12758389027179751), (9,

# Visualization

Following: http://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/pyLDAvis_overview.ipynb

In [51]:
lda_model.show_topics()

[(16,
  '0.009*"αμφισβητησιν" + 0.004*"αττικοι" + 0.003*"επιρρημα" + 0.003*"συγκοπην" + 0.002*"επιλογου" + 0.002*"υπερβολων" + 0.002*"απορουμενοις" + 0.002*"ρυθμοις" + 0.002*"δακνουσι" + 0.002*"φατνας"'),
 (44,
  '0.003*"ηλθ" + 0.002*"αισχυλος" + 0.002*"βροτοις" + 0.002*"αζυμα" + 0.001*"ακινητους" + 0.001*"κολαζεσθαι" + 0.001*"μνησθησομαι" + 0.001*"οδ" + 0.001*"φαλαγγες" + 0.001*"αλιας"'),
 (40,
  '0.017*"τρεπων" + 0.006*"μελετωμεν" + 0.004*"χαιρω" + 0.003*"θρηνουσαν" + 0.003*"εμφερεις" + 0.003*"νεστοριος" + 0.003*"διερχομενου" + 0.003*"βιαι" + 0.003*"παρισταμενον" + 0.002*"μαθωσιν"'),
 (3,
  '0.010*"ευειδεις" + 0.007*"κελυφος" + 0.006*"γενικη" + 0.006*"κογχυλιων" + 0.006*"σπληνος" + 0.005*"διισχυριζοντο" + 0.005*"ανδρομεδας" + 0.005*"δυναστευων" + 0.004*"νικωσης" + 0.004*"επαγομενην"'),
 (15,
  '0.015*"γοργια" + 0.009*"δηλουσαι" + 0.009*"φαιδρω" + 0.007*"μεταβαλλεται" + 0.007*"εισκρινεσθαι" + 0.007*"διαλογος" + 0.005*"διαλογοις" + 0.005*"θαυμασται" + 0.004*"διατελουσι" + 0.004*"μεσοτη

In [52]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

In [53]:
pyLDAvis.gensim.prepare(lda_model, mm_corpus, id2word_tlg)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
