In [1]:
from gensim.models import LdaModel
from gensim.corpora import Dictionary
import logging

# Emit INFO-level records (timestamp, level, message) from the root logger.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(
    level=logging.INFO,
    format=log_format,
)

In [2]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import LdaModel
import logging
import os
import time

# Same logging setup as the first cell, repeated so this cell also works when
# run on its own; basicConfig() leaves the root logger untouched if it already
# has handlers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [64]:
def gen_docs_lda(corpus, lemmatize, rm_stops, testing):
    """Yield one list of preprocessed word tokens per author file of a corpus.

    Every file is read in full, cleaned, word-tokenized (no sentence
    tokenization), lowercased, optionally stripped of stopwords, stripped of
    tokens shorter than two characters, J/V-normalized (Latin only) and
    optionally lemmatized.

    Args:
        corpus: ``'phi5'`` (Latin) or ``'tlg'`` (Greek).
        lemmatize: if true, run the tokens through ``LemmaReplacer``.
        rm_stops: if true, drop tokens found in the language's stopword list.
        testing: if true, only the first 20 files are processed.

    Yields:
        list: the tokens of one file, in order; the list may be empty.

    Raises:
        ValueError: if ``corpus`` is not one of the supported names. Because
            this is a generator, the error surfaces on first iteration.
    """
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        text_cleaner = phi5_plaintext_cleanup
        language_stops = latin_stops
        jv_replacer = JVReplacer()
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        # Greek texts must be filtered with the Greek stopword list.
        language_stops = greek_stops
        jv_replacer = None  # J/V normalization only applies to Latin
    else:
        raise ValueError(
            "Unknown corpus {!r}; expected 'phi5' or 'tlg'.".format(corpus)
        )

    # A set makes the per-token membership test constant-time.
    stops = frozenset(language_stops) if rm_stops else None
    lemmatizer = LemmaReplacer(language) if lemmatize else None
    punkt = PunktLanguageVars()

    if testing:
        filepaths = filepaths[:20]

    for filepath in filepaths:
        # NOTE(review): the file is decoded with the platform default
        # encoding; confirm the corpus files match it.
        with open(filepath) as f:
            text = f.read()

        text = text_cleaner(text, rm_punctuation=True, rm_periods=True)
        words = [w.lower() for w in punkt.word_tokenize(text)]

        if stops:
            words = [w for w in words if w not in stops]
        # Drops empty strings and one-character tokens alike.
        words = [w for w in words if len(w) > 1]

        if jv_replacer is not None:
            words = [jv_replacer.replace(w) for w in words]
        if lemmatizer is not None:
            words = lemmatizer.lemmatize(words)

        # Dirty hack: rewrite the incorrect 'edo1' lemma to 'sum1'.
        yield ['sum1' if w == 'edo1' else w for w in words]

In [71]:
# Run the expensive preprocessing pipeline (read, clean, lemmatize) only once
# and keep the token lists, so the dictionary and the bag-of-words corpus are
# built from exactly the same documents.
# NOTE(review): this holds every tokenized document in memory at once; confirm
# that is acceptable for the full corpus.
documents = list(gen_docs_lda('phi5', lemmatize=True, rm_stops=True, testing=False))
dictionary = Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# A fixed random_state makes the learned topics reproducible across re-runs.
model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=100,
    update_every=1,
    chunksize=10000,
    passes=10,
    random_state=42,
)

In [72]:
# Persist the trained model to disk.
model_path = '/tmp/lda_gensim_latin.model'
model.save(model_path)

In [67]:
# Show the model's num_terms attribute.
# NOTE(review): this cell's execution count (67) is lower than the training
# cell's (71), so the displayed value may belong to an earlier model.
model.num_terms

128818

In [73]:
# Show the number of topics; expected to equal the num_topics=100 passed to LdaModel.
model.num_topics

100

In [77]:
# Display the weighted top terms of the topics (100 requested, i.e. all of them).
model.show_topics(100)

['0.002*negumate + 0.001*ningulus + 0.001*nouentium + 0.001*expliciunt + 0.001*paroptus + 0.001*admiscis + 0.001*ascaloniam + 0.001*sapidus + 0.001*turio + 0.001*luxus2',
 '0.002*mens + 0.002*eiero + 0.002*tangomenas + 0.002*quiritum + 0.002*nusquam + 0.001*laus + 0.001*retineo + 0.001*pelvis + 0.001*perimadeia + 0.001*esor',
 '0.001*kato + 0.001*spolieis + 0.001*atur + 0.001*tueis + 0.001*uisce + 0.001*deiuitiora + 0.001*lycori + 0.001*tyria + 0.001*post + 0.001*historia',
 '0.030*sum1 + 0.028*tu + 0.028*ego + 0.021*hic + 0.015*qui1 + 0.014*quis1 + 0.013*facio + 0.010*meus + 0.009*dico2 + 0.009*ille',
 '0.003*cluo + 0.002*latonius + 0.002*icadium + 0.002*rediculi + 0.002*iapydem + 0.002*boaulia + 0.001*scaeu + 0.001*eanum + 0.001*oscillo + 0.001*talla',
 '0.036*sum1 + 0.034*facio + 0.015*is + 0.015*eo1 + 0.013*oportet + 0.011*uto + 0.010*qui1 + 0.010*vinum + 0.010*sero1 + 0.009*fio',
 '0.002*alpinaque + 0.002*crystallus + 0.001*aridulus + 0.001*anquina + 0.001*genumana + 0.001*arateis