In [1]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.latin.stops import STOPS_LIST
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence

from gensim.models import Word2Vec
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %message)s', level=logging.INFO)

# Prepare PHI sentences

In [60]:
filepaths = assemble_phi5_author_filepaths()
filepaths = filepaths[:5]  # for testing
sent_tokenizer = TokenizeSentence('latin')
p = PunktLanguageVars()
jv = JVReplacer()

In [46]:
phi_sentences = []
for filepath in filepaths:
    with open(filepath) as f:
        text_raw = f.read()
    text_clean = phi5_plaintext_cleanup(text_raw)  # phi5_plaintext_cleanup()
    sent_tokens_upper = sent_tokenizer.tokenize_sentences(text_clean)  # sentence tokenize
    sent_tokens = [s.lower() for s in sent_tokens_upper]  # lowercase
    #sentence_tokens_author = []
    for sent in sent_tokens:  # tokenize words in sentences
        sent_word_tokens = []
        sent_word_tokens = p.word_tokenize(sent)
        sent_word_tokens = [jv.replace(word) for word in sent_word_tokens]
        sent_word_tokens_new = []
        for word in sent_word_tokens:  # remove punctuation (final period, commas, etc)
            if word[-1] in ['.', '“']:
                word_new = word[:-1]
                sent_word_tokens_new.append(word_new)
            elif word[0] =='“':
                word_new = word[1:]
                sent_word_tokens_new.append(word_new)
            elif word in [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}', '!', '”']:
                continue
            elif word in STOPS_LIST:  # remove stops
                continue
            elif '˘' in word:  # rm meter
                continue
            elif 'á' in word:  # rm accents from vowels; find more graceful way of doing this
                word_new = word.replace('á', 'a')
                sent_word_tokens_new.append(word_new)
            elif 'é' in word:
                word_new = word.replace('é', 'e')
                sent_word_tokens_new.append(word_new)
            elif 'í' in word:
                word_new = word.replace('í', 'i')
                sent_word_tokens_new.append(word_new)
            elif 'ó' in word: #! no 'ó' found in PHI5
                word_new = word.replace('ó', 'o')
                sent_word_tokens_new.append(word_new)
                print('rmd vowel', word, word_new)
            elif 'ú' in word:
                word_new = word.replace('ú', 'u')
                sent_word_tokens_new.append(word_new)
            else:
                sent_word_tokens_new.append(word)
        sent_word_tokens_new = [w for w in sent_word_tokens_new if len(w) > 1]  # rm short words

        sentence = [w for w in sent_word_tokens_new if w]  # remove any empty words (created thru above cleanup)
        if sentence:  # remove any empty sentences (created thru above cleanup)
            phi_sentences.append(sentence)

In [47]:
print(phi_sentences[:5])
print('Total sentences:', len(phi_sentences))

[['uerborum', 'ius', 'ciuile', 'pertinent', 'significatione', 'postliminio', 'reus', 'altero', 'litem', 'contestatam', 'habet', 'siue', 'egit', 'siue', 'eo', 'actum', 'est'], ['reus', 'stipulando', 'stipulator', 'dicitur', 'quippe', 'suo', 'nomine', 'altero', 'quid', 'stipulatus', 'alteri', 'adstipulatus', 'est'], ['reus', 'promittendo', 'suo', 'nomine', 'alteri', 'quid', 'promisit', 'altero', 'quid', 'promisit'], ['saltus', 'siluae', 'pastiones', 'sunt', 'quarum', 'causa', 'casae', 'quoque'], ['qua', 'particula', 'eo', 'saltu', 'pastorum', 'custodum', 'causa', 'aratur', 'ea', 'res', 'peremit', 'nomen', 'saltui', 'fundi', 'agro', 'culto', 'eius', 'causa', 'habet', 'aedificium', 'qua', 'particula', 'eo', 'habet', 'siluam']]
Total sentences: 46042


In [48]:
# lemmatize sentences
lemmatizer = LemmaReplacer('latin')

phi_sentences_lemma = []
for sentence in phi_sentences:
    lemmatized_sent = lemmatizer.lemmatize(sentence)
    phi_sentences_lemma.append(lemmatized_sent)    

INFO:CLTK:Loading lemmata. This may take a minute.


In [49]:
print(phi_sentences_lemma[:5])
print('Total sentences:', len(phi_sentences_lemma))

[['verbum', 'jus', 'civile', 'pertineo', 'significatio', 'postliminium', 'reus', 'alter', 'lito', 'contestor', 'habeo', 'si', 'ago', 'si', 'eo1', 'ago', 'edo1'], ['reus', 'stipulo', 'stipulator', 'dico2', 'quippe', 'suo', 'nomen', 'alter', 'quis1', 'stipulo', 'alter', 'adstipulo', 'edo1'], ['reus', 'promitto', 'suo', 'nomen', 'alter', 'quis1', 'promitto', 'alter', 'quis1', 'promitto'], ['saltus1', 'silva', 'pastio', 'sum1', 'qui1', 'causa', 'cado', 'quoque'], ['qui1', 'particula', 'eo1', 'saltus1', 'pasco', 'custos', 'causa', 'aro', 'is', 'reor', 'per-emo', 'nomen', 'saltus1', 'fundo1', 'ager', 'colo1', 'is', 'causa', 'habeo', 'aedificium', 'qui1', 'particula', 'eo1', 'habeo', 'silva']]
Total sentences: 46042


# Train model

In [50]:
model = Word2Vec(sentences=phi_sentences_lemma, size=100, window=5, min_count=5, workers=4)
# If you’re finished training a model (=no more updates, only querying), you can do
# https://radimrehurek.com/gensim/models/word2vec.html
# model.init_sims(replace=True)

In [51]:
import os
model_path = os.path.expanduser('~/cltk_data/user_data/word2vec_phi_lemma.model')

In [52]:
model.save(model_path)  # 84 MB

In [53]:
# to load:
model = Word2Vec.load(model_path)

# Fun with word2vec

In [54]:
model.most_similar('alter')

[('xx', 0.769722580909729),
 ('duo', 0.7590145468711853),
 ('uter', 0.7507055997848511),
 ('xii', 0.7478469610214233),
 ('undecimus', 0.7446541786193848),
 ('hemitonium', 0.7335435152053833),
 ('circinatio', 0.7317605018615723),
 ('xxx', 0.7282785177230835),
 ('unus', 0.7280042767524719),
 ('cymatium', 0.7254071235656738)]

In [55]:
model.most_similar('habeo')

[('sum1', 0.8618136644363403),
 ('ambulatio', 0.8575546741485596),
 ('mutulus', 0.8557659387588501),
 ('epistylium', 0.8400512933731079),
 ('fio', 0.835148811340332),
 ('denticulus', 0.8341249227523804),
 ('orbiculus', 0.8340144753456116),
 ('uto', 0.8300203084945679),
 ('influo', 0.8242273330688477),
 ('symmetria', 0.822694718837738)]

In [56]:
model.most_similar('pontus1')

[('nauta', 0.9465115666389465),
 ('aequor', 0.9337424039840698),
 ('unda', 0.9327002167701721),
 ('undo', 0.9317194223403931),
 ('rupes', 0.9209202527999878),
 ('fluctus', 0.9206447601318359),
 ('amnis', 0.9127741456031799),
 ('libyen', 0.9100551605224609),
 ('fretus1', 0.90400230884552),
 ('boreas', 0.9012879729270935)]

In [57]:
model.most_similar('eo1')

[('is', 0.9140404462814331),
 ('dirigo', 0.8806520700454712),
 ('ito', 0.8663041591644287),
 ('ambulatio', 0.8542487025260925),
 ('uto', 0.8472296595573425),
 ('sui', 0.8286542892456055),
 ('posticum', 0.8192173838615417),
 ('epistylium', 0.8109995722770691),
 ('paries', 0.8076523542404175),
 ('qui1', 0.8073225617408752)]

In [58]:
model.most_similar('castra')

[('procedo', 0.8233951926231384),
 ('minito', 0.7950356006622314),
 ('fuga', 0.7927190065383911),
 ('fascis', 0.7914959788322449),
 ('relabor', 0.7890385389328003),
 ('ad-levo1', 0.7867715358734131),
 ('triumpho', 0.7811341881752014),
 ('speculor', 0.7683494091033936),
 ('concurro', 0.7673412561416626),
 ('euphraten', 0.7663215398788452)]

In [59]:
model.most_similar('cura')

[('conscius', 0.890749454498291),
 ('tristis', 0.8817610144615173),
 ('saevio', 0.8768550157546997),
 ('!”', 0.8750863075256348),
 ('parens', 0.8706927299499512),
 ('virginitas', 0.8700000047683716),
 ('casus1', 0.8685871362686157),
 ('en', 0.8672870993614197),
 ('ecquis', 0.8627985715866089),
 ('profugus', 0.8614393472671509)]

In [59]:
model.most_similar('cura')

[('conscius', 0.890749454498291),
 ('tristis', 0.8817610144615173),
 ('saevio', 0.8768550157546997),
 ('!”', 0.8750863075256348),
 ('parens', 0.8706927299499512),
 ('virginitas', 0.8700000047683716),
 ('casus1', 0.8685871362686157),
 ('en', 0.8672870993614197),
 ('ecquis', 0.8627985715866089),
 ('profugus', 0.8614393472671509)]

In [41]:
# which word doesn't go with the others?
model.doesnt_match("filius pater mater canis".split())

'filius'

In [42]:
model.similarity('pater', 'mater')

0.87677168053217447

In [43]:
model.similarity('pater', 'canis')

KeyError: 'canis'

In [44]:
model['hasta']

array([ 0.01135846,  0.0193827 , -0.04424696,  0.06448502,  0.07657508,
       -0.03711487, -0.0270774 ,  0.01487874, -0.02835034, -0.04341644,
       -0.00732749,  0.01200995,  0.07838054, -0.03163796, -0.01311052,
        0.00202885,  0.00944777,  0.03137747, -0.01638612, -0.01159532,
       -0.00729375, -0.02914317, -0.01382305, -0.00218378, -0.00654112,
       -0.05540094, -0.02213213, -0.10004523,  0.01012381,  0.02273786,
        0.02455524, -0.04728288,  0.00378993,  0.02812943,  0.04177308,
       -0.00882489,  0.01114683, -0.0447725 ,  0.11040398, -0.05051815,
       -0.01209557,  0.12336719,  0.00746626,  0.05108232, -0.09615205,
        0.01124229, -0.01870903,  0.03904261,  0.03580958,  0.01955826,
        0.01387766,  0.08324827,  0.03606161, -0.08391311, -0.05632499,
        0.06681492, -0.03705744, -0.01570069,  0.0073466 , -0.04375116,
       -0.07401293,  0.04874749,  0.06110816, -0.00129882,  0.03043128,
       -0.01511536, -0.08096661, -0.02472437,  0.02812227, -0.07