# Word2Vec experiments in Greek and Latin

The following are tests of Word2Vec with Latin and Greek. For each language, I'm testing how the results change when stopwords are removed and when the text is lemmatized. For all of these, the Word2Vec parameters are `size=100, window=10, min_count=5`.

In [11]:
from cltk.corpus.utils.formatter import assemble_phi5_author_filepaths
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import phi5_plaintext_cleanup
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stem.latin.j_v import JVReplacer
from cltk.stem.lemma import LemmaReplacer
from cltk.stop.greek.stops import STOPS_LIST as greek_stops
from cltk.stop.latin.stops import STOPS_LIST as latin_stops
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.sentence import TokenizeSentence
from gensim.models import Word2Vec
import logging
import os

# Configure root logging at INFO level with timestamped records.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Sentence-loading function

In [12]:
def gen_sentences(corpus, lemmatize, rm_stops, testing):
    """Yield one list of tokenized sentences per author file of a corpus.

    Each file is read, cleaned with the corpus-specific cleanup function,
    split into sentences and then into lowercased word tokens.  Punctuation
    characters are stripped from every token and empty tokens are dropped.

    Args:
        corpus: ``'phi5'`` (Latin) or ``'tlg'`` (Greek).
        lemmatize: If true, run each sentence through ``LemmaReplacer``.
        rm_stops: If true, drop tokens found in the stopword list of the
            corpus language.
        testing: If true, only the first five author files are processed.

    Yields:
        For every file that produced at least one non-empty sentence, a list
        of sentences, each sentence being a list of token strings.

    Raises:
        ValueError: If ``corpus`` is neither ``'phi5'`` nor ``'tlg'``.  As
            this is a generator, the error surfaces on the first iteration.
    """
    # TODO: Replace accented chars with unaccented equivalents
    if corpus == 'phi5':
        language = 'latin'
        filepaths = assemble_phi5_author_filepaths()
        text_cleaner = phi5_plaintext_cleanup
        stops_list = latin_stops
    elif corpus == 'tlg':
        language = 'greek'
        filepaths = assemble_tlg_author_filepaths()
        text_cleaner = tlg_plaintext_cleanup
        # Greek text must be filtered with the Greek list, not the Latin one.
        stops_list = greek_stops
    else:
        raise ValueError(
            "Unknown corpus {!r}; expected 'phi5' or 'tlg'.".format(corpus)
        )

    # A set gives O(1) membership tests in the per-token filter below.
    stops = frozenset(stops_list) if rm_stops else None
    lemmatizer = LemmaReplacer(language) if lemmatize else None
    jv_replacer = JVReplacer() if language == 'latin' else None
    if testing:
        filepaths = filepaths[:5]

    punkt = PunktLanguageVars()
    sent_tokenizer = TokenizeSentence(language)

    punctuation = [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']
    # Translation table that deletes every punctuation character in one pass.
    strip_punctuation = str.maketrans('', '', ''.join(punctuation))

    for filepath in filepaths:
        with open(filepath) as f:
            text_raw = f.read()
        text_cleaned = text_cleaner(text_raw)

        doc_sentences = []
        for raw_sentence in sent_tokenizer.tokenize_sentences(text_cleaned):
            tokens = (
                token.lower().translate(strip_punctuation)
                for token in punkt.word_tokenize(raw_sentence)
            )
            # Tokens that consisted solely of punctuation are now empty.
            sentence = [token for token in tokens if token]
            if stops:
                sentence = [token for token in sentence if token not in stops]

            if lemmatize:
                sentence = lemmatizer.lemmatize(sentence)
            # NOTE(review): j/v normalization runs after stop removal and
            # lemmatization here; the CLTK lemmatizer may expect normalized
            # input -- confirm before reordering.
            if sentence and jv_replacer is not None:
                sentence = [jv_replacer.replace(word) for word in sentence]
            if sentence:
                doc_sentences.append(sentence)

        if doc_sentences:
            yield doc_sentences

# Latin

## Train model, Latin, lemmatize & rm_stops

In [21]:
# Collect every sentence of the corpus before training.  Building the model
# from the first document alone would fix the vocabulary to that document's
# words; later `train()` calls cannot add new words to it.
# `lemmatize=True` matches this section's heading and the model's save path.
docs = gen_sentences('phi5', lemmatize=True, rm_stops=True, testing=False)
sentences = []
for counter, doc_sentences in enumerate(docs, start=1):
    if counter % 100 == 0:
        print(counter)  # progress: number of author files processed so far
    sentences.extend(doc_sentences)
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4, sg=1)


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250


KeyboardInterrupt: 

In [None]:
# Show how many entries the trained model's `vocab` mapping holds.
len(model.vocab)

In [6]:
# Save the Latin (lemmatized, stopwords removed) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/latin/w2v_w10_latin_lemmatizer_stops.model')
model.save(model_path)

## Train model, Latin, lemmatize

In [7]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('phi5', lemmatize=True, rm_stops=False, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

INFO:CLTK:Loading lemmata. This may take a minute.


In [8]:
# Save the Latin (lemmatized) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/latin/w2v_w10_latin_lemmatizer.model')
model.save(model_path)

## Train model, Latin, rm_stops

In [9]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('phi5', lemmatize=False, rm_stops=True, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

In [10]:
# Save the Latin (stopwords removed) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/latin/w2v_w10_latin_stops.model')
model.save(model_path)

## Train model, Latin

In [11]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('phi5', lemmatize=False, rm_stops=False, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

In [12]:
# Save the plain Latin model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/latin/w2v_w10_latin.model')
model.save(model_path)

# Greek

## Train model, Greek, lemmatize & rm_stops

In [19]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('tlg', lemmatize=True, rm_stops=True, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

INFO:CLTK:Loading lemmata. This may take a minute.


KeyboardInterrupt: 

In [None]:
# Save the Greek (lemmatized, stopwords removed) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/greek/w2v_w10_greek_lemmatizer_stops.model')
model.save(model_path)

## Train model, Greek, lemmatize

In [None]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('tlg', lemmatize=True, rm_stops=False, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

In [None]:
# Save the Greek (lemmatized) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/greek/w2v_w10_greek_lemmatizer.model')
model.save(model_path)

## Train model, Greek, rm_stops

In [None]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('tlg', lemmatize=False, rm_stops=True, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

In [None]:
# Save the Greek (stopwords removed) model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/greek/w2v_w10_greek_stops.model')
model.save(model_path)

## Train model, Greek

In [None]:
# `gen_sentences` yields one list of sentences per author file, so flatten
# the documents into the flat sequence of sentences that Word2Vec consumes.
sentence_generator = gen_sentences('tlg', lemmatize=False, rm_stops=False, testing=False)
sentences = [sentence for doc_sentences in sentence_generator for sentence in doc_sentences]
model = Word2Vec(sentences=sentences, size=100, window=10, min_count=5, workers=4)

In [None]:
# Save the plain Greek model under the home directory.
# NOTE(review): gensim documents init_sims(replace=True) as keeping only the
# normalized vectors, after which training cannot resume -- confirm for the
# installed version.
model.init_sims(replace=True)
model_path = os.path.expanduser('~/word2vec_tests/greek/w2v_w10_greek.model')
model.save(model_path)