# About

The following is a qualitative test of the results returned by word2vec models built with the CLTK and gensim's implementation of the algorithm. There are four models each for Latin and Greek, one for each of the following preprocessing variants:

* lemmatized and stopwords removed
* lemmatized 
* stopwords removed
* plaintext (i.e., neither lemmatized nor with stopwords removed)

All the models were built with the `Word2Vec()` arguments `size=100`, `window=5`, and `min_count=5`. The code used to generate them is in the notebook </word2vec_build_model_phi5_tlg_test>. It was run on a remote server [with this setup code](https://github.com/kylepjohnson/cltk_remote_setup).

# Latin

In [1]:
from collections import defaultdict
from gensim.models import Word2Vec
import os
from termcolor import colored

In [2]:
# Saved Latin word2vec models, keyed by the preprocessing variant each one
# represents. The paths are `~`-relative and are passed through
# os.path.expanduser() before the models are loaded.
# NOTE(review): the `*_no_stops` keys point at the `*_stops.model` files --
# presumably the `_stops` filename suffix marks models built with the
# stopword filter applied; confirm against the model-building notebook.
models_paths = {'lemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer_stops.model',
                'lemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer.model',
                'unlemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_latin_stops.model',
                'unlemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_latin.model'}

# Directory handed to write_model_vectors() as the destination for its text
# dumps (also `~`-relative).
save_dir = '~/cltk_data/user_data/word2vec/'

In [3]:
def get_sims(model_name, word_1, word_2=None):
    """Query the currently loaded model for the neighbours of a headword.

    The model is not passed in: this function reads the notebook-level name
    ``model``, which must be bound to a loaded model before the call.

    Args:
        model_name: Key identifying the model variant being queried.
        word_1: Headword; always returned as the first element of the result.
        word_2: Optional alternative spelling. When given, it is the form
            looked up for the two lemmatized variants
            (``'lemmatized_no_stops'`` and ``'lemmatized_yes_stops'``).

    Returns:
        A ``(word_1, model.most_similar(query))`` tuple, where ``query`` is
        ``word_2`` for lemmatized variants (if supplied) and ``word_1``
        otherwise.
    """
    wants_lemma_form = bool(word_2) and model_name in ('lemmatized_no_stops', 'lemmatized_yes_stops')
    query = word_2 if wants_lemma_form else word_1
    neighbours = model.most_similar(query)
    return word_1, neighbours

def print_sims(word_all_models_list):
    """Print the collected neighbour lists, grouped by headword.

    Args:
        word_all_models_list: Mapping of headword to a list of
            ``(model_name, sims)`` entries, where ``sims`` is an iterable of
            items to print one per line.

    Output per headword: the headword in red, then for each entry the model
    name in blue followed by its items, and finally one empty line.
    """
    for term, per_model_results in word_all_models_list.items():
        print(colored(term, 'red'))
        for result in per_model_results:
            # result[0] is the model name, result[1] its neighbour list
            print(colored(result[0], 'blue'))
            for neighbour in result[1]:
                print(neighbour)
        print()

In [4]:
# Query every Latin model with the same probe words and group the answers by
# headword: {headword: [(model_name, most_similar output), ...]}.
word_all_models_list = defaultdict(list)

# Probe words. A tuple is (plain form, form used by the lemmatized models);
# get_sims() looks up the second element when querying a lemmatized model.
words = ['amicitia', ('carus', 'carus1'), 'dignitas', 'amo', 'amor', ('industria', 'industria1'),
         'facio', 'laus', 'scribo', 'cano', 'pudor']

# Not used by this cell itself; it rebinds the notebook-level `save_dir` to
# its expanded form, once instead of once per model.
save_dir = os.path.expanduser(save_dir)

for model_name, model_path in models_paths.items():
    model_path = os.path.expanduser(model_path)

    # get_sims() reads the notebook-level name `model`, so it must be rebound
    # here before any lookup for this variant.
    model = Word2Vec.load(model_path)

    for word in words:
        if isinstance(word, str):
            headword, sims = get_sims(model_name, word)
        elif isinstance(word, tuple):
            headword, sims = get_sims(model_name, word[0], word[1])
        else:
            # Fail loudly rather than silently re-appending the previous
            # iteration's headword/sims under the wrong entry.
            raise TypeError('probe word must be a str or a tuple, got %r' % (word,))
        word_all_models_list[headword].append((model_name, sims))

In [5]:
# Show, for each Latin headword, the neighbour list returned by every model.
print_sims(word_all_models_list)

[31mlaus[0m
[34munlemmatized_yes_stops[0m
('dignitas', 0.7702658176422119)
('commendatio', 0.7273139357566833)
('uirtus', 0.7205942869186401)
('prudentia', 0.7164344787597656)
('gloria', 0.7120715379714966)
('opinio', 0.709394097328186)
('auctoritas', 0.7079125642776489)
('perturbatio', 0.7022783160209656)
('disciplina', 0.7003540992736816)
('aequabilitas', 0.6998453140258789)
[34munlemmatized_no_stops[0m
('grauitas', 0.7767825126647949)
('exspectatio', 0.7622722387313843)
('amplitudo', 0.7578041553497314)
('dignitas', 0.7558329105377197)
('ingeni', 0.7504878044128418)
('memoria', 0.7502184510231018)
('commendatio', 0.7450073957443237)
('perturbatio', 0.7413634061813354)
('contentio', 0.737472653388977)
('uoluntas', 0.7345441579818726)
[34mlemmatized_yes_stops[0m
('glorior', 0.7771285772323608)
('uirtus', 0.738848090171814)
('honor', 0.7327480316162109)
('industria1', 0.7065572738647461)
('dignitas', 0.7001360654830933)
('gloria', 0.6975167989730835)
('honesto', 0.6631809473037

In [52]:
def write_model_vectors(language, models_paths, save_dir):
    """Dump the neighbours of every vocabulary word of each model to a text file.

    For each ``name -> path`` entry in ``models_paths`` the model is loaded
    and a file ``<language>_<name>.txt`` is written into ``save_dir``. Every
    vocabulary word contributes one entry of the form
    ``word: <str() of model.most_similar(word)>``; entries are separated by a
    blank line and there is no trailing newline.

    Entries are written as they are computed, so an interrupted run leaves a
    partial file behind instead of discarding all finished work, and the full
    output never has to be held in memory.

    Args:
        language: Prefix for the output file names (e.g. ``'latin'``).
        models_paths: Mapping of model variant name to saved-model path;
            ``~`` in a path is expanded.
        save_dir: Output directory; ``~`` is expanded.

    Prints the vocabulary size, a progress line every 10000 words, and the
    path of each finished file.
    """
    save_dir = os.path.expanduser(save_dir)
    for model_name, model_path in models_paths.items():

        # setup file paths
        model_path = os.path.expanduser(model_path)
        save_file = os.path.join(save_dir, language + '_' + model_name + '.txt')

        model = Word2Vec.load(model_path)
        vocab = model.vocab

        vocab_len = len(vocab)
        print(vocab_len)

        # Explicit UTF-8: the vocabulary holds non-ASCII text (e.g. polytonic
        # Greek), which a platform-default encoding may be unable to encode.
        with open(save_file, 'w', encoding='utf-8') as file_open:
            for counter, word in enumerate(vocab, start=1):
                if counter % 10000 == 0:
                    print(counter, '/', vocab_len)

                # separator goes before every entry but the first, so the
                # file does not end with a dangling blank line
                if counter > 1:
                    file_open.write('\n\n')

                pairs = model.most_similar(word)
                file_open.write(word + ': ' + str(pairs))

        print('Wrote file at:', save_file)


In [53]:
# Dump the neighbour list of every vocabulary word of each Latin model to
# `latin_<model name>.txt` under save_dir.
# NOTE(review): the recorded output below ends in KeyboardInterrupt, so this
# run appears to have been stopped before it finished.
write_model_vectors('latin', models_paths, save_dir)

87835
striges: [('spumas', 0.8318402767181396), ('lapsumque', 0.8267949819564819), ('bucula', 0.8252154588699341), ('aetnaea', 0.8249616622924805), ('adleuat', 0.820337176322937), ('spumantis', 0.817375898361206), ('anhelo', 0.816428542137146), ('tergoque', 0.8148610591888428), ('pectine', 0.813423752784729), ('telas', 0.8128271102905273)]

fultus: [('hyacintho', 0.7800098657608032), ('examen', 0.7194118499755859), ('bipennibus', 0.7076067924499512), ('uersumque', 0.7016180753707886), ('trahebatur', 0.6994680166244507), ('tegitur', 0.6986311078071594), ('instaurat', 0.6970216035842896), ('cyllenius', 0.695743203163147), ('excuteret', 0.6950575113296509), ('nascebantur', 0.6947343945503235)]

consectentur: [('declinando', 0.725358247756958), ('toleraturos', 0.7214525938034058), ('turbidis', 0.714789867401123), ('sceleratis', 0.699560284614563), ('impares', 0.6862906217575073), ('uariantis', 0.6860278844833374), ('cientur', 0.6838405728340149), ('ignauae', 0.6830829977989197), ('terruere

KeyboardInterrupt: 

# Greek

In [55]:
# Saved Greek word2vec models, keyed by the preprocessing variant each one
# represents; this rebinds the notebook-level `models_paths` used by the
# cells that follow. The paths are `~`-relative and are passed through
# os.path.expanduser() before the models are loaded.
# NOTE(review): the `*_no_stops` keys point at the `*_stops.model` files --
# presumably the `_stops` filename suffix marks models built with the
# stopword filter applied; confirm against the model-building notebook.
models_paths = {'lemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_greek_lemmatizer_stops.model',
                'lemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_greek_lemmatizer.model',
                'unlemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_greek_stops.model',
                'unlemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_greek.model'}

# Directory handed to write_model_vectors() as the destination for its text
# dumps (also `~`-relative).
save_dir = '~/cltk_data/user_data/word2vec/'

In [21]:
# Query every Greek model with the same probe words and group the answers by
# headword: {headword: [(model_name, most_similar output), ...]}.
word_all_models_list = defaultdict(list)

# Probe words. Plain strings are looked up as-is in every model; a tuple
# (plain form, form used by the lemmatized models) is also accepted.
words = ['ἄγγελος', 'εἶπον', 'λόγος', 'ἵππος', 'κύων', 'ὄνος', 'οἶδα']

# Not used by this cell itself; it rebinds the notebook-level `save_dir` to
# its expanded form, once instead of once per model.
save_dir = os.path.expanduser(save_dir)

for model_name, model_path in models_paths.items():
    model_path = os.path.expanduser(model_path)

    # get_sims() reads the notebook-level name `model`, so it must be rebound
    # here before any lookup for this variant.
    model = Word2Vec.load(model_path)

    for word in words:
        if isinstance(word, str):
            headword, sims = get_sims(model_name, word)
        elif isinstance(word, tuple):
            headword, sims = get_sims(model_name, word[0], word[1])
        else:
            # Fail loudly rather than silently re-appending the previous
            # iteration's headword/sims under the wrong entry.
            raise TypeError('probe word must be a str or a tuple, got %r' % (word,))
        word_all_models_list[headword].append((model_name, sims))

In [22]:
# Show, for each Greek headword, the neighbour list returned by every model.
print_sims(word_all_models_list)

[31mοἶδα[0m
[34munlemmatized_yes_stops[0m
('οἶσθα', 0.7605229616165161)
('οἶδας', 0.7389824390411377)
('ἔγνων', 0.726704478263855)
('ἀγνοῶ', 0.7231594920158386)
('ἐπίσταμαι', 0.719325065612793)
('ἀγνοεῖς', 0.7121965289115906)
('εἶπον', 0.6860429048538208)
('πυροῦμαι', 0.683976411819458)
('ἔχω', 0.6616619825363159)
('ᾔδεις', 0.658226728439331)
[34mlemmatized_yes_stops[0m
('λέγω', 0.5777326822280884)
('ὁράω', 0.5736971497535706)
('φησὶν', 0.5720843076705933)
('ἀγνοέω', 0.567689836025238)
('δείκνυμι', 0.5646995306015015)
('εἶπον', 0.5587722063064575)
('εἰδὼς', 0.5557109713554382)
('εἰδότες', 0.552374005317688)
('ἐκεῖνος', 0.5251285433769226)
('δύναμαι', 0.5176582336425781)
[34mlemmatized_no_stops[0m
('λέγω', 0.5961875915527344)
('ἀγνοέω', 0.5912941694259644)
('εἰδὼς', 0.5896536111831665)
('εἰδότες', 0.5775903463363647)
('φησὶν', 0.5709648132324219)
('δείκνυμι', 0.5685352087020874)
('ὁράω', 0.5333505868911743)
('εἶπον', 0.5292441844940186)
('ἐκεῖνος', 0.5261932611465454)
('ἀκριβό

In [None]:
# Dump the neighbour list of every vocabulary word of each Greek model to
# `greek_<model name>.txt` under save_dir.
# NOTE(review): the cell counter reads `In [None]` and the recorded output is
# cut off -- presumably this run never completed; confirm before relying on
# the generated files.
write_model_vectors('greek', models_paths, save_dir)

407239
διαχειρίζω: [('ταμιεύω', 0.8252047300338745), ('δαμάσω', 0.8073007464408875), ('ἰχανῶ', 0.8062002062797546), ('ὑβρίζω', 0.8042540550231934), ('προΐξω', 0.8021806478500366), ('ἐνδιατρίβω', 0.8002261519432068), ('ἔταον', 0.8000962734222412), ('νοήμη', 0.7943591475486755), ('διοικῶ', 0.7935482859611511), ('διαβάλλω', 0.7934131026268005)]

ἐνεγέννησεν: [('ἀποφέρουσα', 0.6958976984024048), ('ἐπαινετὴ', 0.6571913957595825), ('ἐπιστημονικὴ', 0.6513071060180664), ('ἀπηλλαγμένη', 0.6497482061386108), ('ἀντιποιουμένη', 0.6440267562866211), ('ἀναπόβλητος', 0.6410380601882935), ('ἀργία', 0.6388857364654541), ('ἀναγκαστικὴ', 0.6372959613800049), ('ἔφεσις', 0.6267553567886353), ('ὁδηγία', 0.6204506158828735)]

παλινδρομήσαντας: [('καθυβρίσαντες', 0.7391155958175659), ('παρεδήλουν', 0.7340777516365051), ('μετακαλέσασθαι', 0.7245435118675232), ('ἡγεμονεύοντα', 0.7109376192092896), ('εἰσκαλεσάμενος', 0.7104780673980713), ('ἐπικαλεσαμένης', 0.7087868452072144), ('ἀνέμενον', 0.7076782584190369), (