# About

The following is a qualitative test of results returned by word2vec models generated by means of the CLTK and gensim's implementation of the algorithm. There are four models for Latin and Greek each, with the following parameters:

* lemmatized and stopwords removed
* lemmatized 
* stopwords removed
* plaintext (ie, not lemmatized nor stopwords removed)

All the models were build with `Word2Vec()` arguments `size=100`, `window=5`, and `min_count=5`. The code made to generate these is in notebook </word2vec_build_model_phi5_tlg_test>. It was run on a remote server [with this setup code](https://github.com/kylepjohnson/cltk_remote_setup).

# Latin

In [77]:
from collections import defaultdict
from gensim.models import Word2Vec
import os
from termcolor import colored

In [4]:
models_paths = {'lemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer_stops.model',
                'lemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_latin_lemmatizer.model',
                'unlemmatized_no_stops': '~/cltk_data/user_data/word2vec/w2v_latin_stops.model',
                'unlemmatized_yes_stops': '~/cltk_data/user_data/word2vec/w2v_latin.model'}

save_dir = '~/cltk_data/user_data/word2vec/'

In [80]:
def get_sims(model_name, word_1, word_2=None):
    word = word_1
    if word_2 and model_name in ['lemmatized_no_stops', 'lemmatized_yes_stops']:
        word = word_2
    return word_1, model.most_similar(word)

sims_dict = {}
for model_name, model_path in models_paths.items():
    
    # setup paths
    model_path = os.path.expanduser(model_path)
    save_dir = os.path.expanduser(save_dir)
    save_file = os.path.join(save_dir, model_name)
    
    model = Word2Vec.load(model_path)
    #print(colored(model_name, 'blue'))
    #print(model)

    sims_list = []
    words = ['amicitia', ('carus', 'carus1'), 'dignitas', 'amo', 'amor', ('industria', 'industria1'), 
             'facio', 'laus', 'scribo', 'cano', 'pudor']
    for word in words:
        if type(word) is str:
            sims = get_sims(model_name, word)
        elif type(word) is tuple:
            sims = get_sims(model_name, word[0], word[1])
        sims_list.append(sims)
    sims_dict[model_name] = sims_list

    '''
    # rm file if already exists
    if os.path.isfile(save_file):
        os.remove(save_file)
    
    # write contents
    with open(save_file, 'w') as file_open:
        file_open.write('')
    '''

#! start here
# make a dict like:
# {'amicitia': {'lemmatized_no_stops': [('benuolentia': 0.8302, ...), ...]}
headword_dict = {}
for model_type in sims_dict:
    pairs = sims_dict[model_type]
    sims_dict = defaultdict(list)
    for pair in pairs:
        sims_vals = (model_type, pair[1])
        sims_dict[pair[0]].append(sims_vals)
    headword_dict[] = (model_type, pair[1])

print(headword_dict)

{}
