In [1]:
import pickle
import os
import nltk

from gensim.models import Word2Vec
from src.evaluator.evaluator import Word_Associations_Evaluator
from tqdm import tqdm

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


In [2]:
# Load the pickled sentence corpus from <cwd>/data.
# NOTE: pickle.load can execute arbitrary code -- only use a dump you trust.
wd = os.getcwd()
data_dir = os.path.join(wd, 'data')
corpus_path = os.path.join(data_dir, 'sentences_dump.pkl')
with open(corpus_path, 'rb') as corpus_file:
    sentences = pickle.load(corpus_file)

In [3]:
# Load the country list: one lower-cased name per line of countries.txt.
country_path = os.path.join(wd, 'src', 'wikitravel', 'countries.txt')
with open(country_path, 'r') as f:
    # Strip each line and skip blanks so a trailing newline (or an empty
    # line in the file) does not yield an empty-string "country".
    countries = [line.strip().lower() for line in f if line.strip()]

In [4]:
# Train a Word2Vec model for every (vector_size, window_size) combination and
# record the mean evaluator score over the countries that scored above zero.
evaluator = Word_Associations_Evaluator()
lemmatizer=nltk.WordNetLemmatizer()
stemmer=nltk.PorterStemmer()
performances = []  # list of (vector_size, window_size, mean_valid_score)
window_sizes = [2, 5, 10, 15, 20, 25, 30]
vector_sizes = [100, 200, 300, 500]
sampling_method = 'hierarchical'
training_algo = 'skip-gram'

# Translate the readable settings above into the Word2Vec flags.
sg = 0
if training_algo == 'skip-gram':
    sg = 1
hs = 0
negative = 5
if sampling_method == 'hierarchical':
    # Hierarchical softmax is switched on and negative sampling switched off.
    hs = 1
    negative = 0


for vector_size in vector_sizes:
    for window_size in tqdm(window_sizes):
        results_all = {}
        valid_scores = []
        model = Word2Vec(sentences, size=vector_size, window=window_size, sg=sg, hs=hs, 
                         negative=negative, min_count=1, workers=4)

        for country in countries:
            try:
                # NOTE(review): the query token is stemmed and then lemmatized;
                # presumably this mirrors how the corpus was preprocessed -- confirm.
                country_sl = lemmatizer.lemmatize(stemmer.stem(country))
                n_most_similar = [x[0] for x in model.wv.most_similar(country_sl, topn=3)]
                score = evaluator.simple_evaluate(country_sl, n_most_similar)
            except Exception:
                # Best-effort: any failure for this country (e.g. the token is
                # not in the model vocabulary) is recorded as -1 and skipped.
                score = -1

            # Only strictly positive scores contribute to the mean.
            if score > 0:
                valid_scores.append(score)

            results_all[country] = score

        valid_len = len(valid_scores)
        if valid_len:
            mean_score = sum(valid_scores) / valid_len
        else:
            # No country scored for this configuration: avoid ZeroDivisionError
            # and use the same -1 sentinel so it can never be picked as best.
            mean_score = -1.0
        performances.append((vector_size, window_size, mean_score))

100%|██████████| 7/7 [15:47<00:00, 143.65s/it]
100%|██████████| 7/7 [18:45<00:00, 173.35s/it]
100%|██████████| 7/7 [24:19<00:00, 224.53s/it]
100%|██████████| 7/7 [34:29<00:00, 322.37s/it]


In [5]:
# Retrain with the best-scoring hyperparameters found by the grid search.
# Each entry of `performances` is (vector_size, window_size, score).
best_vector_size, best_window, best_score = max(performances, key=lambda x: x[2])

# Drop the last grid-search model before training the final one; tolerate the
# name being absent so this cell can be re-run on its own.
try:
    del model
except NameError:
    pass
optimal_model = Word2Vec(sentences, size=best_vector_size, window=best_window, 
                         sg=sg, hs=hs, negative=negative, min_count=1, workers=4)
# Printed order is window first, then vector size.
print('model params: {0}, {1}; score: {2}'.format(best_window, best_vector_size, best_score))

model params: 10, 100; score: 0.7855072463768116


In [6]:
# Write the trained word vectors to <wd>/model.bin in binary word2vec format.
model_path = os.path.join(wd, 'model.bin')
optimal_model.wv.save_word2vec_format(model_path, binary=True)