In [1]:
import pickle
import os
import nltk

from gensim.models import Word2Vec
from src.evaluator.evaluator import Word_Associations_Evaluator
from tqdm import tqdm

In [2]:
# Load the pickled sentence corpus from the `data` folder under the current
# working directory. `wd` and `data_dir` are kept as notebook-level names
# because later cells build paths from `wd`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a dump that comes from a trusted source.
wd = os.getcwd()
data_dir = os.path.join(wd, 'data')
corpus_path = os.path.join(data_dir, 'sentences_dump.pkl')
with open(corpus_path, 'rb') as corpus_file:
    sentences = pickle.load(corpus_file)

In [3]:
# load country list
# Every non-blank line of countries.txt becomes one lower-cased entry.
country_path = os.path.join(wd, 'src', 'wikitravel', 'countries.txt')
with open(country_path, 'r') as f:
    # Strip each line and drop blanks: a trailing newline (or a stray empty
    # line) would otherwise add an empty-string "country" to the list.
    countries = [name for name in (line.strip().lower() for line in f) if name]

In [4]:
# Train models with several hyperparameters and evaluate their performance.
#
# Grid: training algorithm (skip-gram / CBOW) x sampling method (hierarchical
# softmax / negative sampling) x context window size. For every combination a
# Word2Vec model is trained on `sentences`; the 3 most similar words of each
# country are scored with the evaluator and the mean of the positive scores
# is recorded.
#
# Result: performances[training_algo][sampling_method] is a list of
# (mean_valid_score, window_size) tuples, one per entry of `window_sizes`.
# A mean of -1.0 marks a model for which no country produced a positive score.
evaluator = Word_Associations_Evaluator()
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.PorterStemmer()
performances = {}
window_sizes = [2, 5, 10, 15, 20, 25, 30]
sampling_methods = ['hierarchical', 'negative']
training_algos = ['skip-gram', 'cbow']

# The stemmed + lemmatised form of a country does not depend on the model, so
# compute it once instead of once per trained model. Doing it up front also
# surfaces a broken NLTK setup before any (slow) training starts.
# NOTE(review): presumably mirrors the preprocessing applied to `sentences` -
# confirm against the corpus-building code.
country_tokens = {
    country: lemmatizer.lemmatize(stemmer.stem(country)) for country in countries
}

for training_algo in training_algos:
    sg = 1 if training_algo == 'skip-gram' else 0
    performances[training_algo] = {}

    for sampling_method in sampling_methods:
        # Either hierarchical softmax (hs=1, no negative samples) or
        # negative sampling with 5 noise words (hs=0).
        if sampling_method == 'hierarchical':
            hs, negative = 1, 0
        else:
            hs, negative = 0, 5

        perf = []
        progress_label = '{}/{}'.format(training_algo, sampling_method)
        for window_size in tqdm(window_sizes, desc=progress_label):
            results_all = {}
            valid_scores = []
            model = Word2Vec(sentences, size=100, window=window_size, sg=sg, hs=hs,
                             negative=negative, min_count=1, workers=4)

            for country in countries:
                country_sl = country_tokens[country]
                try:
                    n_most_similar = [x[0] for x in model.wv.most_similar(country_sl, topn=3)]
                    score = evaluator.simple_evaluate(country_sl, n_most_similar)
                except Exception:
                    # Deliberate best effort: a country that cannot be looked
                    # up or evaluated is recorded with the sentinel -1 and
                    # left out of the mean.
                    score = -1

                # NOTE(review): a score of exactly 0 is also excluded from the
                # mean - confirm this is intended.
                if score > 0:
                    valid_scores.append(score)

                results_all[country] = score

            # Guard against "no valid score at all": dividing by zero here
            # would abort the whole grid search after a long training run.
            if valid_scores:
                mean_score = sum(valid_scores) / len(valid_scores)
            else:
                mean_score = -1.0
            perf.append((mean_score, window_size))

        performances[training_algo][sampling_method] = perf

100%|██████████| 7/7 [17:37<00:00, 159.93s/it]
100%|██████████| 7/7 [17:56<00:00, 167.36s/it]
100%|██████████| 7/7 [11:09<00:00, 95.32s/it]
100%|██████████| 7/7 [09:18<00:00, 80.15s/it]


In [31]:
# learn optimal model
# Pick the (training algorithm, sampling method, window size) combination with
# the highest mean score in `performances`, then retrain a model with it.
best_score = 0
best_window = None
best_algo = None
best_sampling = None
for training_algo, perf_by_sampling in performances.items():
    for sampling_algo, perf in perf_by_sampling.items():
        if not perf:
            # Nothing was evaluated for this combination; max() would raise.
            continue
        best_candidate = max(perf, key=lambda x: x[0])
        if best_candidate[0] > best_score:
            best_score = best_candidate[0]
            best_window = best_candidate[1]
            best_algo = training_algo
            best_sampling = sampling_algo

# Without this check an all-non-positive grid would fail further down with an
# opaque NameError on `best_sampling`.
if best_algo is None:
    raise ValueError('no configuration achieved a positive score; '
                     'cannot select an optimal model')

# Translate the winning labels back into Word2Vec keyword arguments.
if best_sampling == 'hierarchical':
    hs, negative = 1, 0
else:
    hs, negative = 0, 5
sg = 1 if best_algo == 'skip-gram' else 0

# Release the last grid-search model before training the final one. Guarded so
# the cell can be re-run, or run on a kernel where `model` does not exist.
try:
    del model
except NameError:
    pass
optimal_model = Word2Vec(sentences, size=100, window=best_window,
                         sg=sg, hs=hs, negative=negative, min_count=1, workers=4)

In [32]:
# Write the word vectors of `optimal_model` to <working dir>/model.bin in the
# binary word2vec format.
model_path = os.path.join(wd, 'model.bin')
optimal_model.wv.save_word2vec_format(model_path, binary=True)