# Word2Vec models comparison

We compare the different models of word2vec against different intrinsic word embeddings tasks.

**BLUNDER: all 200 and 300 D models are not trained on leema text**

### Import and load datasets

In [1]:
# imports
!pip install ray[tune]
import ray
from ray.tune.utils import pin_in_object_store, get_pinned_object
ray.init(ignore_reinit_error=True)
import pickle
import time
from numpy.linalg import norm
from scipy.spatial.distance import cosine, cdist
import base64
!pip install telepot
import telepot

!pip install nltk
import nltk
nltk.download('wordnet')
import glob
!pip install tqdm
from tqdm import tqdm
import pandas as pd
!pip install gensim
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
2020-09-01 20:06:58,639	INFO resource_spec.py:231 -- Starting Ray with 4.05 GiB memory available for workers and up to 2.03 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-09-01 20:07:02,863	INFO services.py:1193 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m
Traceback (most recent call last):
  File "/opt/venv/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 961, in <module>
    dashboard.run()
  File "/opt/venv/lib/python3.7/site-packages/ray/dashboard/dashboard.py", line 576, in run
    aiohttp.web.run_app(self.app, host=self.host, port=self.port)
  File "/opt/venv/lib/python3.7/site-packages/aiohttp/web.py", line 433, in run_app
    reuse_port=reuse_port))
  File "/usr/local/lib/python3.7/asyncio/base_events.py", line 584, in run_until_complete
    return future.result()
  File "/opt/venv/

In [2]:
## Load telepot
def send_telegram_message(message):
    token = '1361671158:AAF9jfW_fT0aF0zHwHtpOaUEB9CPYhmyew8'
    TelegramBot = telepot.Bot(token)
    TelegramBot.sendMessage(934022573, str(message))

In [3]:
# send_telegram_message("test from deepnotes")

In [4]:
with open ("../data/all_datasets.pickle", 'rb') as f:
    all_datasets = pickle.load(f)

In [5]:
# all([len(set(x))==4 for x in all_datasets['analogy_datasets']['bats_analogy']])
# all_datasets['analogy_datasets']['bats_analogy'][:10]
# for x , y in all_datasets['relatedness_datasets'].items():
#     print(x, len(y))

### Load word2vec models

In [6]:
def nnorm(matrix):
    "norm each vector in the matrix"
    return (matrix.T/norm(matrix, axis=1)).T
    
# model = Word2Vec.load("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=5_sg=0/word2vec_wikiEn20171001_millionSentences_mc=10_iter=5_size=200_window=5_sg=0")
# model.wv.vectors = nnorm(model.wv.vectors) 
# model.trainables.syn1neg = nnorm(model.trainables.syn1neg)


### Create Word2vec similarity computing method

In [7]:
def word2vec_get_index_by_word(word2vec_model, word):
    """Return the index of the word in the model
    """
    return word2vec_model.wv.index2word.index(word)

def word2vec_get_word_by_index(word2vec_model, index):
    """Return the word by the provided index
    """
    return word2vec_model.wv.index2word[index]

def word2vec_get_model_matrix_by_method(word2vec_model, method):
    """Return the source and compare matrix based on model and method selection
    """
    input_weights = word2vec_model.wv.vectors
    output_weights = word2vec_model.trainables.syn1neg
    weights1, weights2 = None, None
    if method=="IN-IN":
        weights1, weights2 = input_weights, input_weights 
    elif method=="IN-OUT":
        weights1, weights2 = input_weights, output_weights
    elif method=="OUT-IN":
        weights1, weights2 = output_weights, input_weights
    elif method=="OUT-OUT":
        weights1, weights2 = output_weights, output_weights
    return weights1, weights2

def word2vec_find_top_similar_vectors_bw_matrix(vector, compare_matrix, source_word_index, index2word, top_n=5, no_self_similarity=True):
    """Find top n similar vectors in the compare matrix
    """
    score = np.dot(vector, compare_matrix)[0]
    if no_self_similarity:
        score[source_word_index] = -1 # negate self-similarity
    top_n_similar_words = np.argpartition(-score, top_n)[:top_n]
    sorted_result = sorted([(index2word[index], score[index]) for index in top_n_similar_words], 
                key=lambda x: x[1], 
                reverse=True)
    return sorted_result

def word2vec_find_top_similar_words(word2vec_model, source_word, method='IN-IN', top_n=5, no_self_similarity=True):
    """
    Provided a word, find the top_n most similar from the model following the method
    """
    score = []
    source_word_index = word2vec_get_index_by_word(word2vec_model, source_word)
    source_matrix, compare_matrix = word2vec_get_model_matrix_by_method(word2vec_model, method)
    sorted_result = word2vec_find_top_similar_vectors_bw_matrix(
            vector=source_matrix[source_word_index].reshape(1, -1),
            compare_matrix=compare_matrix.T, 
            source_word_index = source_word_index,
            index2word=word2vec_model.wv.index2word,
            top_n=top_n, 
            no_self_similarity=no_self_similarity)
    return sorted_result

def word2vec_find_similarity(word2vec_model, source_word, target_word, method="IN-IN"):
    """Return the cosine similarity between two words based on the suggested method
    """
    source_word_index = word2vec_get_index_by_word(word2vec_model, source_word)
    target_word_index = word2vec_get_index_by_word(word2vec_model, target_word)
    source_matrix, compare_matrix = word2vec_get_model_matrix_by_method(word2vec_model, method)
    score = cosine_similarity(source_matrix[source_word_index].reshape(1, -1), 
                              compare_matrix[target_word_index].reshape(1, -1))[0]
    return score

# word2vec_find_similarity(model, "car", "truck", "IN-OUT")
# word2vec_find_top_similar_words(model, "car", "IN-IN")
# word2vec_find_top_similar_words(model, "car", "IN-OUT")

# lemmatizer - noun lemma -- https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
def lemma(word): return nltk.stem.WordNetLemmatizer().lemmatize(word)

# preprocss the word - lowercase and lemma
def pre(word): return lemma(word.lower())

In [8]:
# word2vec_find_top_similar_words(get_pinned_object(model), 'car', method='IN-IN', top_n=10, no_self_similarity=True)

## Test functions

In [12]:
glob_w2v_models = ['../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=1',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=50_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=50_sg=1',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=5_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=5_sg=1',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=50_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=50_sg=1',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=300_window=5_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=300_window=5_sg=1',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=300_window=50_sg=0',
                    '../../../embeddings_lemma/word2vec_mc=10_iter=5_size=300_window=50_sg=1']

# @ray.remote
def compare_word2vec_model_with_relatedness_dataset(model, model_name, dataset_name, dataset):
    missing_words = 0
    score_table = []
    for row in dataset.to_dict(orient="records"):
        methods = ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]
        for method in methods:
            try:
                sim_score = word2vec_find_similarity(model, pre(row['word_1']), pre(row['word_2']), method)[0]
            except:
                sim_score = None
                missing_words += 1
            row[f"{model_name}_{method}"] = sim_score
        score_table.append(row)
    score_table = pd.DataFrame.from_dict(score_table)
    score_table = score_table.dropna().corr("pearson")[['similarity_score']].tail(4)
    score_table.columns = [dataset_name]
    missing_words = missing_words/len(methods)
    return score_table, dataset_name, missing_words

def calculate_tp_score(y_pred, y_true):
    y_pred, y_true = set(y_pred), set(y_true)
    tp = y_pred & y_true
    # fp = y_pred - tp
    # tn = 0.001
    # fn = y_true - y_pred
    # precision = len(tp) / (len(tp) + len(fp))
    # recall = len(tp) / (len(tp) + len(fn))
    # try:
    #     f1_score = 2*(precision * recall)/(precision + recall)
    # except ZeroDivisionError:
    #     f1_score = 0
    return len(tp)/len(y_true)

def word2vec_check_all_word_in_model(words, index2word):
    return all([word in index2word for word in words])

@ray.remote
def word2vec_test_association_dataset(model, model_name, dataset_name, dataset):
    # word2vec_model = get_pinned_object(model)
    word2vec_model = model
    possible_top_n = [10]
    methods = ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]
    max_possible_top_n = max(possible_top_n)
    all_cue_test_result= []
    for cue_name, cue_data in dataset.groupby(['cue']):
        expected_responses = cue_data['response'].values
        if not word2vec_check_all_word_in_model([cue_name] + list(expected_responses), word2vec_model.wv.index2word):
            continue
        cue_test_result = {'cue': cue_name, 'expected_responses': expected_responses}
        for method in methods:
            model_response = word2vec_find_top_similar_words(word2vec_model, cue_name, method, top_n=max_possible_top_n)
            for top_n in possible_top_n:
                subset_model_response = [x[0] for x in model_response[:top_n]]
                tp_score = calculate_tp_score(subset_model_response, expected_responses)
                model_method_topn_name = f'{model_name}_{method}_{top_n}'
                cue_test_result[f'{model_method_topn_name}_model_response'] = subset_model_response
                cue_test_result[f'{model_method_topn_name}_tp_score'] = tp_score
        all_cue_test_result.append(cue_test_result)
    return dataset_name, all_cue_test_result

## 
# ANALGOY
##
def cos3mul(compare_matrix, word_index, a, a_s, b,  b_s, top_n, e=0.0000001):
    """From: https://www.aclweb.org/anthology/W14-1618.pdf; Page 175
    """
    c = np.stack([b, a_s, a], axis=1)
    all_scores = np.dot(compare_matrix, c)
    all_scores = (all_scores + 1)/2 # normalize bw 0 to 1
    all_scores[:, 2] += e
    all_scores = ((all_scores[:, 0] * all_scores[:, 1]) / all_scores[:, 2])
    top_n_similar_words = np.argpartition(-all_scores, top_n)[:top_n]
    sorted_result = sorted([(word_index[index], all_scores[index]) for index in top_n_similar_words], 
                key=lambda x: x[1], 
                reverse=True)
    sorted_result = [x[0] for x in sorted_result]
    return sorted_result, 1 if b_s in sorted_result else 0

def cos3add(compare_matrix, a, a_s, b, b_s, top_n, index2word):
    """Find the most similar vector with (b-a+a_)
    """
    b_ = b.reshape(1, -1)-a.reshape(1, -1)+a_s.reshape(1, -1)
    res = word2vec_find_top_similar_vectors_bw_matrix(
            b_, compare_matrix, None, index2word, top_n=top_n, no_self_similarity=False)
    res = [x[0] for x in res]
    return res, 1 if b_s in res else 0

# for dataset_name, dataset in all_datasets['analogy_datasets'].items():
    # dataset_result = []
@ray.remote
def word2vec_test_analogy_dataset(model, model_name, dataset_name, dataset):
    dataset_result = []
    # word2vec_model = get_pinned_object(model)
    word2vec_model = model
    for row in dataset:
        a, a_s, b, b_s = row
        try:
            a_index = word2vec_get_index_by_word(word2vec_model, a)
            a_s_index = word2vec_get_index_by_word(word2vec_model, a_s)
            b_index = word2vec_get_index_by_word(word2vec_model, b)
            b_s_index = word2vec_get_index_by_word(word2vec_model, b_s)
        except:
            continue
        for method in ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]:
            source_matrix, compare_matrix = word2vec_get_model_matrix_by_method(word2vec_model, method)
            a_vector = source_matrix[a_index]
            a_s_vector = source_matrix[a_s_index]
            b_vector = source_matrix[b_index]
            cos3add_response, cos3add_score = \
                cos3add(compare_matrix.T, a_vector, a_s_vector, b_vector, b_s, 3, word2vec_model.wv.index2word)
            cos3mul_response, cos3mul_score = \
                cos3mul(compare_matrix, word2vec_model.wv.index2word, a_vector, a_s_vector, b_vector, b_s, top_n=3)
            dataset_result.append({
                'analogy': row,
                'method': method, 
                'model': model_name,
                'cos3add_response': cos3add_response,
                'cos3add_score': cos3add_score,
                'cos3mul_response': cos3mul_response,
                'cos3mul_score': cos3mul_score
            })
    return dataset_name, pd.DataFrame(dataset_result)

In [13]:
# ['car', 'truck'] not in get_pinned_object(model).wv.index2word
# all([True, False, True])
# [5] + [1,2,3]

## Run Analogy

In [None]:
# perform this for each model
for model_dir in tqdm(glob_w2v_models[3:]):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    model_path = glob.glob(model_dir + "/*[!(npy)]")[0]
    model = Word2Vec.load(model_path)
    model.wv.vectors = nnorm(model.wv.vectors) 
    model.trainables.syn1neg = nnorm(model.trainables.syn1neg)
    # model = pin_in_object_store(model)
    # print("Running analysis on each dataset")
    # Step 1: Handle relatedness dataset
    futures = [word2vec_test_analogy_dataset.remote(model, model_name, dataset_name, dataset) \
                    for dataset_name, dataset in all_datasets['analogy_datasets'].items()]
    res = ray.get(futures)    
    # print("Post processing and Saving results")
    # pd.concat(res, axis=1)
    df_res = {dname:df_res for dname, df_res in res}
    with open(f"../output/word2vec/analogy/word2vec_results_{model_name}_analogy.pickle", "wb") as f:
        pickle.dump({"score_matrix": df_res}, f)
    # clean ray object store
    # ray.internal.free(model)
    # message from telegram
    try:
        send_telegram_message(model_name)
        send_telegram_message(str(df_res['google_analogy'].groupby('method').mean()))
        send_telegram_message(str(df_res['bats_analogy'].groupby('method').mean()))
    except:
        pass
                

 67%|██████▋   | 6/9 [2:47:33<1:28:26, 1768.90s/it]

100%|██████████| 9/9 [4:46:47<00:00, 1911.90s/it]  


In [None]:
# glob.glob("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0/*")
import pickle
with open("../output/word2vec/analogy/word2vec_results_word2vec_mc=10_size=100_window=5_sg=0_analogy.pickle", "rb") as f:
    _ = pickle.load(f)
# _['score_matrix'][0]
(_['score_matrix']['bats_analogy'].groupby('method').mean())
# _['score_matrix']['google_analogy'].query('method=="IN-OUT"')

Unnamed: 0_level_0,cos3add_score,cos3mul_score
method,Unnamed: 1_level_1,Unnamed: 2_level_1
IN-IN,0.272093,0.230233
IN-OUT,0.153488,0.151163
OUT-IN,0.0,0.0
OUT-OUT,0.206977,0.204651


'listens'

## Run Relatedness tests

In [None]:
# all_df_res = []
# all_missing_words = []

# perform this for each model
# for model_dir in tqdm(glob.glob("../../../embeddings_lemma/word2vec_*")[3:]):
for model_dir in tqdm(glob_w2v_models):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    model_path = glob.glob(model_dir + "/*[!(npy)]")[0]
    model = Word2Vec.load(model_path)
    model.wv.vectors = nnorm(model.wv.vectors) 
    model.trainables.syn1neg = nnorm(model.trainables.syn1neg)
    # print("Running analysis on each dataset")
    # Step 1: Handle relatedness dataset
    futures = [compare_word2vec_model_with_relatedness_dataset.remote(model, model_name, dataset_name, dataset) \
                    for dataset_name, dataset in all_datasets['relatedness_datasets'].items()]
    res = ray.get(futures)    
    # print("Post processing and Saving results")
    # pd.concat(res, axis=1)
    df_res = pd.concat([df_res for df_res, _, _ in res], axis=1)
    missing_words = {key:val for _, key, val in res}
    # all_df_res.append(df_res)
    # all_missing_words.append(missing_words)
    with open(f"../output/word2vec/relatedness/word2vec_results_{model_name}_relatedness.pickle", "wb") as f:
        pickle.dump({"score_matrix": df_res, 'missing_words': missing_words}, f)
    try:
        send_telegram_message(model_name)
        send_telegram_message(str(df_res.iloc[:, [0]]))
    except:
        pass

  0%|          | 0/12 [00:00<?, ?it/s][2m[33m(pid=raylet)[0m E0901 19:52:15.305629   218   233 store.cc:252] Not enough memory to create the object ffffffffffffffffffffffff0100008004000000, data_size=769881459, metadata_size=6, will send a reply of PlasmaError::OutOfMemory
2020-09-01 19:52:19,004	INFO (unknown file):0 -- gc.collect() freed 16 refs in 3.551877399906516 seconds
[2m[33m(pid=raylet)[0m E0901 19:52:16.324450   218   233 store.cc:252] Not enough memory to create the object ffffffffffffffffffffffff0100008004000000, data_size=769881459, metadata_size=6, will send a reply of PlasmaError::OutOfMemory
[2m[36m(pid=249)[0m 2020-09-01 19:52:16,983	INFO (unknown file):0 -- gc.collect() freed 7 refs in 1.5036049876362085 seconds
[2m[33m(pid=raylet)[0m E0901 19:52:18.325723   218   233 store.cc:252] Not enough memory to create the object ffffffffffffffffffffffff0100008004000000, data_size=769881459, metadata_size=6, will send a reply of PlasmaError::OutOfMemory
[2m[33m(pi

ObjectStoreFullError: Failed to put object ffffffffffffffffffffffff0100008006000000 in object store because it is full. Object size is 769881459 bytes.
The local object store is full of objects that are still in scope and cannot be evicted. Try increasing the object store memory available with ray.init(object_store_memory=<bytes>). You can also try setting an option to fallback to LRU eviction when the object store is full by calling ray.init(lru_evict=True). See also: https://docs.ray.io/en/latest/memory-management.html.

In [11]:
## NO RAY version

# perform this for each model
# for model_dir in tqdm(glob.glob("../../../embeddings_lemma/word2vec_*")[3:]):
for model_dir in tqdm(glob_w2v_models):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    model_path = glob.glob(model_dir + "/*[!(npy)]")[0]
    model = Word2Vec.load(model_path)
    model.wv.vectors = nnorm(model.wv.vectors) 
    model.trainables.syn1neg = nnorm(model.trainables.syn1neg)
    # print("Running analysis on each dataset")
    # Step 1: Handle relatedness dataset
    all_df_res = []
    for dataset_name, dataset in all_datasets['relatedness_datasets'].items():
        df_res = compare_word2vec_model_with_relatedness_dataset(model, model_name, dataset_name, dataset)
        all_df_res.append(df_res)
    with open(f"../output/word2vec/relatedness/word2vec_results_{model_name}_relatedness.pickle", "wb") as f:
        pickle.dump({"score_matrix": all_df_res}, f)
    try:
        send_telegram_message(model_name)
        send_telegram_message(str(df_res.iloc[:, [0]]))
    except:
        pass

  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# glob.glob("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0/*")
import pickle
with open("../output/word2vec/word2vec_results_word2vec_mc=10_size=100_window=5_sg=0_relatedness.pickle", "rb") as f:
    _ = pickle.load(f)
_['score_matrix'][0]

Unnamed: 0,EN-VERB-143,EN-SimVerb-3500,EN-RG-65,EN-RW-STANFORD,EN-MTurk-771,EN-MEN-TR-3k,EN-MC-30,EN-MTurk-287,EN-SIMLEX-999,EN-WS-353-REL,EN-YP-130,EN-WS-353-ALL,EN-WS-353-SIM
word2vec_mc=10_size=100_window=5_sg=0_IN-IN,0.397469,0.21732,0.733976,0.399236,0.65879,0.720543,0.788112,0.714405,0.377482,0.554434,0.396154,0.629383,0.720831
word2vec_mc=10_size=100_window=5_sg=0_IN-OUT,0.204517,0.14138,0.801957,0.155526,0.656008,0.764957,0.832302,0.657793,0.280067,0.636703,0.443034,0.670449,0.729736
word2vec_mc=10_size=100_window=5_sg=0_OUT-IN,0.271551,0.144981,0.79861,0.235247,0.64745,0.76677,0.823056,0.679873,0.262355,0.631646,0.463202,0.663323,0.725765
word2vec_mc=10_size=100_window=5_sg=0_OUT-OUT,0.260214,0.196653,0.605548,0.414003,0.497709,0.590263,0.688151,0.529687,0.324987,0.475809,0.300444,0.551615,0.604563


## Run association test

In [None]:
all_datasets['association_datasets'].keys()

dict_keys(['swow8500', 'eat'])

In [None]:

# perform this for each model
# for model_dir in tqdm(glob.glob("../../../embeddings_lemma/word2vec_*")[3:]):
for model_dir in tqdm(glob_w2v_models):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    model_path = glob.glob(model_dir + "/*[!(npy)]")[0]
    model = Word2Vec.load(model_path)
    model.wv.vectors = nnorm(model.wv.vectors) 
    model.trainables.syn1neg = nnorm(model.trainables.syn1neg)
    # model = pin_in_object_store(model)
    # print("Running analysis on each dataset")
    # Step 1: Handle relatedness dataset
    futures = [word2vec_test_association_dataset.remote(model, model_name, dataset_name, dataset) \
                    for dataset_name, dataset in all_datasets['association_datasets'].items()]
    res = ray.get(futures)    
    # print("Post processing and Saving results")
    # pd.concat(res, axis=1)
    df_res = {dname:df_res for dname, df_res in res}
    with open(f"../output/word2vec/association/word2vec_results_{model_name}_association.pickle", "wb") as f:
        pickle.dump({"score_matrix": df_res}, f)
    try:
        send_telegram_message(model_name)
    except:
        pass

100%|██████████| 12/12 [11:05:23<00:00, 3326.95s/it] 


## Test Analogy

In [None]:
# all_datasets['analogy_datasets']['google_analogy'][:]
def cos3mul(compare_matrix, word_index, a, a_s, b,  b_s, top_n, e=0.0000001):
    c = np.stack([b, a_s, a], axis=1)
    all_scores = np.dot(compare_matrix, c)
    all_scores = (all_scores + 1)/2 # normalize bw 0 to 1
    all_scores[:, 2] += e
    all_scores = ((all_scores[:, 0] * all_scores[:, 1]) / all_scores[:, 2])
    top_n_similar_words = np.argpartition(-all_scores, top_n)[:top_n]
    sorted_result = sorted([(word_index[index], all_scores[index]) for index in top_n_similar_words], 
                key=lambda x: x[1], 
                reverse=True)
    sorted_result = [x[0] for x in sorted_result]
    return sorted_result, 1 if b_s in sorted_result else 0

def cos3add(compare_matrix, a, a_s, b, b_s, top_n, index2word):
    """Find the most similar vector with (b-a+a_)
    """
    b_ = b.reshape(1, -1)-a.reshape(1, -1)+a_s.reshape(1, -1)
    res = word2vec_find_top_similar_vectors_bw_matrix(
            b_, compare_matrix, None, index2word, top_n=top_n, no_self_similarity=False)
    res = [x[0] for x in res]
    return res, 1 if b_s in res else 0

# for dataset_name, dataset in all_datasets['analogy_datasets'].items():
    # dataset_result = []
@ray.remote
def word2vec_test_analogy_dataset(model_name, dataset, dataset_name):
    word2vec_model = get_pinned_object(model)
    for row in tqdm(dataset):
        a, a_s, b, b_s = row
        try:
            a_index = word2vec_get_index_by_word(word2vec_model, a)
            a_s_index = word2vec_get_index_by_word(word2vec_model, a_s)
            b_index = word2vec_get_index_by_word(word2vec_model, b)
            b_s_index = word2vec_get_index_by_word(word2vec_model, b_s)
        except:
            continue
        for method in ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]:
            source_matrix, compare_matrix = word2vec_get_model_matrix_by_method(word2vec_model, method)
            a_vector = source_matrix[a_index]
            a_s_vector = source_matrix[a_s_index]
            b_vector = source_matrix[b_index]
            cos3add_response, cos3add_score = \
                cos3add(compare_matrix.T, a_vector, a_s_vector, b_vector, b_s, 3, word2vec_model.wv.index2word)
            cos3mul_response, cos3mul_score = \
                cos3mul(compare_matrix, word2vec_model.wv.index2word, a_vector, a_s_vector, b_vector, b_s, top_n=3)
            dataset_result.append({
                'analogy': row,
                'method': method, 
                'cos3add_response': cos3add_response,
                'cos3add_score': cos3add_score,
                'cos3mul_response': cos3mul_response,
                'cos3mul_score': cos3mul_score
            })
    return dataset_name, pd.DataFrame(dataset_result)

# perform this for each model
for model_dir in tqdm(glob_w2v_models):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    model_path = glob.glob(model_dir + "/*[!(npy)]")[0]
    model = Word2Vec.load(model_path)
    model.wv.vectors = nnorm(model.wv.vectors) 
    model.trainables.syn1neg = nnorm(model.trainables.syn1neg)
    model = pin_in_object_store(model)
    # print("Running analysis on each dataset")
    # Step 1: Handle relatedness dataset
    futures = [word2vec_test_association_dataset.remote(model_name, dataset_name, dataset) \
                    for dataset_name, dataset in all_datasets['analogy_datasets'].items()]
    res = ray.get(futures)    
    # print("Post processing and Saving results")
    # pd.concat(res, axis=1)
    df_res = {dname:df_res for dname, df_res in res}
    with open(f"../output/word2vec_results_{model_name}_analogy.pickle", "wb") as f:
        pickle.dump({"score_matrix": df_res}, f)
                

  3%|▎         | 27/994 [00:15<09:04,  1.78it/s]


KeyboardInterrupt: 

In [None]:
pd.DataFrame(dataset_result)

Unnamed: 0,analogy,method,cos3add_response,cos3add_score,cos3mul_response,cos3mul_score
0,"[rome, italy, islamabad, pakistan]",IN-IN,"[islamabad, malaysia, ===pakistan===]",0,"[===pakistan===, ====pakistan====, malaysia]",0
1,"[rome, italy, islamabad, pakistan]",IN-OUT,"[pakistan, malaysia, iran]",1,"[pakistan, malaysia, bangladesh]",1
2,"[rome, italy, islamabad, pakistan]",OUT-IN,"[aa/semitic, petrochimi, tejan-sie]",0,"[aa/semitic, petrochimi, tejan-sie]",0
3,"[rome, italy, islamabad, pakistan]",OUT-OUT,"[islamabad, pakistan, sargodha]",1,"[pakistan, islamabad, gilgit-baltistan]",1
4,"[hanoi, vietnam, canberra, australia]",IN-IN,"[canberra, queensland, australian]",0,"[queensland, australian, nsw]",0
...,...,...,...,...,...,...
105,"[bangkok, thailand, canberra, australia]",IN-OUT,"[australia, canberra, ===australia===]",1,"[australia, ===australia===, canberra]",1
106,"[bangkok, thailand, canberra, australia]",OUT-IN,"[aa/semitic, hmts, forceably]",0,"[aa/semitic, hmts, forceably]",0
107,"[bangkok, thailand, canberra, australia]",OUT-OUT,"[canberra, queensland, tasmania]",0,"[canberra, queensland, tasmania]",0
108,"[madrid, spain, athens, greece]",IN-IN,"[greece, athens, peloponnesus]",1,"[greece, colchis, athens]",1


In [None]:
# # all_datasets['analogy_datasets']['google_analogy']['gram8-plural'][:10]
# # all_datasets['analogy_datasets']['bats_analogy'][:10]
# # pd.DataFrame(dataset_result)
# def cos3mul(compare_matrix, word_index, c,  b_s, top_n, e=0.0000001):
#     all_scores = np.dot(compare_matrix, c)
#     all_scores = (all_scores + 1)/2 # normalize bw 0 to 1
#     all_scores[:, 2] += e
#     all_scores = ((all_scores[:, 0] * all_scores[:, 1]) / all_scores[:, 2])
#     top_n_similar_words = np.argpartition(-all_scores, top_n)[:top_n]
#     sorted_result = sorted([(word_index[index], all_scores[index]) for index in top_n_similar_words], 
#                 key=lambda x: x[1], 
#                 reverse=True)
#     sorted_result = [x[0] for x in sorted_result]
#     return sorted_result, 1 if b_s in sorted_result else 0
# # test
# # source_matrix, compare_matrix = word2vec_get_model_matrix_by_method(model, "IN-IN")
# # c = np.stack(
# #     [source_matrix[word2vec_get_index_by_word(model, 'tehran')],
# #     source_matrix[word2vec_get_index_by_word(model, 'italy')],
# #     source_matrix[word2vec_get_index_by_word(model, 'rome')]]
# #         , axis=1)
# # cos3mul(compare_matrix, model.wv.index2word, c,  'pakistan', 10, e=0.0001)

(['iran',
  'tehran',
  '===iran===',
  'azerbaijan',
  'khodro',
  'utc+03:30',
  'kuwait',
  'abadan',
  'uae',
  'bahrain'],
 0)

In [None]:
# np.stack([[1, 0], [1, 1], [0, 1]], axis=1)
# all_dataset
# model.wv.most_similar(positive=['italy', 'islamabad'], negative=['rome'])
# ais = np.dot(compare_matrix, c)
# max(((ais[:, 0] * ais[:, 1])/(ais[:, 2]+0.001)))#[1951]
ais.min()

-0.4604089

In [None]:
# print(model.wv.index2word.index('pakistan'))
all_datasets['analogy_datasets']['google_analogy'][:10]

[['rome', 'italy', 'islamabad', 'pakistan'],
 ['hanoi', 'vietnam', 'canberra', 'australia'],
 ['ottawa', 'canada', 'havana', 'cuba'],
 ['stockholm', 'sweden', 'london', 'england'],
 ['havana', 'cuba', 'berlin', 'germany'],
 ['athens', 'greece', 'tehran', 'iran'],
 ['cairo', 'egypt', 'canberra', 'australia'],
 ['tokyo', 'japan', 'helsinki', 'finland'],
 ['islamabad', 'pakistan', 'hanoi', 'vietnam'],
 ['islamabad', 'pakistan', 'paris', 'france']]

## Combine all results

### Relatedness

In [None]:
# Load all relatedness results
all_relatedness_table = []
for result_file in glob.glob("../output/word2vec/relatedness/*"):
    with open (result_file, "rb") as f:
        data = pickle.load(f)
        all_relatedness_table.append(data['score_matrix'][0])    
        # print(result_file)
all_relatedness_table = pd.concat(all_relatedness_table)
# all_relatedness_table.to_csv("../output/all_relatedness_table.csv")

../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=200_window=50_sg=1_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=300_window=5_sg=1_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=200_window=50_sg=0_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=300_window=50_sg=0_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=200_window=5_sg=1_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=200_window=5_sg=0_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=100_window=5_sg=1_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=100_window=5_sg=0_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_word2vec_mc=10_size=300_window=50_sg=1_relatedness.pickle
../output/word2vec/relatedness/word2vec_results_wor

In [None]:
# all_relatedness_table.index
# np.allclose(all_relatedness_table[3].values, all_relatedness_table[10].values)
# pd.concat(all_relatedness_table)
# all_relatedness_table[10].index

Index(['word2vec_mc=10_size=100_window=5_sg=0_IN-IN',
       'word2vec_mc=10_size=100_window=5_sg=0_IN-OUT',
       'word2vec_mc=10_size=100_window=5_sg=0_OUT-IN',
       'word2vec_mc=10_size=100_window=5_sg=0_OUT-OUT'],
      dtype='object')

dict_keys(['score_matrix'])

### Association

In [None]:
# all association result
def clean_association_model_name(name): return name.replace("mc=10_", "").replace("_10_tp_score", "")
all_association_result = []
for result_file in tqdm(glob.glob("../output/word2vec/association/*")):
    with open(result_file, "rb") as f:
        data = pickle.load(f)
        sw_data = pd.DataFrame(data['score_matrix']['swow8500'])
        eat_data = pd.DataFrame(data['score_matrix']['eat'])
        # x.
        association_result = []
        for col in sw_data.describe().columns:
            result = {}
            for dataset_name, dataset in [("swow8500", sw_data), ("eat", eat_data)]:
                def get_stats(dataset_name, dataset):
                    hit_rate = dataset[dataset[col]>0].shape[0] / dataset.shape[0] 
                    avg_coverage =  dataset.loc[dataset[col]>0, col].mean()
                    return {'col': clean_association_model_name(col), f'{dataset_name}_hit_rate': hit_rate, f'{dataset_name}_avg_coverage': avg_coverage}
                result = {**result, **get_stats(dataset_name, dataset)}
            association_result.append(result)
        association_result = pd.DataFrame(association_result)
        association_result.index = association_result['col']
        association_result.drop(columns=['col'], inplace=True)
        all_association_result.append(association_result)
all_association_result = pd.concat(all_association_result)
all_association_result.to_csv("../output/all_association_result.csv")

100%|██████████| 12/12 [00:17<00:00,  1.45s/it]


In [None]:
# with open('../output/word2vec/association/word2vec_results_word2vec_mc=10_size=100_window=50_sg=0_association.pickle', 'rb') as f:
#     data = pickle.load(f)
#     sw_data = pd.DataFrame(data['score_matrix']['swow8500'])
#     display(sw_data.describe())
# # glob.glob("../output/word2vec/association/*")

### Analogy

In [None]:
# all association result
# def clean_association_model_name(name): return name.replace("mc=10_", "").replace("_10_tp_score", "")
all_analogy_result = []
for result_file in tqdm(glob.glob("../output/word2vec/analogy/*")):
    with open(result_file, "rb") as f:
        data = pickle.load(f)
        model_name = re.findall( "_word2vec.*",result_file)[0].replace("_analogy.pickle", "")
        def extract_result(dataset_name, model_name):
            dataset = pd.DataFrame(data['score_matrix'][dataset_name])
            dataset = pd.DataFrame(dataset).groupby('method').mean()
            dataset.columns = [dataset_name + "_" + str(x) for x in dataset.columns]
            dataset.index = [model_name + "_" + x for x in dataset.index]
            return dataset
        result = pd.concat([extract_result('google_analogy', model_name), 
                              extract_result('bats_analogy', model_name)], axis=1)
        all_analogy_result.append(result)
all_analogy_result = pd.concat(all_analogy_result)
all_analogy_result.to_csv("../output/all_analogy_result.csv")

100%|██████████| 12/12 [00:02<00:00,  5.79it/s]


Unnamed: 0,google_analogy_cos3add_score,google_analogy_cos3mul_score,bats_analogy_cos3add_score,bats_analogy_cos3mul_score
_word2vec_mc=10_size=100_window=50_sg=0_IN-IN,0.667002,0.6167,0.255814,0.2
_word2vec_mc=10_size=100_window=50_sg=0_IN-OUT,0.620724,0.610664,0.195349,0.186047
_word2vec_mc=10_size=100_window=50_sg=0_OUT-IN,0.0,0.0,0.0,0.0
_word2vec_mc=10_size=100_window=50_sg=0_OUT-OUT,0.546278,0.546278,0.232558,0.234884
_word2vec_mc=10_size=100_window=5_sg=1_IN-IN,0.646881,0.635815,0.216279,0.211628
_word2vec_mc=10_size=100_window=5_sg=1_IN-OUT,0.537223,0.50503,0.172093,0.160465
_word2vec_mc=10_size=100_window=5_sg=1_OUT-IN,0.440644,0.377264,0.155814,0.123256
_word2vec_mc=10_size=100_window=5_sg=1_OUT-OUT,0.608652,0.593561,0.223256,0.213953
_word2vec_mc=10_size=300_window=5_sg=1_IN-IN,0.732394,0.758551,0.246512,0.267442
_word2vec_mc=10_size=300_window=5_sg=1_IN-OUT,0.628773,0.626761,0.211628,0.209302


In [None]:
import re
re.findall( "_word2vec.*", glob.glob("../output/word2vec/analogy/*")[0])

['_word2vec_mc=10_size=100_window=50_sg=0_analogy.pickle']