# Word2Vec models comparison

We compare several word2vec models on a set of intrinsic word-embedding evaluation tasks (word-similarity datasets).

### Import and load datasets

In [1]:
# imports
!pip install ray
import ray
ray.init()

!pip install nltk
import nltk
nltk.download('wordnet')
import glob
!pip install tqdm
from tqdm import tqdm
import pandas as pd
!pip install gensim
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# load the files
def load_similarity_datasets(data_glob="../data/word-sim/*"):
    """Load every word-similarity dataset matched by ``data_glob``.

    Each file is read as a header-less table with three columns, which are
    named ``word_1``, ``word_2`` and ``similarity_score``. Files are parsed as
    tab-separated first; if that does not produce three columns they are
    re-read as space-separated.

    Args:
        data_glob: glob pattern selecting the dataset files.

    Returns:
        dict mapping the dataset name (file name without a trailing ``.txt``)
        to its DataFrame, in sorted file-path order.
    """
    import os  # local import so this cell does not depend on another cell's imports

    columns = ['word_1', 'word_2', 'similarity_score']
    sim_data = {}
    # sorted() makes the dataset order reproducible; glob order is arbitrary
    for file_path in sorted(glob.glob(data_glob)):
        # derive the name from the path itself instead of a fixed-width slice,
        # so the function keeps working when the data directory changes
        file_name = os.path.basename(file_path)
        if file_name.endswith(".txt"):
            file_name = file_name[:-len(".txt")]
        print(file_name)
        try:
            df = pd.read_csv(file_path, sep="\t", header=None)
            df.columns = columns
        except ValueError:
            # wrong column count (or a parser error, which subclasses
            # ValueError): the file is space-separated rather than tab-separated
            df = pd.read_csv(file_path, sep=" ", header=None)
            df.columns = columns
        sim_data[file_name] = df
    return sim_data

# Load all similarity datasets once; later cells look them up by dataset name
# (for example similarity_datasets['EN-SIMLEX-999']).
similarity_datasets = load_similarity_datasets()

Collecting ray
  Downloading ray-0.8.7-cp37-cp37m-manylinux1_x86_64.whl (22.0 MB)
[K     |████████████████████████████████| 22.0 MB 3.0 MB/s eta 0:00:01    |████▊                           | 3.3 MB 3.0 MB/s eta 0:00:07
[?25hCollecting pyyaml
  Downloading PyYAML-5.3.1.tar.gz (269 kB)
[K     |████████████████████████████████| 269 kB 74.3 MB/s eta 0:00:01
Collecting colorful
  Downloading colorful-0.5.4-py2.py3-none-any.whl (201 kB)
[K     |████████████████████████████████| 201 kB 61.0 MB/s eta 0:00:01
[?25hCollecting msgpack<2.0.0,>=1.0.0
  Downloading msgpack-1.0.0-cp37-cp37m-manylinux1_x86_64.whl (275 kB)
[K     |████████████████████████████████| 275 kB 59.7 MB/s eta 0:00:01
[?25hCollecting redis<3.5.0,>=3.3.2
  Downloading redis-3.4.1-py2.py3-none-any.whl (71 kB)
[K     |████████████████████████████████| 71 kB 10.5 MB/s eta 0:00:01
[?25hCollecting colorama
  Downloading colorama-0.4.3-py2.py3-none-any.whl (15 kB)
Collecting py-spy>=0.2.0
  Downloading py_spy-0.3.3-py2.py3-no

### Load word2vec models

In [None]:
# model = Word2Vec.load("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0/word2vec_wikiEn20171001_millionSentences_mc=10_iter=5_size=100_window=5_sg=0")

### Define the word2vec similarity functions

In [None]:
def word2vec_get_index_by_word(model, word):
    """Return the index of ``word`` in the model's vocabulary.

    The index is read from the ``model.wv.vocab`` dictionary, which is a
    constant-time lookup, instead of scanning the ``index2word`` list.

    Args:
        model: word2vec model exposing ``model.wv.vocab``.
        word: the word to look up.

    Returns:
        The integer index stored for ``word``.

    Raises:
        ValueError: if ``word`` is not in the vocabulary (the same exception
            type the previous ``list.index`` based lookup raised).
    """
    try:
        return model.wv.vocab[word].index
    except KeyError:
        raise ValueError(f"{word!r} is not in the model vocabulary") from None

def word2vec_get_word_by_index(model, index):
    """Look up the vocabulary word stored at position ``index``.

    The word is read from ``model.wv.index2word``.
    """
    vocabulary = model.wv.index2word
    return vocabulary[index]

def word2vec_find_top_similar_words(model, source_word, method='IN-IN', top_n=5, no_self_similarity=True):
    """Return the ``top_n`` vocabulary words most similar to ``source_word``.

    ``method`` has the form "<source>-<candidates>" and selects the weight
    matrix used on each side of the comparison: "IN" is ``model.wv.vectors``
    and "OUT" is ``model.trainables.syn1neg``. "IN-OUT", for example, compares
    the source word's row of the input matrix with every row of the output
    matrix.

    Args:
        model: word2vec model providing the weight matrices above.
        source_word: word whose neighbours are searched.
        method: one of "IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT".
        top_n: maximum number of neighbours to return.
        no_self_similarity: if True, the score at the source word's own index
            is overwritten with -1 so it does not rank first.

    Returns:
        List of ``(word, cosine_similarity)`` tuples sorted by descending
        score. Empty when ``top_n`` is not positive.

    Raises:
        ValueError: if ``method`` is not one of the four supported values.
        Whatever ``word2vec_get_index_by_word`` raises for an unknown word.
    """
    valid_methods = ("IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT")
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}, got {method!r}")
    source_side, candidate_side = method.split("-")

    def weights_for(side):
        # fetched lazily so "IN-IN" does not require the model to expose syn1neg
        return model.wv.vectors if side == "IN" else model.trainables.syn1neg

    weights1, weights2 = weights_for(source_side), weights_for(candidate_side)
    source_word_index = word2vec_get_index_by_word(model, source_word)
    score = cosine_similarity(weights1[source_word_index].reshape(1, -1), weights2)[0]
    if no_self_similarity:
        score[source_word_index] = -1  # negate self-similarity
    if top_n <= 0:
        return []
    if top_n < len(score):
        # argpartition selects the top_n best scores without a full sort
        top_n_similar_words = np.argpartition(-score, top_n)[:top_n]
    else:
        # argpartition raises when kth is out of bounds; rank every word instead
        top_n_similar_words = np.arange(len(score))
    return sorted(
        ((word2vec_get_word_by_index(model, index), score[index]) for index in top_n_similar_words),
        key=lambda pair: pair[1],
        reverse=True,
    )

def word2vec_find_similarity(model, source_word, target_word, method="IN-IN"):
    """Return the cosine similarity between two words under ``method``.

    ``method`` has the form "<source>-<target>" and selects the weight matrix
    each word's vector is taken from: "IN" is ``model.wv.vectors`` and "OUT"
    is ``model.trainables.syn1neg``.

    Args:
        model: word2vec model providing the weight matrices above.
        source_word: first word of the pair.
        target_word: second word of the pair.
        method: one of "IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT".

    Returns:
        The first row of the ``cosine_similarity`` result for the two vectors;
        callers in this notebook index it with ``[0]`` to get the scalar.

    Raises:
        ValueError: if ``method`` is not one of the four supported values.
        Whatever ``word2vec_get_index_by_word`` raises for an unknown word.
    """
    valid_methods = ("IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT")
    if method not in valid_methods:
        raise ValueError(f"method must be one of {valid_methods}, got {method!r}")
    source_side, target_side = method.split("-")

    def weights_for(side):
        # fetched lazily so "IN-IN" does not require the model to expose syn1neg
        return model.wv.vectors if side == "IN" else model.trainables.syn1neg

    weights1, weights2 = weights_for(source_side), weights_for(target_side)
    source_word_index = word2vec_get_index_by_word(model, source_word)
    target_word_index = word2vec_get_index_by_word(model, target_word)
    score = cosine_similarity(weights1[source_word_index].reshape(1, -1),
                              weights2[target_word_index].reshape(1, -1))[0]
    return score
# word2vec_find_similarity(model, "car", "truck", "IN-OUT")
# word2vec_find_top_similar_words(model, "car", "IN-IN")
# word2vec_find_top_similar_words(model, "car", "IN-OUT")

### Generate stats for one similarity dataset

In [None]:
# NOTE: this cell needs `model` to exist already. The load statement in the
# "Load word2vec models" cell is commented out, so on a fresh kernel `model`
# must be loaded first or this cell fails with a NameError.
df = similarity_datasets['EN-SIMLEX-999'].copy()
methods = ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]
dimension = model.trainables.syn1neg.shape[1]
score_rows = []
for row in df.to_dict(orient="records"):
    for method in methods:
        try:
            sim_score = word2vec_find_similarity(model, row['word_1'], row['word_2'], method)[0]
        except (KeyError, ValueError):
            # a word of the pair is not in the model's vocabulary; any other
            # error is a real problem and is allowed to surface
            sim_score = None
        row[f'word2vec_{dimension}_{method}_sim_score'] = sim_score
    score_rows.append(row)
# one row per word pair: the dataset columns plus one score column per method
score_table = pd.DataFrame.from_dict(score_rows)

In [None]:
# Pearson correlation between the human scores and each method's scores.
# The text columns (word_1, word_2) are excluded explicitly: recent pandas
# versions raise on non-numeric columns instead of silently dropping them.
score_table.dropna().select_dtypes("number").corr("pearson")

Unnamed: 0,similarity_score,word2vec_100_IN-IN_sim_score,word2vec_100_IN-OUT_sim_score,word2vec_100_OUT-IN_sim_score,word2vec_100_OUT-OUT_sim_score
similarity_score,1.0,0.378299,0.284335,0.267766,0.326836
word2vec_100_IN-IN_sim_score,0.378299,1.0,0.629071,0.64211,0.790624
word2vec_100_IN-OUT_sim_score,0.284335,0.629071,1.0,0.920948,0.475211
word2vec_100_OUT-IN_sim_score,0.267766,0.64211,0.920948,1.0,0.47515
word2vec_100_OUT-OUT_sim_score,0.326836,0.790624,0.475211,0.47515,1.0


## Generate similarity scores for each similarity dataset

In [None]:
# WordNetLemmatizer.lemmatize is called with its default arguments here; see
# https://stackoverflow.com/questions/25534214/nltk-wordnet-lemmatizer-shouldnt-it-lemmatize-all-inflections-of-a-word
def lemma(word):
    """Return the WordNet lemma of ``word``."""
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def pre(word):
    """Preprocess ``word``: lowercase it, then lemmatize it with ``lemma``."""
    lowered = word.lower()
    return lemma(lowered)

# Accumulators filled by the model loop further down (one entry per model).
# Holds the per-model correlation tables:
all_df_res = []
# Holds the per-model {dataset_name: missing count} dictionaries:
all_missing_words = []

@ray.remote
def compare_word2vec_model_with_dataset(model, model_name, dataset_name, dataset):
    """Correlate one model's similarity scores with one dataset's human scores.

    Both words of every dataset row are preprocessed with ``pre`` and scored
    with ``word2vec_find_similarity`` under each of the four methods. Rows
    with a missing value are dropped, and the Pearson correlation of each
    method's scores with ``similarity_score`` is computed.

    Args:
        model: word2vec model accepted by ``word2vec_find_similarity``.
        model_name: prefix of the result row labels (``"{model_name}_{method}"``).
        dataset_name: used as the label of the result column and returned as is.
        dataset: DataFrame with ``word_1``, ``word_2`` and ``similarity_score``
            columns.

    Returns:
        Tuple ``(score_table, dataset_name, missing_words)``. ``score_table``
        is a one-column DataFrame (column ``dataset_name``) with one row per
        method. ``missing_words`` is the number of failed (row, method)
        lookups divided by the number of methods, i.e. it counts word pairs
        that could not be scored rather than individual words.
    """
    methods = ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]
    score_columns = [f"{model_name}_{method}" for method in methods]
    missing_words = 0
    score_rows = []
    for row in dataset.to_dict(orient="records"):
        # preprocess once per row instead of once per method
        try:
            word_1, word_2 = pre(row['word_1']), pre(row['word_2'])
        except AttributeError:
            # the cell is not a string (has no .lower()), e.g. a value that
            # pandas parsed as NaN; treat the pair as missing
            word_1 = word_2 = None
        for method, column in zip(methods, score_columns):
            sim_score = None
            if word_1 is not None:
                try:
                    sim_score = word2vec_find_similarity(model, word_1, word_2, method)[0]
                except (KeyError, ValueError):
                    # word not in the model's vocabulary; other errors (such
                    # as a missing wordnet corpus) are allowed to surface
                    pass
            if sim_score is None:
                missing_words += 1
            row[column] = sim_score
        score_rows.append(row)
    score_table = pd.DataFrame.from_dict(score_rows)
    # Restrict to the numeric columns by name: recent pandas versions raise on
    # the text columns, and selecting the method rows by label does not depend
    # on their position in the correlation matrix.
    numeric_scores = score_table.dropna()[['similarity_score'] + score_columns].astype(float)
    score_table = numeric_scores.corr("pearson").loc[score_columns, ['similarity_score']]
    score_table.columns = [dataset_name]
    missing_words = missing_words/len(methods)
    return score_table, dataset_name, missing_words

import pickle  # needed by pickle.dump below; otherwise only imported in a later cell

# Evaluate every model directory against every similarity dataset and
# checkpoint the accumulated results after each model.
# NOTE(review): the [3:] slice skips the first three directories returned by
# glob, whose order is not guaranteed — confirm this is still intended.
for model_dir in tqdm(glob.glob("../../../embeddings_lemma/word2vec_*")[3:]):
    model_name = model_dir.replace("../../../embeddings_lemma/", "").replace("iter=5_", "")
    # Take the first file that does not end in ".npy". The previous pattern
    # "*[!(npy)]" is a character class on the last character only, so it also
    # rejected any name ending in "n", "p", "y", "(" or ")".
    model_path = [path for path in glob.glob(model_dir + "/*") if not path.endswith(".npy")][0]
    model = Word2Vec.load(model_path)
    # Store the model in ray's object store once and pass the reference to
    # every task, instead of serializing the model again for each dataset.
    model_ref = ray.put(model)
    futures = [compare_word2vec_model_with_dataset.remote(model_ref, model_name, dataset_name, dataset)
               for dataset_name, dataset in similarity_datasets.items()]
    res = ray.get(futures)
    # one column per dataset, one row per method
    df_res = pd.concat([score_df for score_df, _, _ in res], axis=1)
    missing_words = {name: count for _, name, count in res}
    all_df_res.append(df_res)
    all_missing_words.append(missing_words)
    # rewrite the checkpoint after every model so an interrupted run keeps its results
    with open("word2vec_results1.pickle", "wb") as f:
        pickle.dump({"score_matrix": all_df_res, 'missing_words': all_missing_words}, f)

KernelInterrupted: Execution interrupted by the Jupyter kernel.

In [None]:
# glob.glob("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0/*")
import pickle
# Reload the checkpoint written by the model loop. pickle.load can execute
# arbitrary code, so only open pickle files produced by this notebook.
with open("word2vec_results1.pickle", "rb") as f:
    # named variable instead of `_`, which IPython uses for the last cell output
    saved_results = pickle.load(f)
saved_results['score_matrix'][0]

Unnamed: 0,EN-VERB-143,EN-SimVerb-3500,EN-RG-65,EN-RW-STANFORD,EN-MTurk-771,EN-MEN-TR-3k,EN-MC-30,EN-MTurk-287,EN-SIMLEX-999,EN-WS-353-REL,EN-YP-130,EN-WS-353-ALL,EN-WS-353-SIM
word2vec_mc=10_size=200_window=50_sg=1_IN-IN,0.348385,0.136466,0.689558,0.375208,0.599916,0.705429,0.748075,0.697174,0.250338,0.576391,0.404015,0.622246,0.688071
word2vec_mc=10_size=200_window=50_sg=1_IN-OUT,0.235017,0.155179,0.692447,0.325123,0.575224,0.705933,0.676273,0.681006,0.244184,0.601608,0.41791,0.642435,0.703578
word2vec_mc=10_size=200_window=50_sg=1_OUT-IN,0.241199,0.134058,0.699191,0.345627,0.568313,0.707308,0.774847,0.686461,0.212156,0.606514,0.358022,0.643855,0.70872
word2vec_mc=10_size=200_window=50_sg=1_OUT-OUT,0.200181,0.120349,0.662345,0.357199,0.522494,0.655967,0.705394,0.663086,0.258613,0.478582,0.264028,0.568965,0.65218


## Appendix

1. Check whether the words of a dataset are present in the model's vocabulary

In [None]:
# Count the distinct word_1 entries of EN-VERB-143 that are absent from the
# model's vocabulary after lowercasing and lemmatization.
list_of_words = similarity_datasets['EN-VERB-143']['word_1'].values
unique_words = set(list_of_words)
# a set gives constant-time membership tests instead of scanning the list per word
model_vocabulary = set(model.wv.index2word)
missing_count = sum(lemma(word.lower()) not in model_vocabulary for word in unique_words)
print(f"Unique words: {len(unique_words)}")
# the count is of words NOT in the model, so the label says "missing"
print(f"Words missing from model: {missing_count}")

Unique words: 87
Words in model: 0


In [None]:
# Only the value of a cell's last expression is displayed, so the lookup on
# the next line produces no visible output; the cell shows the result of
# nltk.download instead.
missing_words['EN-VERB-143']
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# "[!(npy)]" is a bracket expression: it matches a single final character that
# is not "(", "n", "p", "y" or ")". That is how names ending in "npy" are
# filtered out here; it is not a match on the ".npy" suffix as a whole.
glob.glob("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=50_sg=1/*[!(npy)]")

['../../../embeddings_lemma/word2vec_mc=10_iter=5_size=200_window=50_sg=1/word2vec_wikiEn20171001_millionSentences_mc=10_iter=5_size=200_window=50_sg=1']