# Word2Vec models comparison

We compare different word2vec models on several intrinsic word-embedding evaluation tasks.

### Import and load datasets

In [33]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
# imports
import glob
import pandas as pd
!pip install gensim
from gensim.models import KeyedVectors
from gensim.models.word2vec import Word2Vec

# load the files
def load_similarity_datasets(data_dir="../data/word-sim"):
    """Load every word-similarity dataset found in ``data_dir``.

    Each file is read as a headerless three-column table. The file is first
    parsed as tab-separated; if that does not produce exactly three columns
    (or cannot be parsed at all) it is re-read as space-separated.

    Args:
        data_dir: Directory holding the dataset files. Defaults to the
            location this notebook has always read from.

    Returns:
        dict mapping the dataset name (file name with ``.txt`` removed) to a
        DataFrame with columns ``word_1``, ``word_2``, ``similarity_score``.

    Raises:
        ValueError: if a file has three columns under neither separator.
    """
    import os  # local import so the function does not rely on another cell

    column_names = ['word_1', 'word_2', 'similarity_score']
    sim_data = {}
    for file_path in glob.glob(os.path.join(data_dir, "*")):
        # Derive the name from the path itself instead of a fixed-width slice,
        # so it stays correct if the directory changes.
        file_name = os.path.basename(file_path).replace(".txt", "")
        print(file_name)
        try:
            df = pd.read_csv(file_path, sep="\t", header=None)
            df.columns = column_names
        except ValueError:
            # Wrong column count (length mismatch) or a pandas parsing error:
            # the file is not tab-separated, so retry with a single space.
            df = pd.read_csv(file_path, sep=" ", header=None)
            df.columns = column_names
        sim_data[file_name] = df
    return sim_data

# Load all similarity datasets into a dict: dataset name -> DataFrame with
# columns word_1, word_2, similarity_score.
similarity_datasets = load_similarity_datasets()

You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m
EN-VERB-143
EN-SimVerb-3500
EN-RG-65
EN-RW-STANFORD
EN-MTurk-771
EN-MEN-TR-3k
EN-MC-30
EN-MTurk-287
EN-SIMLEX-999
EN-WS-353-REL
EN-YP-130
EN-WS-353-ALL
EN-WS-353-SIM


### Load word2vec models

In [10]:
# Load a previously saved gensim Word2Vec model from disk.
# NOTE(review): the file name suggests min_count=10, 5 iterations, 100-d vectors,
# window=5 and sg=0 (CBOW) - confirm against the training script.
model = Word2Vec.load("../../../embeddings_lemma/word2vec_mc=10_iter=5_size=100_window=5_sg=0/word2vec_wikiEn20171001_millionSentences_mc=10_iter=5_size=100_window=5_sg=0")

### Create Word2Vec similarity functions

In [78]:
def word2vec_get_index_by_word(model, word):
    """Look up the position of ``word`` in ``model.wv.index2word``.

    The lookup is a plain ``.index(word)`` call on that sequence, so a word
    that is absent surfaces whatever error that call raises (``ValueError``
    for a Python list).
    """
    vocabulary = model.wv.index2word
    position = vocabulary.index(word)
    return position

def word2vec_get_word_by_index(model, index):
    """Fetch the entry stored at position ``index`` of ``model.wv.index2word``.
    """
    vocabulary = model.wv.index2word
    word = vocabulary[index]
    return word

def word2vec_find_top_similar_words(model, source_word, method='IN-IN', top_n=5, no_self_similarity=True):
    """
    Provided a word, find the top_n most similar words in the model following the method.

    Args:
        model: model exposing ``wv.syn0`` (input weights), ``syn1neg`` (output
            weights) and ``wv.index2word``.
        source_word: word whose neighbours are wanted.
        method: ``"<source>-<target>"`` where each side is ``IN`` or ``OUT``:
            the source vector is taken from the first matrix and compared
            against every row of the second.
        top_n: number of neighbours to return; values larger than the
            vocabulary return every word, values <= 0 return ``[]``.
        no_self_similarity: when True the score at the source word's own index
            is overwritten with -1 so it does not rank first.

    Returns:
        list of ``(word, cosine_similarity)`` tuples, best match first.

    Raises:
        ValueError: if ``method`` is not one of the four supported values
            (the lookup of an unknown ``source_word`` raises as well).
    """
    valid_methods = ("IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT")
    if method not in valid_methods:
        # Previously an unknown method fell through the if/elif chain and
        # crashed later with an UnboundLocalError.
        raise ValueError(f"Unknown method {method!r}; expected one of {valid_methods}")
    source_side, target_side = method.split("-")

    input_weights = model.wv.syn0
    # Only read the output matrix when the method needs it, so IN-IN does not
    # depend on `model.syn1neg` being present.
    output_weights = model.syn1neg if "OUT" in method else None
    weights1 = input_weights if source_side == "IN" else output_weights
    weights2 = input_weights if target_side == "IN" else output_weights

    source_word_index = word2vec_get_index_by_word(model, source_word)
    score = cosine_similarity(weights1[source_word_index].reshape(1, -1), weights2)[0]
    if no_self_similarity:
        score[source_word_index] = -1 # negate self-similarity

    if top_n <= 0:
        return []
    if top_n < len(score):
        # argpartition is only valid for kth < len(score)
        top_n_similar_words = np.argpartition(-score, top_n)[:top_n]
    else:
        # asking for at least the whole vocabulary: rank everything
        top_n_similar_words = np.arange(len(score))
    return sorted([(word2vec_get_word_by_index(model, index), score[index]) for index in top_n_similar_words],
                  key=lambda x: x[1],
                  reverse=True)

def word2vec_find_similarity(model, source_word, target_word, method="IN-IN"):
    """Return the cosine similarity between two words based on the suggested method

    Args:
        model: model exposing ``wv.syn0`` (input weights), ``syn1neg`` (output
            weights) and ``wv.index2word``.
        source_word: word whose vector is taken from the first matrix.
        target_word: word whose vector is taken from the second matrix.
        method: ``"<source>-<target>"`` where each side is ``IN`` or ``OUT``.

    Returns:
        The first row of the ``cosine_similarity`` result for the two vectors,
        i.e. an array holding the single score; callers index ``[0]`` to get
        the scalar.

    Raises:
        ValueError: if ``method`` is not one of the four supported values
            (the lookup of an unknown word raises as well).
    """
    valid_methods = ("IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT")
    if method not in valid_methods:
        # Previously an unknown method fell through the if/elif chain and
        # crashed later with an UnboundLocalError.
        raise ValueError(f"Unknown method {method!r}; expected one of {valid_methods}")
    source_side, target_side = method.split("-")

    input_weights = model.wv.syn0
    # Only read the output matrix when the method needs it, so IN-IN does not
    # depend on `model.syn1neg` being present.
    output_weights = model.syn1neg if "OUT" in method else None
    weights1 = input_weights if source_side == "IN" else output_weights
    weights2 = input_weights if target_side == "IN" else output_weights

    source_word_index = word2vec_get_index_by_word(model, source_word)
    target_word_index = word2vec_get_index_by_word(model, target_word)
    score = cosine_similarity(weights1[source_word_index].reshape(1, -1),
                              weights2[target_word_index].reshape(1, -1))[0]
    return score
# word2vec_find_similarity(model, "car", "truck", "IN-OUT")
# word2vec_find_top_similar_words(model, "car", "IN-IN")
# word2vec_find_top_similar_words(model, "car", "IN-OUT")

  app.launch_new_instance()


[('motorcycle', 0.1590093),
 ('racing', 0.15262602),
 ('driver', 0.14650589),
 ('truck', 0.14469106),
 ('motor', 0.14405525)]

### Generate stats for each similarity dataset

In [106]:
%%capture
# Score every word pair of one dataset with all four IN/OUT combinations and
# collect the results next to the human similarity score.
df = similarity_datasets['EN-SIMLEX-999'].copy()
methods = ["IN-IN", "IN-OUT", "OUT-IN", "OUT-OUT"]  # loop-invariant, built once
dimension = model.syn1neg.shape[1]
score_rows = []
for row in df.to_dict(orient="records"):
    for method in methods:
        try:
            sim_score = word2vec_find_similarity(model, row['word_1'], row['word_2'], method)[0]
        except ValueError:
            # Expected failure: a word is missing from the model vocabulary
            # (the index lookup raises ValueError). Anything else is a real
            # error and should not be silently turned into a missing score.
            sim_score = None
        row[f'word2vec_{dimension}_{method}_sim_score'] = sim_score
    score_rows.append(row)
# One DataFrame row per word pair; missing scores become NaN.
score_table = pd.DataFrame(score_rows)

In [110]:
# Pearson correlation between the human scores and each model score.
# Keep only numeric columns first: the word_1 / word_2 string columns make
# DataFrame.corr raise on pandas >= 2.0, where numeric_only defaults to False.
score_table.dropna().select_dtypes(include="number").corr("pearson")

Unnamed: 0,similarity_score,word2vec_100_IN-IN_sim_score,word2vec_100_IN-OUT_sim_score,word2vec_100_OUT-IN_sim_score,word2vec_100_OUT-OUT_sim_score
similarity_score,1.0,0.378299,0.284335,0.267766,0.326836
word2vec_100_IN-IN_sim_score,0.378299,1.0,0.629071,0.64211,0.790624
word2vec_100_IN-OUT_sim_score,0.284335,0.629071,1.0,0.920948,0.475211
word2vec_100_OUT-IN_sim_score,0.267766,0.64211,0.920948,1.0,0.47515
word2vec_100_OUT-OUT_sim_score,0.326836,0.790624,0.475211,0.47515,1.0


In [None]:
# score_table.isnull().sum()