# Comparing Word Embeddings

The goal of this notebook is to use the tools in the [Tutte Institute ``vectorizers`` library](https://github.com/TutteInstitute/vectorizers) to construct word embeddings, and to compare those embeddings with the results from word2vec. We compare:
* Gensim word2vec
* 150 dimensional `word_vectorizer` (vectorizers library)
* 300 dimensional `word_vectorizer` (vectorizers library)

We will compare these techniques using standard word-similarity and word-analogy benchmarks.

Rather than comparing against a pre-trained word2vec model, we will train all of our word embedding algorithms on the same standard corpus, and then compare them. This puts every method on a common "start line", and avoids the hidden cost (and advantage) of using a pre-trained model from labs or institutions with large compute budgets and better data access.

In [1]:
# Load IPython's autoreload extension and use mode 2 for this session.
%load_ext autoreload
%autoreload 2

In [2]:
import gensim.downloader as api



In [3]:
# Display the gensim-data catalogue entry (size, format, description) for the text8 corpus.
api.info('text8')

{'num_records': 1701,
 'record_format': 'list of str (tokens)',
 'file_size': 33182058,
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py',
 'license': 'not found',
 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.',
 'checksum': '68799af40b6bda07dfa47a32612e5364',
 'file_name': 'text8.gz',
 'read_more': ['http://mattmahoney.net/dc/textdata.html'],
 'parts': 1}

---
## Setup

In [4]:
import os
import warnings

import numpy as np
import pandas as pd
import scipy.stats

from svd2vec import FilesIO
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors

import vectorizers

import matplotlib.pyplot as plt
import seaborn as sns

import umap
import umap.plot
from bokeh.plotting import show

# Suppress every UserWarning for the remainder of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# Presumably directs umap.plot's interactive (bokeh) figures to render inline -- confirm against umap.plot docs.
umap.plot.output_notebook()
# Seaborn styling. Note that sns.set(...) runs after set_palette(...), so it has the final say on the theme.
sns.set_palette('deep')
sns.set(style="white", context="poster")

In [5]:
# The gensim-data text8 records are already tokenized ('list of str (tokens)' in the
# catalogue entry above), so the corpus is materialised as a list without further processing.
# Alternative loader kept for reference:
#documents = FilesIO.load_corpus("text8")
documents = list(api.load("text8"))

In [6]:
def save_word2vec_format(fname, vocab, vectors):
    """Write word vectors to ``fname`` in the plain-text word2vec format.

    The file starts with a ``"<n_words> <n_dims>"`` header followed by one
    ``"<token> <v1> <v2> ..."`` line per word, which is the layout that
    ``Word2VecKeyedVectors.load_word2vec_format`` reads with its default
    (non-binary) settings.

    Parameters
    ----------
    fname : str or path-like
        Destination file; overwritten if it already exists.
    vocab : dict
        Maps each token to the row index of its vector in ``vectors``.
    vectors : array-like of shape (n_rows, n_dims)
        Dense matrix of word vectors.

    Raises
    ------
    ValueError
        If ``vectors`` is not two dimensional, or if a token contains
        whitespace (which would corrupt the space-separated format).

    Notes
    -----
    Tokens whose index has no corresponding row in ``vectors`` are skipped,
    so the header count always matches the number of lines written.
    """
    vectors = np.asarray(vectors)
    if vectors.ndim != 2:
        raise ValueError("vectors must be a 2-dimensional array of shape (n_rows, n_dims)")
    n_rows, n_dims = vectors.shape

    # Sort by row index so that line i of the file corresponds to row i of the matrix.
    entries = sorted(
        (int(index), str(token)) for token, index in vocab.items() if 0 <= index < n_rows
    )
    for _, token in entries:
        if not token or any(ch.isspace() for ch in token):
            raise ValueError(f"token {token!r} cannot be stored in word2vec text format")

    with open(fname, "w", encoding="utf-8") as fout:
        fout.write(f"{len(entries)} {n_dims}\n")
        for index, token in entries:
            values = " ".join(repr(float(value)) for value in vectors[index])
            fout.write(f"{token} {values}\n")

----
## Vectorizer Word Embeddings

In [None]:
%%time
# Fit the vectorizers-library token co-occurrence model on the tokenized text8 documents.
# Both reduced embeddings below are derived from this single fitted object.
word_vectorizer = vectorizers.TokenCooccurrenceVectorizer(
    # Same threshold value (50) as min_count in the gensim Word2Vec cell further down.
    min_occurrences=50,
    # Two windows are configured in parallel: radius 1 ("fixed") and radius 15 ("variable"),
    # each with a "geometric" kernel; the second kernel receives offset=1.
    window_radii=(1, 15),
    window_functions=("fixed", "variable"),
    kernel_functions=("geometric", "geometric"),
    kernel_args=({}, {"offset":1}),
    # This mask string is the token that the comparison functions later exclude from the vocabulary.
    mask_string="[##MASK##]",
    nullify_mask=True,
    n_iter=4,
    normalize_windows=True,
    normalization="frequentist",
).fit(documents)

### 150 Dimensional Word Embedding

In [None]:
%%time
# Reduce the fitted vectorizer to 150 dimensions, write the vectors to disk with
# save_word2vec_format, and reload them as gensim KeyedVectors so the same
# evaluation code can be applied to this embedding and to gensim's Word2Vec.
word_vectors = word_vectorizer.reduce_dimension(dimension=150, algorithm="randomized")
save_word2vec_format("./tcv_150_dim.word2vec", word_vectorizer.token_label_dictionary_, word_vectors)
model1 = Word2VecKeyedVectors.load_word2vec_format("./tcv_150_dim.word2vec")

### 300 Dimensional Word Embedding

In [None]:
%%time
# Same pipeline as the 150-dimensional cell, with dimension=300.
# NOTE: `word_vectors` is rebound here, so the 150-dimensional array is no longer
# reachable under that name afterwards (model1 keeps its own copy loaded from disk).
word_vectors = word_vectorizer.reduce_dimension(dimension=300, algorithm="randomized")
save_word2vec_format("./tcv_300_dim.word2vec", word_vectorizer.token_label_dictionary_, word_vectors)
model2 = Word2VecKeyedVectors.load_word2vec_format("./tcv_300_dim.word2vec")

## Word2Vec Embedding

In [None]:
%%time
if not os.path.isfile("gensim_w2v_50_window_10.word2vec"):
    g_w2v = Word2Vec(documents, vector_size=300, window=10, min_count=50, workers=16, sample=1e-3, epochs=30)
    g_w2v.wv.save_word2vec_format("gensim_w2v_50_window_10.word2vec")

gensim_w2v = Word2VecKeyedVectors.load_word2vec_format("gensim_w2v_50_window_10.word2vec")

----
## Similarity comparison plots

In [None]:
def compare_similarity(model, model_name, d="\t", n_bootstrap_samples=50, random_state=None):
    """Bootstrap correlation scores of ``model`` on the word-similarity benchmarks.

    For each benchmark file the word pairs are lower-cased, pairs with a word
    missing from the model vocabulary are dropped, the model similarity of every
    remaining pair is computed once, and the (gold, model) pairs are then
    resampled with replacement ``n_bootstrap_samples`` times. Each resample has
    as many rows as there are in-vocabulary pairs.

    Parameters
    ----------
    model : gensim KeyedVectors
        Embedding to evaluate. Its ``key_to_index`` is temporarily replaced by a
        lower-cased vocabulary without the ``"[##MASK##]"`` token and is always
        restored before returning, even when an error occurs.
    model_name : str
        Label stored in the ``model`` column of the result.
    d : str, default "\\t"
        Column delimiter of the benchmark files.
    n_bootstrap_samples : int, default 50
        Number of bootstrap resamples per benchmark.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling; ``None`` gives unseeded draws.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``benchmark``, ``correlation_type``
        (``spearman``, ``pearson``, ``kendalltau`` or ``missed_word_ratio``),
        ``correlation`` and ``model``. ``missed_word_ratio`` is the fraction of
        benchmark pairs dropped because a word is out of vocabulary. The three
        correlations are NaN when fewer than two pairs are usable.
    """
    mask_token = "[##MASK##]"
    benchmarks = ("wordsim353", "men_dataset", "mturk", "simlex999", "rarewords")
    score_columns = ("spearman", "pearson", "kendalltau", "missed_word_ratio")
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Iterating in reverse means that, when two keys collide after lower-casing,
    # the entry that appears earlier in index_to_key wins.
    ok_vocab = {
        k.lower(): model.key_to_index[k]
        for k in reversed(model.index_to_key)
        if k != mask_token
    }
    original_vocab = model.key_to_index
    model.key_to_index = ok_vocab

    result = {}
    try:
        for benchmark in benchmarks:
            benchmark_path = FilesIO.path(f"similarities/{benchmark}.txt")
            all_pairs = []
            with open(benchmark_path) as f:
                for line in f:
                    # Skip comment lines and blank lines (e.g. a trailing newline).
                    if line.startswith("#") or not line.strip():
                        continue
                    a, b, sim = [word.lower() for word in line.split(d)]
                    all_pairs.append((a, b, float(sim)))
            all_pairs = pd.DataFrame(all_pairs, columns=("word", "comparison", "similarity"))

            in_vocab = all_pairs.word.isin(ok_vocab) & all_pairs.comparison.isin(ok_vocab)
            good_pairs = all_pairs[in_vocab]
            n_pairs = len(good_pairs)
            oov_ratio = 1.0 - n_pairs / len(all_pairs) if len(all_pairs) else np.nan

            # Model similarities do not change between resamples, so compute them once.
            gold = good_pairs.similarity.to_numpy(dtype=float)
            predicted = np.array(
                [model.similarity(a, b) for a, b in zip(good_pairs.word, good_pairs.comparison)],
                dtype=float,
            )

            rows = []
            for _ in range(n_bootstrap_samples):
                if n_pairs < 2:
                    # Correlations are undefined for fewer than two observations.
                    rows.append((np.nan, np.nan, np.nan, oov_ratio))
                    continue
                picks = rng.randint(0, n_pairs, size=n_pairs)
                gold_sample = gold[picks]
                predicted_sample = predicted[picks]
                spearman = scipy.stats.spearmanr(gold_sample, predicted_sample)[0]
                pearson = scipy.stats.pearsonr(gold_sample, predicted_sample)[0]
                kendalltau = scipy.stats.kendalltau(gold_sample, predicted_sample)[0]
                rows.append((spearman, pearson, kendalltau, oov_ratio))
            result[benchmark] = rows
    finally:
        # Always hand the model back with its original vocabulary mapping.
        model.key_to_index = original_vocab

    dfs = []
    for benchmark, rows in result.items():
        scores = pd.DataFrame(rows, columns=score_columns)
        scores["benchmark"] = benchmark
        dfs.append(scores)
    df = pd.concat(dfs)

    df = df.melt("benchmark")
    df.columns = ("benchmark", "correlation_type", "correlation")
    df["model"] = model_name

    return df

In [None]:
%%time
# Score every embedding on the similarity benchmarks and stack the long-format
# results; the second argument is the label stored in the `model` column.
results = pd.concat([
    compare_similarity(gensim_w2v, "gensim"),
    #compare_similarity(word2vec_w2v, "word2vec"),
    compare_similarity(model1, "150 dim"),
    compare_similarity(model2, "300 dim"),
])

In [None]:
# Keep the two benchmarks and the two correlation types that are shown in the figure.
df = results[
    (results.benchmark.isin(["men_dataset", "wordsim353"])) &
    (results.correlation_type.isin(["spearman", "pearson"]))
].copy()
df.benchmark = df.benchmark.map({"wordsim353": "WordSim 353", "men_dataset": "MEN Word Similarity"})

# Left-to-right order of the models on the x axis (shared by both layers).
model_order = ["gensim", "150 dim", "300 dim"]

# catplot creates its own figure, so no separate plt.figure() call is made here
# (one would only leave an empty extra figure in the output).
g = sns.catplot(
    kind="swarm",
    x="model",
    y="correlation",
    col="benchmark",
    row="correlation_type",
    order=model_order,
    data=df,
    height=8,
    alpha=0.33,
    zorder=1,
    s=10,
)
# Overlay a point estimate with standard-deviation bars on every facet.
# map_dataframe supplies each facet's subset as `data`, so only arguments that
# belong to pointplot itself are passed (no col/row/data/height/aspect).
g.map_dataframe(
    sns.pointplot,
    x="model",
    y="correlation",
    palette="dark",
    join=False,
    order=model_order,
    ci="sd",
    zorder=2,
    scale=0.9,
)
g.set_titles(template="{col_name}\n{row_name} correlation")
g.set_xticklabels([
    "Gensim\nWord2Vec",
    "Vectorizers\n150 dim",
    "Vectorizers\n300 dim",
])
g.tight_layout();

In [None]:
# The MASK placeholder used for uncommon vocabulary can occasionally cause odd
# behaviour in the evaluation below, so drop its label from each vectorizers
# model if it is still listed. Only index_to_key is edited.
for embedding in (model1, model2):
    if "[##MASK##]" in embedding.index_to_key:
        embedding.index_to_key.remove("[##MASK##]")

In [None]:
# Cross-check with gensim's built-in word-pair evaluation: print the first
# element of the first returned item for every model on every benchmark.
labelled_models = (
    ("\tgensim_w2v   \t\t", gensim_w2v),
    ("\t150 dim      \t\t", model1),
    ("\t300 dim      \t\t", model2),
)
for benchmark in ("wordsim353", "men_dataset", "mturk", "simlex999", "rarewords"):
    benchmark_path = FilesIO.path(f"similarities/{benchmark}.txt")
    print("pearson correlation of", os.path.basename(benchmark_path))
    for line_prefix, embedding in labelled_models:
        pair_scores = embedding.evaluate_word_pairs(benchmark_path, delimiter="\t")
        print(line_prefix, pair_scores[0][0])

---
## Analogy comparison plots

In [None]:
def compare_analogies(model, model_name, n_bootstrap_samples=20, random_state=None):
    """Bootstrap analogy accuracy of ``model`` on the analogy benchmarks.

    Every analogy ``a : b :: c : expected`` is lower-cased; analogies with a
    word missing from the model vocabulary are dropped. Each remaining analogy
    is evaluated once and counts as correct when ``expected`` is matched among
    the 5 nearest neighbours of ``b + c - a`` (see the inline comment for the
    exact rule). The resulting hit/miss outcomes are then resampled with
    replacement ``n_bootstrap_samples`` times.

    Parameters
    ----------
    model : gensim KeyedVectors
        Embedding to evaluate. Its ``key_to_index`` is temporarily replaced by a
        lower-cased vocabulary without the ``"[##MASK##]"`` token and is always
        restored before returning, even when an error occurs.
    model_name : str
        Label stored in the ``model`` column of the result.
    n_bootstrap_samples : int, default 20
        Number of bootstrap resamples per benchmark.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for the resampling; ``None`` gives unseeded draws.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``benchmark``, ``score_type``
        (``accuracy`` or ``missing_rate``), ``score`` and ``model``.
        ``missing_rate`` is the fraction of analogies dropped because a word is
        out of vocabulary; ``accuracy`` is NaN when no analogy is usable.
    """
    mask_token = "[##MASK##]"
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Iterating in reverse means that, when two keys collide after lower-casing,
    # the entry that appears earlier in index_to_key wins.
    ok_vocab = {
        k.lower(): model.key_to_index[k]
        for k in reversed(model.index_to_key)
        if k != mask_token
    }
    original_vocab = model.key_to_index
    model.key_to_index = ok_vocab

    result = {}
    try:
        for benchmark in ("questions-words", "msr"):
            benchmark_path = FilesIO.path(f"analogies/{benchmark}.txt")
            analogies = []
            with open(benchmark_path) as f:
                for line in f:
                    # ': ' lines are section headers; blank lines carry no analogy.
                    if line.startswith(': ') or not line.strip():
                        continue
                    a, b, c, expected = [word.lower() for word in line.split()]
                    analogies.append((a, b, c, expected))
            analogies = pd.DataFrame(analogies, columns=("from", "to", "as_from", "as_to"))

            in_vocab = (
                analogies["from"].isin(ok_vocab)
                & analogies["to"].isin(ok_vocab)
                & analogies["as_from"].isin(ok_vocab)
                & analogies["as_to"].isin(ok_vocab)
            )
            good_analogies = analogies[in_vocab]
            n_good = len(good_analogies)
            missed = 1.0 - n_good / len(analogies) if len(analogies) else np.nan

            # The model's answer to an analogy does not change between resamples,
            # so each analogy is evaluated exactly once.
            hits = []
            for a, b, c, expected in good_analogies.itertuples(index=False):
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                sims = model.most_similar(positive=[b, c], negative=[a], topn=5)
                # Stop at the first valid candidate equal to `expected`; otherwise
                # `predicted` ends up as the last of the 5 candidates.
                for element in sims:
                    predicted = element[0].lower()
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted == expected:
                            break
                hits.append(predicted == expected)
            hits = np.asarray(hits, dtype=float)

            rows = []
            for _ in range(n_bootstrap_samples):
                if n_good == 0:
                    # No usable analogy: accuracy is undefined rather than a crash.
                    accuracy = np.nan
                else:
                    picks = rng.randint(0, n_good, size=n_good)
                    accuracy = float(hits[picks].mean())
                rows.append((accuracy, missed))
            result[benchmark] = rows
    finally:
        # Always hand the model back with its original vocabulary mapping.
        model.key_to_index = original_vocab

    dfs = []
    for benchmark, rows in result.items():
        scores = pd.DataFrame(rows, columns=("accuracy", "missing_rate"))
        scores["benchmark"] = benchmark
        dfs.append(scores)
    df = pd.concat(dfs)

    df = df.melt("benchmark")
    df.columns = ("benchmark", "score_type", "score")
    df["model"] = model_name

    return df
                    

In [None]:
# Score every embedding on the analogy benchmarks and stack the long-format
# results; the second argument is the label stored in the `model` column.
analogy_results = pd.concat([
    compare_analogies(gensim_w2v, "gensim"),
    #compare_analogies(word2vec_w2v, "word2vec"),
    compare_analogies(model1, "150 dim"),
    compare_analogies(model2, "300 dim"),
])

In [None]:
# Only the accuracy scores are plotted; missing_rate rows are left out.
df = analogy_results[
    (analogy_results.score_type.isin(["accuracy"]))
].copy()
df.benchmark = df.benchmark.map({"questions-words": "Google Analogies", "msr": "Microsoft Research Analogies"})

# Left-to-right order of the models on the x axis (shared by both layers).
model_order = ["gensim", "150 dim", "300 dim"]

# catplot creates its own figure, so no separate plt.figure() call is made here
# (one would only leave an empty extra figure in the output).
g = sns.catplot(
    kind="swarm",
    x="model",
    y="score",
    col="benchmark",
    order=model_order,
    data=df,
    height=8,
    alpha=0.33,
    zorder=1,
    s=10,
)
# Overlay a point estimate with standard-deviation bars on every facet.
# map_dataframe supplies each facet's subset as `data`, so only arguments that
# belong to pointplot itself are passed (no col/data/height/aspect).
g.map_dataframe(
    sns.pointplot,
    x="model",
    y="score",
    palette="dark",
    join=False,
    order=model_order,
    ci="sd",
    zorder=2,
    scale=0.9,
)
g.set_titles(template="{col_name}")
g.set_xticklabels([
    "Gensim\nWord2Vec",
    "Vectorizers\n150 dim",
    "Vectorizers\n300 dim",
])
g.tight_layout();

In [None]:
# Some weirdness occasionally occurs due to our use of MASK values for uncommon vocab; fix this if required.
# Only the label list (index_to_key) is edited; vectors and key_to_index are not touched here.
# This repeats the cleanup done before the word-pair printout, so it does nothing if that cell already ran.
if "[##MASK##]" in model1.index_to_key: model1.index_to_key.remove("[##MASK##]")
if "[##MASK##]" in model2.index_to_key: model2.index_to_key.remove("[##MASK##]")

In [None]:
# Cross-check with gensim's built-in analogy evaluation: print the first
# element of the returned result for every model on every benchmark.
labelled_models = (
    ("\tgensim_w2v   \t\t", gensim_w2v),
    ("\t150 dim      \t\t", model1),
    ("\t300 dim      \t\t", model2),
)
for benchmark in ("questions-words", "msr"):
    benchmark_path = FilesIO.path(f"analogies/{benchmark}.txt")
    print("analogies success rate of", os.path.basename(benchmark_path))
    for line_prefix, embedding in labelled_models:
        analogy_scores = embedding.evaluate_word_analogies(benchmark_path)
        print(line_prefix, analogy_scores[0])

#### The 150 dimensional word embedding seems like great value. 

---
## Visualization via UMAP Embedding 
We'll just look at the 150 dimensional word vectors

In [None]:
%%time
hover_data = pd.DataFrame()
hover_data['vocab'] = model2.index_to_key
mapper = umap.UMAP(n_neighbors=25, metric = 'cosine', random_state=42).fit(model1.vectors[:-1,:])

In [None]:
# Interactive plot of the UMAP layout, coloured by log token frequency.
# The figure is bound to its own name rather than `plt`, so the
# matplotlib.pyplot alias imported at the top of the notebook stays usable
# if earlier plotting cells are re-run.
umap_figure = umap.plot.interactive(
    mapper,
    hover_data=hover_data,
    point_size=5,
    # One value per plotted point is expected -- presumably _token_frequencies_
    # follows the vectorizer's token order; verify against the vectorizers library.
    values=np.log(word_vectorizer._token_frequencies_),
    interactive_text_search=True,
    interactive_text_search_alpha_contrast=0.99)
show(umap_figure)