In [1]:
import requests
from gensim.models import KeyedVectors

from embedtrics.intrusion import intrusion_score
from embedtrics.similarity import load_sim_data, similarity_score
from embedtrics.analogies import analogies_score

We then use the open-source [Gensim](https://radimrehurek.com/gensim/index.html) library to load and manipulate the pretrained word2vec weights and apply them to our dataset.

First, download the Google News word2vec weights:

In [12]:
# Download the pretrained Google News word2vec weights to `filepath`.
url = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
filepath = 'GoogleNews-vectors-negative300.bin.gz'

# Stream the response so the whole archive is never held in memory at once,
# and write through a context manager so the file handle is always closed.
with requests.get(url, stream=True) as r:
    # Fail loudly on an HTTP error instead of saving an error page as weights.
    r.raise_for_status()
    with open(filepath, 'wb') as out_file:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            out_file.write(chunk)

In [2]:
# Read the gzipped binary word2vec file from the working directory into a
# Gensim KeyedVectors object.
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [3]:
# Embedding matrix as stored on the loaded model.
embedding_weights = model.vectors

# Gensim 4.0 renamed `index2word` to `index_to_key` (the old name raises
# AttributeError there), so use whichever attribute this Gensim version has.
vocab_words = model.index_to_key if hasattr(model, 'index_to_key') else model.index2word

# Map each vocabulary word to its position in `vocab_words`.
embedding_dict = {word: idx for idx, word in enumerate(vocab_words)}

### Word Intrusion

In [25]:
# Word-intrusion metric on the raw embedding matrix.
# NOTE(review): meaning of k and N is defined in embedtrics.intrusion — confirm there.
intrusion_score(
    embedding_weights,
    k=5,
    N=10,
)

0.8417331616670378

### Word Similarity

In [27]:
# Load the SimLex999 similarity data from the local data directory, then
# score the embeddings against it (the cell displays the returned value).
sim_data = load_sim_data(
    'data/SimLex999.txt',
    'SimLex999',
)
similarity_score(
    sim_data,
    embedding_weights,
    embedding_dict,
    verbose=False,
)

(0.44196551091403796, 5.068221892023142e-49)

### Word Analogies

In [4]:
# Score the embeddings on the analogy questions file; called with lower=True
# and verbose output disabled. The cell displays the returned results.
analogies_score(
    'data/questions-words.txt',
    embedding_weights,
    embedding_dict,
    lower=True,
    verbose=False,
)

defaultdict(list,
            {'overall_total_time': 207.9792959690094,
             'total_correct': 6972,
             'total_found': 13705,
             'total_accuracy': 0.5087194454578621,
             'scores': defaultdict(dict,
                         {'capital-common-countries': {'n_found': 380,
                           'n_not_found': 126,
                           'n_correct': 0},
                          'capital-world': {'n_found': 647,
                           'n_not_found': 3877,
                           'n_correct': 0},
                          'currency': {'n_found': 502,
                           'n_not_found': 364,
                           'n_correct': 0},
                          'city-in-state': {'n_found': 1627,
                           'n_not_found': 840,
                           'n_correct': 0},
                          'family': {'n_found': 506,
                           'n_not_found': 0,
                           'n_correct': 0},
           