In [23]:
import os
import pickle
from pathlib import Path

import gensim
import gensim.downloader as api
from gensim.test.utils import datapath

import numpy as np
import pandas as pd

from scipy.stats import spearmanr

## Evaluate the off-the-shelf *Google News Corpus* word2vec

In [2]:
# Load the pretrained Google News word2vec vectors through gensim's downloader API.
model = api.load(name='word2vec-google-news-300')

`gensim` comes with a number of standard benchmark datasets ([here](https://github.com/RaRe-Technologies/gensim/tree/develop/gensim/test/test_data))

## SimLex

Source: [Hill et al. (2014)](https://arxiv.org/pdf/1408.3456v1.pdf)

In [16]:
# Resolve the SimLex-999 benchmark file inside gensim's bundled test data.
fp = datapath('simlex999.txt')
# Tab-separated file; the first line is skipped, so the line after it
# supplies the column names.
simlex999 = pd.read_table(fp, sep='\t', skiprows=1)
simlex999.head()

Unnamed: 0,# Word 1,Word 2,Human (mean)
0,old,new,1.58
1,smart,intelligent,9.2
2,hard,difficult,8.77
3,happy,cheerful,9.55
4,hard,easy,0.95


The values in "Human (mean)" record the average word-pair similarity rating given by a number of human coders (scale 0-10).

In [24]:
# benchmark: model similarity for every SimLex-999 word pair.
# The first two columns hold the two words of each pair; they are read by
# position (.iloc) because the column labels are strings.
# Pairs containing a word the model does not know get NaN, so `sims` stays
# aligned row-for-row with the human ratings.
sims = [
    model.similarity(w1, w2) if w1 in model and w2 in model else np.nan
    for w1, w2 in zip(simlex999.iloc[:, 0], simlex999.iloc[:, 1])
]

# compute correlation between human similarity ratings and cosine similarity scores
# (nan_policy='omit' leaves out pairs the model could not score; the default
# would turn the whole result into NaN as soon as one pair is missing)
spearmanr(simlex999.iloc[:, -1], sims, nan_policy='omit')

SignificanceResult(statistic=0.44196551091403796, pvalue=5.068221892023142e-49)

## WordSim353

Source: [Agirre et al. (2009)](https://aclanthology.org/N09-1003/)

In [21]:
# Resolve the WordSim-353 benchmark file inside gensim's bundled test data.
fp = datapath('wordsim353.tsv')
# Tab-separated file; the first line is skipped, so the line after it
# supplies the column names.
wordsim353 = pd.read_table(fp, sep='\t', skiprows=1)
wordsim353.head()

Unnamed: 0,# Word 1,Word 2,Human (mean)
0,love,sex,6.77
1,tiger,cat,7.35
2,tiger,tiger,10.0
3,book,paper,7.46
4,computer,keyboard,7.62


In [25]:
# benchmark: model similarity for every WordSim-353 word pair.
# The first two columns hold the two words of each pair; they are read by
# position (.iloc) because the column labels are strings.
# Pairs containing a word the model does not know get NaN, so `sims` stays
# aligned row-for-row with the human ratings.
sims = [
    model.similarity(w1, w2) if w1 in model and w2 in model else np.nan
    for w1, w2 in zip(wordsim353.iloc[:, 0], wordsim353.iloc[:, 1])
]

# compute correlation between human similarity ratings and cosine similarity scores
# (nan_policy='omit' leaves out pairs the model could not score; the default
# would turn the whole result into NaN as soon as one pair is missing)
spearmanr(wordsim353.iloc[:, -1], sims, nan_policy='omit')

SignificanceResult(statistic=0.7000166486272194, pvalue=2.8686666605142608e-53)

In [31]:
# Directory holding the pretrained SGNS files. Defaults to the original local
# download location; set the SGNS_DIR environment variable to point elsewhere.
SGNS_DIR = Path(os.environ.get('SGNS_DIR', '/Users/hlicht/Downloads/sgns'))

# Embedding matrix for the "1840" files (presumably one row per vocabulary
# entry -- the printed shape and vocabulary size below should agree).
wv = np.load(SGNS_DIR / '1840-w.npy')
print(wv.shape)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open vocabulary files obtained from a trusted source.
with open(SGNS_DIR / '1840-vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
print(len(vocab))
vocab[:10]

(100000, 300)
100000


['the', 'of', 'to', 'and', 'in', 'a', 'that', 'is', 'it', 'be']