# Evaluation style thesauruses and underlying embeddings

1. Look at word-level PIP loss
2. Look at overlap between simple_lookup and synonym/related word lists

In [2]:
import gensim
import numpy as np
import pickle
import random
import re
import requests
import spacy

from time import time

from src.simple_lookup import simple_lookup, get_pos, get_vocab

Loading embeddings from dat/annoy directory...
EMBEDDINGS:
 dict_keys(['word2vec-slim', 'poetry', 'australia', 'joyce', 'oneb', 'dickens', 'law', 'joyce-dep', 'merge-science-big', 'darwin100dep', 'darwin_0', 'darwin', 'darwin_1', 'gandhi', 'darwin_3', 'nyt-science', 'darwin_2', 'darwin_4', 'aretha', 'sherlock', 'arxiv_abs', 'merge-science-small', 'glove-slim'])
VOCAB:
 dict_keys(['glove-slim', 'foodreviews', 'oneb', 'poetry', 'gandhi', 'nyt-science', 'darwin', 'joyce-dep', 'darwin_1', 'aretha', 'darwin_3', 'darwin100dep', 'dickens', 'merge-science-small', 'arxiv_abs', 'darwin_2', 'joyce', 'word2vec-slim', 'australia', 'darwin_0', 'law', 'darwin_4', 'sherlock', 'merge-science-big'])
PART OF SPEECH:
 dict_keys(['merge-science-big', 'dickens', 'joyce', 'poetry', 'merge-science-small', 'arxiv_abs', 'sherlock', 'nyt-science', 'darwin', 'gandhi'])
Ready!




In [3]:
simple_lookup('propose', 'VERB', embkey="arxiv_abs")['words']

['present',
 'develop',
 'devise',
 'describe',
 'adopt',
 'employ',
 'formulate',
 'advocate',
 'explore',
 'extend']

In [4]:
base_path = 'dat/vecs/'
embd_nmes = ['glove-slim', 'word2vec-slim', 'arxiv_abs', 'nyt-science']
embd_vecs = []
for nme in embd_nmes:
    fle = base_path + nme + '.txt'
    vec = gensim.models.KeyedVectors.load_word2vec_format(fle)
    embd_vecs.append(vec)
    print('.', end=' ')

. . . . 

## Look at PIP loss

Function for single word PIP loss, both local and global. Look at difference between lots of embeddings for a single word, or between just two embeddings for a bunch of words.

In [5]:
def norm(v):
    return v / np.sqrt((np.sum(v**2)))

def pip_singleword(response, embeddings, emb_names, local=False):
    """Return dict of PIP 'vector' for response word; one for each embedding.
    
    If local, PIP matrix is only over top 50 words nearest to response.
        (Has to be shared in all embeddings)
    Else PIP matrix is over shared vocab words across all embeddings.
    
    PIP 'vector' is the distance from response to every other word in relevant vocab.
    
    :param response: word to calculate distances from
    :param embeddings: list of gensim embeddings
    :param emb_names: list of names of gensim embeddings
    :param local: boolean indicator
    :return pips: dict, key is embedding name, value is pip vector
    """
    vocabs = []
    for emb, nme in zip(embeddings, emb_names):
        if response not in emb.vocab:
            raise ValueError('response word not in {} embedding vocab'.format(nme))
        if local:
            vocabs.append(set([w for w, d in emb.most_similar(response, topn=50)]))
        else:
            vocabs.append(set(emb.vocab))
    
    shared_vocab = list(set.intersection(*vocabs))
    
    pips = {}
    for emb, nme in zip(embeddings, emb_names):
        pip = np.zeros(len(shared_vocab))
        for i, w in enumerate(shared_vocab):
            pip[i] = np.dot(norm(emb[response]), norm(emb[w]))
        pips[nme] = pip
    
    return pips

def get_piploss(pips, emb1, emb2):
    """Return pip loss between pips[emb1] and pips[emb2].
    
    :param pips: dict of emb_name: PIP vector for a single word (precalculated)
    :param emb1: key for pips dict
    :param emb2: key for pips dict
    :return: float
    """
    return np.linalg.norm(pips[emb1] - pips[emb2])

In [6]:
word = 'model'
output = []
for i, (vec, nme) in enumerate(zip(embd_vecs, embd_nmes)):
    for vec1, nme1 in zip(embd_vecs[i:], embd_nmes[i:]):
        pips = pip_singleword(word, [vec, vec1], [nme, nme1])
        output.append((nme+' | '+nme1, get_piploss(pips, nme, nme1)))

sorted(output, key=lambda tup: tup[1])

[('glove-slim | glove-slim', 0.0),
 ('word2vec-slim | word2vec-slim', 0.0),
 ('arxiv_abs | arxiv_abs', 0.0),
 ('nyt-science | nyt-science', 0.0),
 ('word2vec-slim | arxiv_abs', 18.993049814164067),
 ('word2vec-slim | nyt-science', 22.033235389497364),
 ('arxiv_abs | nyt-science', 23.46319974984484),
 ('glove-slim | word2vec-slim', 25.357969622687676),
 ('glove-slim | arxiv_abs', 33.35568080684847),
 ('glove-slim | nyt-science', 33.46942931278002)]

In [7]:
word = 'model'
output = []
for i, (vec, nme) in enumerate(zip(embd_vecs, embd_nmes)):
    for vec1, nme1 in zip(embd_vecs[i:], embd_nmes[i:]):
        pips = pip_singleword(word, [vec, vec1], [nme, nme1], local=True)
        output.append((nme+' | '+nme1, get_piploss(pips, nme, nme1)))

sorted(output, key=lambda tup: tup[1])

  if np.issubdtype(vec.dtype, np.int):


[('glove-slim | glove-slim', 0.0),
 ('word2vec-slim | word2vec-slim', 0.0),
 ('arxiv_abs | arxiv_abs', 0.0),
 ('nyt-science | nyt-science', 0.0),
 ('glove-slim | arxiv_abs', 0.21182660611786047),
 ('arxiv_abs | nyt-science', 0.31366543889324544),
 ('glove-slim | nyt-science', 0.31642927957772454),
 ('word2vec-slim | arxiv_abs', 0.5275869984961908),
 ('word2vec-slim | nyt-science', 0.799887196476512),
 ('glove-slim | word2vec-slim', 0.8978041714705951)]

In [9]:
# get shared vocab
vocabs = []
for emb, nme in zip(embd_vecs, embd_nmes):
    vocabs.append(set(emb.vocab))
shared_vocab = set.intersection(*vocabs)
len(shared_vocab)

11152

In [10]:
embd_nmes

['glove-slim', 'word2vec-slim', 'arxiv_abs', 'nyt-science']

In [12]:
np.set_printoptions(precision=3)

n = 30
words = random.sample(shared_vocab, n)
names = [embd_nmes[2], embd_nmes[0]]
vecs = [embd_vecs[2], embd_vecs[0]]
print(names)

piploss = []
for word in words:
    pips = pip_singleword(word, embd_vecs, embd_nmes)
    piploss.append(get_piploss(pips, names[0], names[1]))
piploss = np.array(piploss)

id_ord = np.argsort(piploss)
for i in id_ord:
    print('{:<15}{:.1f}'.format(words[i], piploss[i]))

['arxiv_abs', 'glove-slim']
answered       18.2
textures       18.5
enrich         18.8
magnitudes     19.8
mutually       19.9
satisfied      20.1
temperature    20.3
shattering     21.4
photographs    22.5
mimics         24.0
projection     24.4
scene          25.0
principles     25.7
conventions    28.0
domination     28.6
disclosed      30.1
translators    30.8
swarms         31.0
abruptly       31.5
allow          32.3
dermatologists 32.5
followed       32.7
because        34.5
diminished     34.6
uncontrollable 35.1
funny          35.1
leg            35.4
professionally 36.7
chi            42.2
dec            44.4


## Look at similarity to synonym and related words lists

Using macos thesaurus as synonym list and moby thesaurus as related word list.

In [21]:
def load_macthes(filepath='dat/other/mac_thes.txt'):
    """Return dict of word: set of synonyms.
    
    Thsi thesaurus is split by pos; ignore for now and throw all
    synoyms into same entry.
    """
    thes = dict()
    with open(filepath, 'r') as fle:
        for line in fle:
            entry, words = line.strip('\n').split(':')
            key, pos = entry.split(' ')
            words = words.split(', ')
            if key in thes:
                thes[key].update(words)
            else:
                thes[key] = set(words)
    return thes

def load_mobythes(filepath='dat/other/mthesaur.txt'):
    """Return dict of word: set of synonyms."""
    thes = dict()
    with open(filepath, 'r') as fle:
        for line in fle:
            words = line.strip('\n').split(',')
            thes[words[0]] = set(words[1:])
    return thes

def get_thes_count(key, words, thes):
    """Return num of words in words that are 'synoyms' of key according to thes.
    
    :param key: str
    :param words: list of str
    :param thes: dict of str to list of str
    :return count: int
    """
    if key not in thes:
        raise ValueError('key ({}) not in thes'.format(key))
    syns = thes[key]
    count = 0
    for w in words:
        if w in syns:
            count += 1
    return count

mac = load_macthes()
mob = load_mobythes()

In [28]:
key = 'deter'
print('mac thes', key.upper(), len(mac[key]), 'entries:')
for w in mac[key]:
    print(w, end=', ')
print('\n\nmob thes', key.upper(), len(mob[key]), 'entries:')
for w in mob[key]:
    print(w, end=', ')

mac thes DETER 27 entries:
hinder, demoralize, check, inhibit, stave off, fend off, avert, disincentivize, obstruct, stop, impede, scare off, prevent, dissuade, halt, block, foil, forestall, intimidate, daunt, hamper, put off, discourage, counteract, ward off, curb, dishearten, 

mob thes DETER 64 entries:
estop, hinder, check, disinterest, faze, bar, debar, scare, wean from, preclude, inhibit, stave off, obviate, awe, fend off, foreclose, deflect, avert, restrain, damp, turn from, quench, obstruct, frighten, keep from, repel, turn aside, disincline, overawe, shake, dampen, exclude, impede, scare off, stop, prevent, ward, turn off, dissuade, shut out, save, fend, block, blunt, prohibit, indispose, forestall, daunt, intimidate, put off, cool, distract, divert, anticipate, turn away, help, chill, discourage, forbid, keep off, ward off, disaffect, dishearten, rule out, 

In [30]:
embd_nmes, embd_vecs

(['glove-slim', 'word2vec-slim', 'arxiv_abs', 'nyt-science'],
 [<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x10eb5c860>,
  <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x16dcdcc88>,
  <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1771f12e8>,
  <gensim.models.keyedvectors.Word2VecKeyedVectors at 0x17791be80>])

In [37]:
key = 'model'
response = [w for w, d in embd_vecs[0].most_similar(key, topn=10)]
print(response)
get_thes_count(key, response, mac)

['models', 'design', 'concept', 'prototype', 'introduced', 'example', 'type', 'developed', 'same', 'version']


4

In [41]:
def get_vocab(emb):
    """Return vocab of gensim emb object as set."""
    return set(emb.vocab.keys())

def get_syn_meas(emb, thes, n=30):
    """Randomly select n words from emb and return array of get_thes_count from top 10.
    
    :param emb: gensim emb ob
    :param thes: dict of words to list of syn
    :param n: number of words from emb vocab to randomly sample
    :return counts: np array
    """
    vocab = list(get_vocab(emb))
    words = random.sample(vocab, n)
    counts = np.zeros(n)
    for i, w in enumerate(words):
        while w not in thes:
            w = random.sample(vocab, 1)[0]
        response = [w for w, d in emb.most_similar(w, topn=10)]
        counts[i] = get_thes_count(w, response, thes)
    return counts

get_syn_meas(embd_vecs[0], mob)

array([0., 4., 0., 1., 6., 4., 0., 1., 1., 0., 1., 3., 0., 2., 4., 0., 5.,
       0., 2., 1., 1., 3., 3., 0., 0., 1., 0., 1., 2., 0.])

In [43]:
for emb, nme in zip(embd_vecs, embd_nmes):
    syns = get_syn_meas(emb, mac, n=1000)
    rels = get_syn_meas(emb, mob, n=1000)
    print('{:<15}{:<15.3f}{:<15.3f}'.format(nme, np.mean(syns), np.mean(rels)))

emb = gensim.models.KeyedVectors.load_word2vec_format('dat/vecs/arxiv_abs_lemma.txt')
syns = get_syn_meas(emb, mac, n=1000)
rels = get_syn_meas(emb, mob, n=1000)
print('{:<15}{:<15.3f}{:<15.3f}'.format('arxiv_abs_lem', np.mean(syns), np.mean(rels)))

glove-slim     1.349          1.474          
word2vec-slim  1.749          2.074          
arxiv_abs      0.362          0.628          
nyt-science    0.572          0.715          
arxiv_abs_lem  0.466          0.650          


'poetry', 'australia', 'joyce', 'oneb', 'dickens', 'law', 'joyce-dep', 'merge-science-big', 'darwin100dep', 'darwin_0', 'darwin', 'darwin_1', 'gandhi', 'darwin_3', 'nyt-science', 'darwin_2', 'darwin_4', 'aretha', 'sherlock', 'arxiv_abs', 'merge-science-small', 'glove-slim