# Stability of word embeddings

Cite: [Evaluating the Stability of Embedding-based Word Similarities](https://transacl.org/ojs/index.php/tacl/article/view/1202/286)

TODO: investigate whether Jaccard similarity correlates with PIP loss.

In [1]:
import sys
sys.executable

'/Users/katy/Documents/Grad/thesaurusx/env/bin/python'

In [8]:
import gensim
import numpy as np
import os
import random
import string

from time import time

## Jaccard similarity

Measure the overlap between the top-10 most similar words of each keystone word across different embeddings.

In [5]:

def get_jaccard(embeddings, keystones, topn=10):
    """Return dict of keystones: jaccard sim.

    For every keystone, the ``topn`` most similar words are fetched from each
    embedding. The Jaccard similarity (number of intersecting words / number
    of unioned words) is computed for every unordered pair of embeddings and
    the pairwise values are averaged.

    :param embeddings: list of gensim embedding objects (at least two)
    :param keystones: list of words in vocab to calculate sim on; repeated
        words are only evaluated once
    :param topn: how many nearest neighbours to compare per keystone
    :return jac_sims: dict where each key is a keystone, each value is the avg jaccard sim
    :raises ValueError: if fewer than two embeddings are given (there would be
        no pair to compare)
    """
    from itertools import combinations

    embeddings = list(embeddings)
    if len(embeddings) < 2:
        raise ValueError(
            'need at least two embeddings to compare, got {}'.format(len(embeddings))
        )

    jac_sims = {}
    for key in keystones:
        # A repeated keystone must not contribute extra neighbour sets:
        # comparing an embedding with itself would inflate the average.
        if key in jac_sims:
            continue
        neighbours = [
            {w for w, _ in vectors.most_similar(key, topn=topn)}
            for vectors in embeddings
        ]
        jaccard_sims = [
            len(la & lo) / len(la | lo)
            for la, lo in combinations(neighbours, 2)
        ]
        jac_sims[key] = sum(jaccard_sims) / len(jaccard_sims)
    return jac_sims

def load_emb(fnames):
    """Load every word2vec-format file in ``fnames`` and return the gensim objects as a list.

    Prints a '.' after each file has been loaded and 'done' once all files
    are in, as a simple progress indicator.
    """
    loaded = []
    for path in fnames:
        loaded.append(gensim.models.KeyedVectors.load_word2vec_format(path))
        print('.', end=' ')
    print('done')
    return loaded

# Loading is commented out here; `oneb` must already exist in the session
# for the call below to run.
# darwin = load_emb(['../dat/vecs/{}_{}.txt'.format('darwin', i) for i in range(5)])
# oneb = load_emb(['../dat/vecs/{}_{}.txt'.format('oneb', i) for i in range(3)])
keystones = ['darwin', 'observations', 'books', 'open', 'variation', 'cat', 'facts']

# Average pairwise Jaccard similarity of each keystone's neighbours across the oneb embeddings.
jac_sims = get_jaccard(oneb, keystones)
for key, sim in jac_sims.items():
    print(key, sim)

  if np.issubdtype(vec.dtype, np.int):


darwin 0.3127917833800187
observations 0.8787878787878789
books 0.5445665445665445
open 0.5384615384615384
variation 0.7676767676767677
cat 0.6666666666666666
facts 0.6744366744366744


In [151]:
# Load the four scifi_100kb embedding files (indices 0-3) from ../dat/vecs.
scifi = load_emb(['../dat/vecs/{}_{}.txt'.format('scifi_100kb', i) for i in range(4)])

. . . . done


In [152]:
def load_macthes(filepath='../dat/other/mac_thes.txt'):
    """Return dict of word: set of synonyms.

    Each non-blank line is expected to look like
    ``<headword> <pos>:<synonym>, <synonym>, ...``.
    This thesaurus is split by pos; ignore for now and throw all
    synonyms into same entry.

    :param filepath: path to the thesaurus text file (read as UTF-8)
    :return thes: dict mapping each headword to the set of its synonyms,
        merged over all parts of speech
    :raises ValueError: if a non-blank line has no ':' separator
    """
    thes = dict()
    with open(filepath, 'r', encoding='utf-8') as fle:
        for lineno, line in enumerate(fle, 1):
            line = line.strip()
            if not line:
                continue
            # Split on the first ':' only, so a ':' inside the synonym list
            # does not break the parse.
            entry, sep, words = line.partition(':')
            if not sep:
                raise ValueError(
                    "{}:{}: expected '<word> <pos>:<synonyms>', got {!r}".format(
                        filepath, lineno, line
                    )
                )
            # The pos tag is the last space-separated token; the headword
            # itself may contain spaces.
            key = entry.strip().rsplit(' ', 1)[0]
            synonyms = {w.strip() for w in words.split(',')}
            synonyms.discard('')
            thes.setdefault(key, set()).update(synonyms)
    return thes

ogthes = load_macthes()
# Re-key the thesaurus with hyphens removed from headwords (presumably so
# they line up with the embedding tokens -- confirm against the tokenizer).
# When the de-hyphenated form collides with an existing headword the synonym
# sets are merged instead of one silently overwriting the other, and each
# value is a fresh set so `ogthes` is never mutated through `thes`.
thes = {}
for k in ogthes:
    w = k.replace('-', '')
    thes.setdefault(w, set()).update(ogthes[k])

# Headwords that also appear in the vocab of the first scifi embedding.
shared_vocab = set.intersection(set(thes.keys()), get_vocab(scifi[0]))
print(list(shared_vocab)[:10])
len(shared_vocab)

['prank', 'crowded', 'trappings', 'ceasefire', 'demonstrate', 'hymn', 'maniac', 'beginning', 'plenty', 'preconceived']


12440

In [153]:
# random.sample requires a sequence: passing a set is deprecated since
# Python 3.9 and raises TypeError from 3.11. Sorting also makes the draw
# reproducible under a fixed seed. The sample size is capped so a small
# shared vocab does not raise ValueError.
local_vocab = random.sample(sorted(shared_vocab), min(5000, len(shared_vocab)))
l = len(local_vocab)
print(l)
jac_sims = get_jaccard(scifi, list(local_vocab))

5000


In [154]:
# Build the array straight from jac_sims so its length always matches the
# dict, rather than relying on the global `l` set in another cell.
jac_sims_array = np.array(list(jac_sims.values()), dtype=float)
# Mean and standard deviation of the per-word Jaccard similarities.
np.mean(jac_sims_array), np.std(jac_sims_array)

(0.3308662354501364, 0.23359305955766385)

In [133]:
# Pick a random thesaurus headword that is missing from the first oneb
# embedding's vocab. Collecting the candidates up front avoids rebuilding
# the key list on every draw and cannot loop forever when no such word exists.
oneb_vocab = oneb[0].vocab
missing = [word for word in thes if word not in oneb_vocab]
if not missing:
    raise ValueError('every thesaurus headword is in the oneb vocab')
w = random.choice(missing)
in_vocab = False
w, w in oneb_vocab

('gushing,', False)

In [4]:
# Load three embedding files each (indices 0-2) for the 'joyce' and
# 'joyce_lemma' file prefixes from ../dat/vecs.
joyce = load_emb(['../dat/vecs/{}_{}.txt'.format('joyce', i) for i in range(3)])
joyce_lemma = load_emb(['../dat/vecs/{}_{}.txt'.format('joyce_lemma', i) for i in range(3)])

. . . done
. . . done


In [4]:
# Keystone words to probe in both joyce embedding sets.
keystones = ['love', 'uniform', 'labour', 'open', 'receive', 'cat', 'fact']

# Average pairwise Jaccard similarity per keystone, for joyce and joyce_lemma.
jac_sims = get_jaccard(joyce, keystones)
jac_sims_l = get_jaccard(joyce_lemma, keystones)
# Print a left-aligned, 15-character-wide table: keystone, joyce score, joyce_lemma score.
print('{:<15}{:<15}{:<15}'.format('key', 'reg emb', 'lemma emb'))
for key in keystones:
    print('{:<15}{:<15.3f}{:<15.3f}'.format(key, jac_sims[key], jac_sims_l[key]))

key            reg emb        lemma emb      
love           0.133          0.337          
uniform        0.000          0.035          
labour         0.000          0.000          
open           0.055          0.306          
receive        0.035          0.092          
cat            0.000          0.055          
fact           0.204          0.113          


In [143]:
def get_vocab(emb):
    """Return vocab of gensim emb object as set.

    Works with both gensim generations: gensim >= 4 exposes the vocabulary
    as ``key_to_index`` (``.vocab`` was removed there), while gensim 3.x only
    has ``.vocab``.

    :param emb: gensim embedding object
    :return: set of all words in the embedding's vocabulary
    """
    vocab = getattr(emb, 'key_to_index', None)
    if vocab is None:
        vocab = emb.vocab
    return set(vocab)

def compare_jaccard(emb1, emb2, thes=None):
    """Return arrays of jaccard sim for all shared vocab in two sets of emb files.

    The shared vocab is the intersection of the vocab of the first embedding
    in each set. If a non-empty thes is given, it is further restricted to
    words that are keys of thes. The size of the shared vocab is printed.

    vocab in all emb1 embeddings should be the same
    vocab in all emb2 embeddings should be the same

    :param emb1: list of gensim embedding objs from same data + algo (diff runs)
    :param emb2: like above, but different algo (same data! diff runs)
    :param thes: optional dict whose keys limit the words that are compared
    :return: two arrays (one for emb1, one for emb2) holding the jaccard sim
        of each shared word, in the same word order
    """
    vocab_sets = [get_vocab(emb1[0]), get_vocab(emb2[0])]
    if thes:
        vocab_sets.append(set(thes.keys()))
    shared = list(set.intersection(*vocab_sets))
    print('shared vocab', len(shared))

    jac1_d = get_jaccard(emb1, shared)
    jac2_d = get_jaccard(emb2, shared)

    # Both arrays follow the order of `shared`, so position i refers to the
    # same word in each.
    jac1_a = np.array([jac1_d[word] for word in shared], dtype=float)
    jac2_a = np.array([jac2_d[word] for word in shared], dtype=float)
    return jac1_a, jac2_a

In [138]:
# Load three embedding files each (indices 0-2) for the 'arxiv_abs' and
# 'arxiv_abs_lemma' file prefixes from ../dat/vecs.
arxiv = load_emb(['../dat/vecs/{}_{}.txt'.format('arxiv_abs', i) for i in range(3)])
arxiv_lemma = load_emb(['../dat/vecs/{}_{}.txt'.format('arxiv_abs_lemma', i) for i in range(3)])

. . . done
. . . done


In [144]:
# Per-word Jaccard stability for arxiv vs. arxiv_lemma, restricted to words
# that are thesaurus headwords; prints "mean (std)" for each set.
reg, lem = compare_jaccard(arxiv, arxiv_lemma, thes)
print('{:.3f} ({:.3f}), {:.3f} ({:.3f})'.format(np.mean(reg), np.std(reg), np.mean(lem), np.std(lem)))

shared vocab 4854
0.317 (0.265), 0.372 (0.262)


In [52]:
# Same comparison over the full shared vocab (no thesaurus filter);
# prints "mean (std)" for arxiv, then arxiv_lemma.
areg, alem = compare_jaccard(arxiv, arxiv_lemma)
print('{:.3f} ({:.3f}), {:.3f} ({:.3f})'.format(np.mean(areg), np.std(areg), np.mean(alem), np.std(alem)))

shared vocab 14377
0.239 (0.241), 0.268 (0.241)


In [12]:
def get_piploss(embeddings):
    """Return array of pip losses between all pairs of embeddings.

    For each embedding a matrix of unit-length word vectors is built (one row
    per word in the vocab of the first embedding) and multiplied with its own
    transpose to give the PIP matrix. Each PIP matrix is written to a
    temporary ``.npy`` file so that at most two of them are loaded at once.
    The temporary files are removed before returning. The loss of a pair is
    the Frobenius norm of the difference of their PIP matrices.

    Prints the vocab size and the elapsed minutes of each step ('A' = build
    one PIP matrix, 'B' = compare one pair).

    :param embeddings: list of gensim emb objs; should have same vocab
    :return piploss: array of pip loss values, ordered by pair
        (1, 0), (2, 0), (2, 1), ...
    """
    import tempfile

    # Fix the word order once so that row k is the same word in every matrix.
    shared = sorted(get_vocab(embeddings[0]))
    print('vocab:', len(shared))

    piploss = []
    with tempfile.TemporaryDirectory() as tmpdir:
        paths = [
            os.path.join(tmpdir, 'tmp_{}.npy'.format(i))
            for i in range(len(embeddings))
        ]

        for vec, path in zip(embeddings, paths):
            t0 = time()
            # Use the embedding's own dimensionality instead of probing a
            # hard-coded word that may be missing from the vocab.
            m = np.zeros((len(shared), vec.vector_size))
            for j, word in enumerate(shared):
                v = vec[word]
                m[j, :] = v / np.sqrt((np.sum(v**2)))
            pip = np.matmul(m, m.transpose())
            np.save(path, pip)
            # Drop the vocab x vocab matrix before building the next one.
            del pip
            print('A: took (min)', (time()-t0)/60)

        for i in range(1, len(embeddings)):
            # Load the i-th matrix once and reuse it for every j < i.
            pip1 = np.load(paths[i])
            for j in range(i):
                t0 = time()
                pip2 = np.load(paths[j])
                piploss.append(np.linalg.norm(pip1 - pip2))
                print('B: took (min)', (time()-t0)/60)
    return np.array(piploss)

In [18]:
# Compute the pairwise PIP losses for the joyce embeddings and report the
# wall-clock time in minutes.
t0 = time()
j_pip = get_piploss(joyce)
print('took (min)', (time()-t0)/60)
# Same for the joyce_lemma embeddings.
t0 = time()
jl_pip = get_piploss(joyce_lemma)
print('took (min)', (time()-t0)/60)

vocab: 7084
A: took (min) 0.01859915256500244
A: took (min) 0.018233267466227214
A: took (min) 0.01624951362609863
B: took (min) 0.014559221267700196
B: took (min) 0.022881698608398438
B: took (min) 0.02160645325978597
took (min) 0.11323598225911459
vocab: 6094
A: took (min) 0.01271958351135254
A: took (min) 0.016518636544545492
A: took (min) 0.013743070761362712
B: took (min) 0.009520856539408366
B: took (min) 0.013298630714416504
B: took (min) 0.014247620105743408
took (min) 0.08113964796066284


In [19]:
# Normalise the PIP losses by vocab size. The results go into separate `_n`
# names so the raw losses are kept and re-running this cell does not divide
# a second time.
j_pip_n = j_pip / len(get_vocab(joyce[0]))
jl_pip_n = jl_pip / len(get_vocab(joyce_lemma[0]))
print('{:.3f} ({:.3f}), {:.3f} ({:.3f})'.format(np.mean(j_pip_n), np.std(j_pip_n), np.mean(jl_pip_n), np.std(jl_pip_n)))

0.034 (0.001), 0.043 (0.001)


In [14]:
t0 = time()
j_pip = get_piploss(arxiv)
print('took (min)', (time()-t0)/60)

vocab: 23299
A: took (min) 0.4377033829689026
A: took (min) 0.34359745581944784
A: took (min) 0.34090091784795123
B: took (min) 1.6022106488545735
B: took (min) 1.6592345317204793
B: took (min) 1.7088287830352784
took (min) 6.103492780526479


In [15]:
t0 = time()
al_pip = get_piploss(arxiv_lemma)
print('took (min)', (time()-t0)/60)

vocab: 17455
A: took (min) 0.09230528275171916
A: took (min) 0.11323066552480061
A: took (min) 0.1390928308169047
B: took (min) 0.6240556001663208
B: took (min) 0.7902705033620199
B: took (min) 0.6391923467318217
took (min) 2.408839929103851


In [None]:
# NOTE(review): get_piploss is also run on both arxiv and arxiv_lemma in the
# earlier timed cells; this cell repeats that work (several minutes per call
# according to the recorded timings).
a_pip = get_piploss(arxiv)
al_pip = get_piploss(arxiv_lemma)

In [17]:
# Normalise the PIP losses by vocab size and print "mean (std)" for arxiv,
# then arxiv_lemma.
a_pip_n = a_pip / len(get_vocab(arxiv[0]))
al_pip_n = al_pip / len(get_vocab(arxiv_lemma[0]))
print('{:.3f} ({:.3f}), {:.3f} ({:.3f})'.format(np.mean(a_pip_n), np.std(a_pip_n), np.mean(al_pip_n), np.std(al_pip_n)))

0.088 (0.001), 0.079 (0.002)
