# Stability of word embeddings

Cite: [Evaluating the Stability of Embedding-based Word Similarities](https://transacl.org/ojs/index.php/tacl/article/view/1202/286)

In [1]:
import sys
# Display the path of the Python interpreter running this notebook.
sys.executable

'/Users/katy/Documents/Grad/thesaurusx/env/bin/python'

In [7]:
import gensim
import os
import string

from time import time

In [12]:
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)


class MySentences(object):
    """Stream tokenized sentences from every file in a directory.

    Each line of each file is treated as one sentence: it is lower-cased,
    stripped of ASCII punctuation and split on whitespace. Files are read
    lazily, one line at a time, and a fresh pass over the directory starts
    every time the object is iterated.
    """

    def __init__(self, dirname):
        # Directory whose entries are all opened as text files.
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of tokens per line, file by file."""
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # `with` closes each file as soon as it is exhausted (or the
            # generator is closed) instead of leaking the handle until
            # garbage collection. Undecodable bytes are silently dropped.
            with open(path, errors='ignore') as handle:
                for line in handle:
                    yield line.lower().translate(table).split()


def make_embeddings(name, suffix=''):
    """Train a Word2Vec model on the corpus in ``dat/<name>`` and save it.

    Args:
        name: Name of the corpus directory under ``dat/``; every entry in
            that directory is read as a text file.
        suffix: Text appended to ``name`` in the output file name, so that
            several runs over the same corpus can be stored side by side.

    Side effects:
        Prints the output path with a rough word count, writes the vectors
        in word2vec text format to ``tmp/<name><suffix>.txt`` and prints
        the elapsed training-plus-saving time.
    """
    data = 'dat/' + name
    filepath = 'tmp/{}{}.txt'.format(name, suffix)

    # Rough corpus size. Lines are split on single spaces only, so this is
    # a raw count and not the number of tokens the model is trained on.
    words = 0
    for fname in os.listdir(data):
        # Close each file deterministically instead of leaking the handle.
        with open(os.path.join(data, fname), errors='ignore') as handle:
            words += sum(len(line.split(' ')) for line in handle)
    print(filepath, ":", words, 'words')

    # Saving fails if the output directory is missing, so create it up front.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    start = time()
    sentences = MySentences(data)  # re-iterable, reads the corpus lazily
    # NOTE(review): `size` is the gensim < 4.0 keyword; gensim 4 renamed it
    # to `vector_size` - confirm which version this environment pins.
    model = gensim.models.Word2Vec(sentences, size=100, window=5)
    model.wv.save_word2vec_format(filepath)
    print("took {:<2f} seconds".format(time() - start))


In [13]:
# Train and save five embeddings of the 'darwin' corpus, one output file
# per run (suffixes _0 .. _4).
for run in range(5):
    run_suffix = '_{}'.format(run)
    make_embeddings('darwin', suffix=run_suffix)

tmp/darwin_0.txt : 2944538 words
took 14.944857 seconds
tmp/darwin_1.txt : 2944538 words
took 14.825755 seconds
tmp/darwin_2.txt : 2944538 words
took 14.348472 seconds
tmp/darwin_3.txt : 2944538 words
took 15.933205 seconds
tmp/darwin_4.txt : 2944538 words
took 14.396053 seconds


In [20]:
# Query words whose nearest neighbours are compared across the saved models.
keystones = ['darwin', 'observations', 'books', 'open', 'variation', 'cat', 'facts']

# Maps each keystone to a list of neighbour sets, one set per model, in run order.
data = {}
for i in range(5):
    vectors = gensim.models.KeyedVectors.load_word2vec_format('tmp/{}_{}.txt'.format('darwin', i))
    for key in keystones:
        # Keep only the ten neighbour words; their similarity scores are dropped.
        neighbours = {w for w, _ in vectors.most_similar(key, topn=10)}
        data.setdefault(key, []).append(neighbours)


In [26]:
# For every keystone, print the mean Jaccard similarity over all unordered
# pairs of neighbour sets collected for it.
for key in keystones:
    neighbour_sets = data[key]
    jaccard_sims = [
        len(first & second) / len(first | second)
        for idx, first in enumerate(neighbour_sets)
        for second in neighbour_sets[idx + 1:]
    ]
    mean_sim = sum(jaccard_sims) / len(jaccard_sims)
    print(key.upper(), ':', mean_sim)
    

DARWIN : 0.5829170829170829
OBSERVATIONS : 0.7727272727272728
BOOKS : 0.18464052287581703
OPEN : 0.6242091242091241
VARIATION : 0.5957375957375957
CAT : 0.5514152514152514
FACTS : 0.8575757575757577
