In [1]:
import os
os.chdir('../histwords_py3')
from representations.sequentialembedding import SequentialEmbedding

"""
Example showing how to load a series of historical embeddings and compute similarities over time.
Warning: loading all the embeddings into main memory can take a lot of RAM.
"""

if __name__ == "__main__":
    # Load one embedding per decade start year: 1950, 1960, 1970, 1980, 1990.
    fiction_embeddings = SequentialEmbedding.load(
        "../coha-lemma_sgns/sgns",
        range(1950, 2000, 10),
    )

    # Per-decade similarity of the two words, reported one line per year.
    time_sims = fiction_embeddings.get_time_sims("lesbian", "gay")
    print("Similarity between gay and lesbian drastically increases from 1950s to the 1990s:")
    for year, sim in time_sims.items():
        print("{year:d}, cosine similarity={sim:0.2f}".format(year=year, sim=sim))

# Look up the vector for "frequency" in the 1950 embedding.
# NOTE: this runs outside the __main__ guard but needs `fiction_embeddings` from inside it.
word = "frequency"
embeds = fiction_embeddings.get_embed(1950)
v_word = embeds[word]

Loading SequentialEmbedding from ../coha-lemma_sgns/sgns years=range(1950, 2000, 10)
Loading Embedding from ../coha-lemma_sgns/sgns/1950 normalize=True
Loading pickle file ../coha-lemma_sgns/sgns/1950-vocab.pkl
Loading Embedding from ../coha-lemma_sgns/sgns/1960 normalize=True
Loading pickle file ../coha-lemma_sgns/sgns/1960-vocab.pkl
Loading Embedding from ../coha-lemma_sgns/sgns/1970 normalize=True
Loading pickle file ../coha-lemma_sgns/sgns/1970-vocab.pkl
Loading Embedding from ../coha-lemma_sgns/sgns/1980 normalize=True
Loading pickle file ../coha-lemma_sgns/sgns/1980-vocab.pkl
Loading Embedding from ../coha-lemma_sgns/sgns/1990 normalize=True
Loading pickle file ../coha-lemma_sgns/sgns/1990-vocab.pkl
Similarity between gay and lesbian drastically increases from 1950s to the 1990s:
1950, cosine similarity=0.00
1960, cosine similarity=0.00
1970, cosine similarity=0.00
1980, cosine similarity=0.00
1990, cosine similarity=0.56


In [59]:
import numpy as np

# Gather the vector of `word` from the 1980 and 1990 embeddings.
word = "frequency"
embs = []
for y in range(1980, 2000, 10):
    embeds = fiction_embeddings.get_embed(y)
    embs.append(np.array(embeds[word]))

# Euclidean distance between the word's vectors in consecutive decades.
for i, (earlier_vec, later_vec) in enumerate(zip(embs, embs[1:])):
    print(np.linalg.norm(earlier_vec - later_vec))

# For comparison: distance between two different words within the 1990 embedding.
y = 1990
animal_vec = np.array(fiction_embeddings.get_embed(y)['animal'])
frequency_vec = np.array(fiction_embeddings.get_embed(y)['frequency'])
print(np.linalg.norm(animal_vec - frequency_vec))

0.776406466778001
1.3021957925612242


In [110]:
# Leftover inspection: the value is discarded because this is not the cell's
# last expression. It reads `embeds` as left in the kernel by an earlier cell.
type(embeds.m)
import numpy as np
import random
import scipy as sp

# Rebind `embeds` to the 1960 embedding; the code below in this cell uses it.
embeds = fiction_embeddings.get_embed(1960)

def synonymity(e, w, sigma=0.4, limit=10):
    """Score how close word ``w`` lies to the leading rows of an embedding matrix.

    The cosine distance between ``e.represent(w)`` and each of the first
    ``limit`` rows of ``e.m`` is computed, distances that come out as NaN are
    dropped, each remaining distance is passed through a zero-mean Gaussian
    density of width ``sigma``, and the densities are summed. The sum is then
    divided by ``sigma ** 2.8``.

    Parameters
    ----------
    e : object
        Must expose ``represent(word)`` returning a vector and ``m``, a matrix
        that can be sliced and iterated row by row (presumably a histwords
        embedding -- confirm against the caller).
    w : str
        Word handed to ``e.represent``.
    sigma : float, optional
        Width of the Gaussian weighting. Defaults to 0.4, the value that was
        previously hard-coded.
    limit : int or None, optional
        Number of leading rows of ``e.m`` to compare against. Defaults to 10,
        the previously hard-coded cap; pass ``None`` to use every row.

    Returns
    -------
    float
        The normalised density sum; 0.0 when no usable distance was found.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to make `scipy.spatial` available on older SciPy releases.
    from scipy.spatial.distance import cosine

    target_vec = e.represent(w)
    candidate_rows = e.m if limit is None else e.m[:limit]

    distances = np.array(
        [cosine(row_vec, target_vec) for row_vec in candidate_rows],
        dtype=float,
    )
    # NaN distances are skipped, exactly as before (presumably produced by
    # all-zero vectors -- confirm).
    distances = distances[~np.isnan(distances)]

    # Gaussian density evaluated on the whole array at once; np.vectorize is
    # unnecessary because the expression is already element-wise.
    densities = np.exp(-(distances ** 2) / (2 * sigma ** 2)) / (sigma * np.sqrt(2 * np.pi))

    # The 2.8 exponent is kept from the original code; its derivation is not
    # visible here -- TODO confirm.
    return np.sum(densities) / sigma ** 2.8

# Random spot-check over the vocabulary. range(0) yields nothing, so this
# loop body never runs as written.
for i in range(0):
    # Redraw until the sampled word's vector has a norm above 0.001.
    while True:
        word = embeds.iw[random.randrange(50000)]
        wembed = embeds.represent(word)
        if not (np.linalg.norm(wembed) <= 0.001):
            break
    syn = synonymity(embeds, word)
    print(word.ljust(15), ':\t', syn)
    # Print words scoring below 1 a second time on their own line.
    if syn < 1:
        print(word)

# Score a fixed list of words.
for word in ('the',):
    print(word.ljust(15), ':\t', synonymity(embeds, word))
    


the             :	 38.96578150776602
