We will download the pre-trained GloVe word vectors (`glove.6B.zip`) from https://nlp.stanford.edu/projects/glove/

In [None]:
!pip install --user gensim
!pip install --user spacy

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
!python -m spacy download en_vectors_web_lg

After the downloads and installs have finished, restart the kernel so that the newly installed packages and vectors can be imported.

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import gensim
import spacy
from scipy.spatial import distance
from sklearn.decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn below.
plt.style.use('ggplot')

In [None]:
# Convert the GloVe text file (glove.6B.50d.txt, extracted from the archive)
# into word2vec text format so KeyedVectors.load_word2vec_format can read it.
# The converted copy is written to the path returned by gensim's get_tmpfile.
glove_file = 'glove.6B.50d.txt'
word2vec_glove_file = get_tmpfile("glove.6B.50d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

In [None]:
# Load the converted vectors; `model` is the gensim model used by the cells below.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

In [None]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ?" with the GloVe vectors.

    Queries the module-level gensim ``model`` with ``y1`` and ``x2`` as
    positive terms and ``x1`` as the negative term, then returns the word of
    the first (highest-ranked) match.
    """
    ranked = model.most_similar(negative=[x1], positive=[y1, x2])
    top_match = ranked[0]
    # Each match is indexed as (word, ...); only the word is returned.
    return top_match[0]

In [None]:
# france : paris :: russia : ?
analogy('france', 'paris', 'russia')

In [None]:
# sweden : stockholm :: norway : ?
analogy('sweden', 'stockholm', 'norway')

In [None]:
# germany : berlin :: austria : ?
analogy('germany', 'berlin', 'austria')

In [None]:
def display_pca_scatterplot(model, words=None, sample=0):
    """Plot words in the plane of the first two principal components of their vectors.

    Args:
        model: gensim word-vector model, indexed as ``model[word]`` to get a vector.
        words: Sequence of words to plot. Consecutive words (1st-2nd, 3rd-4th, ...)
            are joined by a line, so pass them as pairs; an unpaired final word
            is plotted without a line. When ``None``, words are taken from the
            model vocabulary instead.
        sample: Only used when ``words`` is ``None``. If positive, that many
            vocabulary words are drawn with ``np.random.choice``; otherwise the
            whole vocabulary is plotted.
    """
    # `is None` rather than `== None`: comparing a NumPy array of words with
    # `==` yields an array, whose truth value is ambiguous.
    if words is None:
        # gensim >= 4 exposes the vocabulary mapping as `key_to_index`;
        # older releases expose it as `vocab`.
        vocab = getattr(model, 'key_to_index', None)
        if vocab is None:
            vocab = model.vocab
        if sample > 0:
            words = np.random.choice(list(vocab.keys()), sample)
        else:
            words = [word for word in vocab]
    word_vectors = np.array([model[w] for w in words])
    twodim = PCA().fit_transform(word_vectors)[:, :2]
    plt.figure(figsize=(10, 10))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    # Stop one short of the end so that an odd number of words does not index
    # one row past the last point.
    for first in range(0, twodim.shape[0] - 1, 2):
        second = first + 1
        plt.plot([twodim[first, 0], twodim[second, 0]],
                 [twodim[first, 1], twodim[second, 1]],
                 linewidth=1., c='b')
    for word, (x, y) in zip(words, twodim):
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
# Country/capital pairs: the plot joins each consecutive pair of words with a line.
display_pca_scatterplot(model, ['france', 'paris', 'russia', 'moscow', 'finland', 'helsinki', 'sweden', 'stockholm', 'nepal', 'kathmandu', 'germany', 'berlin'])

In [None]:
# Field/practitioner pairs: the plot joins each consecutive pair of words with a line.
display_pca_scatterplot(model, ['science', 'scientist', 'physics', 'physicist', 'chemistry', 'chemist', 'biology', 'biologist', 'mathematics', 'mathematician', 'art', 'painter'])

In [None]:
# Load the spaCy vectors package fetched by the `spacy download` cell above.
nlp_lg = spacy.load("en_vectors_web_lg")

In [None]:
def spacy_analogy(nlp, w1, w2, w3):
    """Return the vector for "w1 is to w2 as w3 is to ?".

    Each word is passed through ``nlp`` and its ``.vector`` is read; the
    result is ``vector(w2) - vector(w1) + vector(w3)``.
    """
    target_vec = nlp(w2).vector
    source_vec = nlp(w1).vector
    query_vec = nlp(w3).vector
    offset = target_vec - source_vec
    return offset + query_vec

In [None]:
# Prepare the vocabulary for the distance function: `ids` holds every key of
# the spaCy vector table and `vectors` stacks the matching vectors, so that
# row i of `vectors` belongs to ids[i].
ids = list(nlp_lg.vocab.vectors.keys())
vectors = np.array([nlp_lg.vocab.vectors[key] for key in ids])

In [None]:
def analogy(nlp, w1, w2, w3, metric='cosine', rank=3):
    """Complete the analogy "w1 is to w2 as w3 is to ?" with the spaCy vectors.

    Note: this reuses the name of the gensim-based ``analogy`` defined earlier
    in the notebook and replaces it once this cell has run.

    Args:
        nlp: spaCy pipeline; passed to ``spacy_analogy`` and used to map the
            selected vector key back to its text.
        w1, w2, w3: Words of the analogy.
        metric: Metric name handed to ``scipy.spatial.distance.cdist``
            (for example ``'cosine'`` or ``'euclidean'``).
        rank: Zero-based position, in order of increasing distance, of the
            vocabulary entry to return. The default of 3 skips the three
            closest entries (presumably the query words and their case
            variants -- confirm for your vocabulary).

    Returns:
        The lower-cased text of the selected vocabulary entry.

    Relies on the module-level ``ids`` and ``vectors`` built from the spaCy
    vector table.
    """
    target = spacy_analogy(nlp, w1, w2, w3)
    # cdist works on 2-D inputs, so wrap the single query vector in a 1-row array.
    dist_to_vocab = distance.cdist(np.array([target]), vectors, metric)
    # Row 0 holds the vocabulary indices ordered from nearest to farthest.
    nearest_first = np.argsort(dist_to_vocab)
    word_id = ids[nearest_first[0][rank]]
    return nlp.vocab[word_id].text.lower()

In [None]:
# france : paris :: russia : ?  (spaCy vectors)
analogy(nlp_lg, 'france', 'paris', 'russia')

In [None]:
# finland : helsinki :: sweden : ?  (spaCy vectors)
analogy(nlp_lg, 'finland', 'helsinki', 'sweden')

In [None]:
# switzerland : bern :: germany : ?  (spaCy vectors)
analogy(nlp_lg, 'switzerland', 'bern', 'germany')

In [None]:
def display_spacy_pca_scatterplot(model, words=None, sample=0):
    """Plot words in the plane of the first two principal components of their spaCy vectors.

    Args:
        model: spaCy pipeline; called as ``model(word).vector`` to get a vector.
        words: Sequence of words to plot. Consecutive words (1st-2nd, 3rd-4th, ...)
            are joined by a line, so pass them as pairs; an unpaired final word
            is plotted without a line. When ``None``, words are taken from
            ``model.vocab.strings`` instead.
        sample: Only used when ``words`` is ``None``. If positive, that many
            strings are drawn with ``np.random.choice``; otherwise every string
            in ``model.vocab.strings`` is plotted.
    """
    # `is None` rather than `== None`: comparing a NumPy array of words with
    # `==` yields an array, whose truth value is ambiguous.
    if words is None:
        if sample > 0:
            # spaCy's Vocab has no keys() method; sample from its string
            # store, the same source the whole-vocabulary branch uses.
            words = np.random.choice(list(model.vocab.strings), sample)
        else:
            words = [word for word in model.vocab.strings]
    word_vectors = np.array([model(w).vector for w in words])
    twodim = PCA().fit_transform(word_vectors)[:, :2]
    plt.figure(figsize=(10, 10))
    plt.scatter(twodim[:, 0], twodim[:, 1], edgecolors='k', c='r')
    # Stop one short of the end so that an odd number of words does not index
    # one row past the last point.
    for first in range(0, twodim.shape[0] - 1, 2):
        second = first + 1
        plt.plot([twodim[first, 0], twodim[second, 0]],
                 [twodim[first, 1], twodim[second, 1]],
                 linewidth=1., c='b')
    for word, (x, y) in zip(words, twodim):
        plt.text(x + 0.05, y + 0.05, word)

In [None]:
# Country/capital pairs: the plot joins each consecutive pair of words with a line.
display_spacy_pca_scatterplot(nlp_lg, ['france', 'paris', 'russia', 'moscow', 'finland', 'helsinki', 'sweden', 'stockholm', 'armenia', 'yerevan', 'switzerland', 'bern', 'germany', 'berlin'])

In [None]:
# Field/practitioner pairs: the plot joins each consecutive pair of words with a line.
display_spacy_pca_scatterplot(nlp_lg, ['science', 'scientist', 'physics', 'physicist', 'biology', 'biologist', 'mathematics', 'mathematician', 'art', 'artist'])

- Play around with the analogies, add some more countries/capitals, see if they are still following the pattern.
- In the spaCy analogy, change the distance metric from cosine to Euclidean. What's the capital of Germany now?
- Think of other types of analogies, say syntactic ones this time (past vs. present tense of verbs, singular vs. plural, etc.). See if the relationships hold.
- Homework: Try to visualize three-way relationships, like (take, took, taken), (shake, shook, shaken), etc.