# Word Embedding


![](http://i.imgur.com/agTBWiT.png)

In [1]:
import torch

## Word2Vec (2013)

[Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf)

by Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean

![](https://ascelibrary.org/cms/attachment/83d45b70-be2d-4dae-a37a-e3b51af0b7c4/figure3.jpg)

* CBOW: guessing the blank
* Skip-gram: guessing the neighbors

In [2]:
NANO_CORPUS = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

In [3]:
corpus = NANO_CORPUS.lower().replace(',', ' ').replace('.', ' ').split()
print(corpus)

['we', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process', 'computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers', 'as', 'they', 'evolve', 'processes', 'manipulate', 'other', 'abstract', 'things', 'called', 'data', 'the', 'evolution', 'of', 'a', 'process', 'is', 'directed', 'by', 'a', 'pattern', 'of', 'rules', 'called', 'a', 'program', 'people', 'create', 'programs', 'to', 'direct', 'processes', 'in', 'effect', 'we', 'conjure', 'the', 'spirits', 'of', 'the', 'computer', 'with', 'our', 'spells']


In [4]:
import pandas as pd

vocabulary = list(set(corpus))
vocabulary

['a',
 'rules',
 'conjure',
 'data',
 'things',
 'pattern',
 'called',
 'computers',
 'program',
 'beings',
 'effect',
 'is',
 'with',
 'about',
 'are',
 'other',
 'by',
 'the',
 'idea',
 'direct',
 'spells',
 'process',
 'inhabit',
 'computational',
 'evolve',
 'our',
 'as',
 'to',
 'programs',
 'manipulate',
 'of',
 'study',
 'create',
 'in',
 'abstract',
 'spirits',
 'directed',
 'computer',
 'we',
 'they',
 'people',
 'evolution',
 'processes',
 'that']

Before we begin anything, we need to create a one-hot vector of the words. Pandas is great at this.

In [5]:
one_hot = pd.get_dummies(vocabulary)
one_hot['about']

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    1
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
Name: about, dtype: uint8

### Continuous Bag-Of-Words

In [6]:
EMBEDDING_SIZE = 128

class CBOW(torch.nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        
        self.embeddings = torch.FloatTensor(len(vocabulary), EMBEDDING_SIZE).normal_()
        self.linear1 = torch.nn.Linear(EMBEDDING_SIZE, 128)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, len(vocabulary))
    
    def forward(self, x):
        x = torch.sum(self.embeddings * x.sum(dim=0).view(-1, 1), dim=0) # (4, 44) -> (1, 44) -> (44, 1) -> (128)
        x = self.linear1(x) # (128)
        x = self.relu1(x) # (128)
        x = self.linear2(x) # (44)
        
        return x.view(1, -1) # (1,44)
    
    def get_word_embedding(self, word):
        return self.embeddings[vocabulary.index(word)].view(1, -1)

cbow = CBOW()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

In [7]:
EPOCHS = 64
WINDOW_SIZE = 2
EMBEDDING_SIZE = 128

def get_context(i, corpus):
    context = []
    
    start = max(i - WINDOW_SIZE, 0)
    end = min(i + WINDOW_SIZE, len(corpus) - 1)
    
    for n in range(start, end):
        if n == i:
            continue
        context.append(corpus[n])
    
    return context

for epoch in range(EPOCHS):
    n_words = 0
    acc_loss = 0
    for i, word in enumerate(corpus):
        context = torch.FloatTensor(
            [one_hot[word] for word in get_context(i, corpus)])
        target = torch.LongTensor([vocabulary.index(word)])

        with torch.set_grad_enabled(True):
            output = cbow(context) # (4, 44) -> (1, 44)
            loss = criterion(output, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc_loss += float(loss)
            n_words += 1

    print(f'Epoch {epoch}: loss {acc_loss/n_words:.4f}')

Epoch 0: loss 3.8296
Epoch 1: loss 3.7166
Epoch 2: loss 3.6075
Epoch 3: loss 3.5016
Epoch 4: loss 3.3993
Epoch 5: loss 3.2989
Epoch 6: loss 3.2000
Epoch 7: loss 3.1023
Epoch 8: loss 3.0059
Epoch 9: loss 2.9115
Epoch 10: loss 2.8184
Epoch 11: loss 2.7272
Epoch 12: loss 2.6369
Epoch 13: loss 2.5483
Epoch 14: loss 2.4611
Epoch 15: loss 2.3758
Epoch 16: loss 2.2919
Epoch 17: loss 2.2095
Epoch 18: loss 2.1286
Epoch 19: loss 2.0491
Epoch 20: loss 1.9715
Epoch 21: loss 1.8952
Epoch 22: loss 1.8203
Epoch 23: loss 1.7476
Epoch 24: loss 1.6762
Epoch 25: loss 1.6067
Epoch 26: loss 1.5384
Epoch 27: loss 1.4719
Epoch 28: loss 1.4073
Epoch 29: loss 1.3445
Epoch 30: loss 1.2838
Epoch 31: loss 1.2249
Epoch 32: loss 1.1680
Epoch 33: loss 1.1135
Epoch 34: loss 1.0608
Epoch 35: loss 1.0105
Epoch 36: loss 0.9623
Epoch 37: loss 0.9161
Epoch 38: loss 0.8722
Epoch 39: loss 0.8302
Epoch 40: loss 0.7902
Epoch 41: loss 0.7526
Epoch 42: loss 0.7165
Epoch 43: loss 0.6826
Epoch 44: loss 0.6505
Epoch 45: loss 0.620

Our model is a legit PyTorch module, so it can be saved just like other models:

In [8]:
import os

if os.path.isfile('cbow.pth'):
    print('A saved checkpoint exists.')
else:
    torch.save(cbow.state_dict(), 'cbow.pth')

In [9]:
cbow.load_state_dict(torch.load('cbow.pth'))

Now, remember our corpus?

> We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create **programs** to direct processes. In effect,
we conjure the spirits of the computer with our spells.

Let's see if our model can guess the highlighted word.

In [10]:
quiz = ['people', 'create', 'to', 'direct']
output = cbow(torch.FloatTensor([one_hot[w] for w in quiz]))
_, i = output.max(dim=1)
print(vocabulary[i])

programs


In [11]:
cbow.get_word_embedding('programs')

tensor([[ 0.2883, -0.4969, -1.4704, -2.0681, -0.4435, -0.7227,  0.5137,  0.6285,
         -0.8489, -0.9536, -1.4041, -0.9300, -1.3667,  0.2518,  0.2681, -0.5468,
         -0.4464, -0.4136, -0.5547,  0.4798,  0.7436,  1.7142,  0.8034, -0.4802,
          0.2693,  1.3984,  1.0464, -0.8054,  2.4965,  0.5192, -1.4217, -0.7273,
         -0.1452, -0.9530,  0.4984,  2.1563, -1.8848,  0.3289,  0.0919, -1.8581,
         -1.0333,  0.7064,  1.1205,  0.2544, -0.6153, -1.3527, -0.8679,  0.1191,
          0.5661, -2.2058, -1.9264, -1.3815,  0.3162,  0.5200,  0.7684, -0.1095,
          0.5124, -0.1350, -1.4202, -0.5448,  1.8300, -1.3534,  0.7497, -0.5918,
         -0.1150, -1.0703, -0.7705,  0.4716,  0.2581,  0.3438,  0.4342,  1.8800,
          0.0560,  0.9551,  0.5188,  1.8680,  1.2901, -1.1749, -0.2217, -1.0311,
          0.8857, -1.1499, -0.0370,  0.5531, -0.8168,  0.1651,  0.9904,  1.3866,
          1.4316, -0.1049,  0.2189, -1.1159, -1.2116, -3.1350, -1.2240,  0.5033,
          0.0931,  0.3866, -

### Skipgram

In [12]:
class Skipgram(torch.nn.Module):
    def __init__(self):
        super(Skipgram, self).__init__()
        self.embeddings = torch.FloatTensor(len(vocabulary), EMBEDDING_SIZE).normal_()
        self.linear1 = torch.nn.Linear(EMBEDDING_SIZE, 128)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, len(vocabulary))
    
    def forward(self, x):
        x = self.embeddings[x]
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        return x.view(1, -1)
    
    def get_word_embedding(self, word):
        return self.embeddings[vocabulary.index(word)].view(1, -1)

skipgram = Skipgram()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(skipgram.parameters(), lr=0.01)

In [14]:
EPOCHS = 128
WINDOW_SIZE = 2
EMBEDDING_SIZE = 128

def get_context(i, corpus):
    context = []
    
    start = max(i - WINDOW_SIZE, 0)
    end = min(i + WINDOW_SIZE, len(corpus) - 1)
    
    for n in range(start, end):
        if n == i:
            continue
        context.append(corpus[n])
    
    return context

for epoch in range(EPOCHS):
    n_words = 0
    acc_loss = 0
    for i, word in enumerate(corpus):
        center = vocabulary.index(word)

        for word in get_context(i, corpus):
            context = torch.LongTensor([vocabulary.index(word)])

            with torch.set_grad_enabled(True):
                output = skipgram(center)
                loss = criterion(output, context)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                acc_loss += float(loss)
                n_words += 1

    print(f'Epoch {epoch}: loss {acc_loss/n_words:.4f}')

Epoch 0: loss 1.8785
Epoch 1: loss 1.8770
Epoch 2: loss 1.8744
Epoch 3: loss 1.8735
Epoch 4: loss 1.8713
Epoch 5: loss 1.8698
Epoch 6: loss 1.8685
Epoch 7: loss 1.8659
Epoch 8: loss 1.8637
Epoch 9: loss 1.8618
Epoch 10: loss 1.8598
Epoch 11: loss 1.8580
Epoch 12: loss 1.8566
Epoch 13: loss 1.8553
Epoch 14: loss 1.8529
Epoch 15: loss 1.8514
Epoch 16: loss 1.8497
Epoch 17: loss 1.8476
Epoch 18: loss 1.8458
Epoch 19: loss 1.8441
Epoch 20: loss 1.8423
Epoch 21: loss 1.8409
Epoch 22: loss 1.8395
Epoch 23: loss 1.8378
Epoch 24: loss 1.8370
Epoch 25: loss 1.8347
Epoch 26: loss 1.8325
Epoch 27: loss 1.8315
Epoch 28: loss 1.8309
Epoch 29: loss 1.8275
Epoch 30: loss 1.8264
Epoch 31: loss 1.8248
Epoch 32: loss 1.8239
Epoch 33: loss 1.8225
Epoch 34: loss 1.8210
Epoch 35: loss 1.8211
Epoch 36: loss 1.8198
Epoch 37: loss 1.8171
Epoch 38: loss 1.8157
Epoch 39: loss 1.8152
Epoch 40: loss 1.8152
Epoch 41: loss 1.8128
Epoch 42: loss 1.8115
Epoch 43: loss 1.8106
Epoch 44: loss 1.8089
Epoch 45: loss 1.807

In [18]:
if os.path.isfile('skipgram.pth'):
    print('A checkpoint already exists.')
else:
    torch.save(skipgram.state_dict(), 'skipgram.pth')

A checkpoint already exists.


In [19]:
skipgram.load_state_dict(torch.load('skipgram.pth'))

In [20]:
def get_similar(query, embeddings, top_k=10):
    embeddings = embeddings.cpu()
    query = embeddings[vocabulary.index(query)]
    similarity = (embeddings @ query) / (embeddings.norm() * query.norm())
    similarity = pd.Series(dict(zip(vocabulary, similarity.numpy())))
    similarity = similarity.sort_values(ascending=False)
    
    return similarity[:top_k]

get_similar('computer', skipgram.embeddings)

computer     0.148953
computers    0.023448
programs     0.018824
rules        0.016518
inhabit      0.015115
we           0.014397
in           0.010285
things       0.007061
conjure      0.006120
they         0.005403
dtype: float64

## GloVe: Global Vectors for Word Representation (2014)

by Jeffrey Pennington, Richard Socher, Christopher D. Manning

https://www.aclweb.org/anthology/D14-1162

On page 1534:

> We begin with a simple example that showcases
how certain aspects of meaning can be extracted
directly from co-occurrence probabilities. Consider
two words $i$ and $j$ that exhibit a particular aspect
of interest; for concreteness, suppose we are
interested in the concept of thermodynamic phase,
for which we might take $i = ice$ and $j = steam$.
The relationship of these words can be examined
by studying the ratio of their co-occurrence probabilities
with various probe words, $k$. For words
$k$ related to $ice$ but not $steam$, say $k = solid$, we
expect the ratio $Pik / Pjk$ will be large. Similarly,
for words $k$ related to $steam$ but not $ice$, say $k =
gas$, the ratio should be small. For words $k$ like
$water$ or $fashion$, that are either related to both $ice$
and $steam$, or to neither, the ratio should be close
to one. Table 1 shows these probabilities and their
ratios for a large corpus, and the numbers confirm
these expectations. Compared to the raw probabilities,
the ratio is better able to distinguish relevant
words ($solid$ and $gas$) from irrelevant words ($water$
and $fashion$) and it is also better able to discriminate
between the two relevant words.

$$
\frac{P_{solid | ice}}{P_{solid | steam}} >
\frac{P_{fashion | ice}}{P_{fashion | steam}} >
\frac{P_{gas | ice}}{P_{gas | steam}}
$$

> The above argument suggests that the appropriate
starting point for word vector learning should
be with ratios of co-occurrence probabilities rather
than the probabilities themselves. Noting that the
ratio $P_{ik} /P_{jk}$ depends on three words $i$, $j$, and $k$,
the most general model takes the form,

$$
F(w_i, w_j, \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> The number of possibilities for $F$ is vast,
but by enforcing a few desiderata we can select a
unique choice. First, we would like $F$ to encode
the information present the ratio $Pik / Pjk$ in the
word vector space. Since vector spaces are inherently
linear structures, the most natural way to do
this is with vector differences.

$$
F(w_i - w_j, \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> Next, we note that the arguments of $F$ in Eqn. (2)
are vectors while the right-hand side is a scalar.
While $F$ could be taken to be a complicated function
parameterized by, e.g., a neural network, doing
so would obfuscate the linear structure we are
trying to capture. To avoid this issue, we can first
take the dot product of the arguments,

$$
F((w_i - w_j)^T \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> Next, note that for
word-word co-occurrence matrices, the distinction
between a word and a context word is arbitrary and
that we are free to exchange the two roles. To do so
consistently, we must not only exchange $w \leftrightarrow \tilde{w}$
but also $X \leftrightarrow X^T$. Our final model should be invariant
under this relabeling, but Eqn. (3) is not.
However, the symmetry can be restored in two
steps. First, we require that $F$ be a homomorphism
between the groups $(\mathbb{R}, +)$ and $(\mathbb{R}_{>0}, \times)$, i.e.,

$$
F(X-Y)=\frac { F(X) }{ F(Y) }
$$

$$
F(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

$$
F(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{F(w_i^T \tilde{w}_k)}{F(w_j^T \tilde{w}_k)}
$$

$$
\exp(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{\exp(w_i^T \tilde{w}_k)}{\exp(w_j^T \tilde{w}_k)}
$$

$$F = \exp$$

Page 1533:
> Let the matrix
of word-word co-occurrence counts be denoted by
$X$, whose entries $X_{ij}$ tabulate the number of times
word $j$ occurs in the context of word $i$. Let $X_i = \sum_k X_{ik}$
be the number of times any word appears
in the context of word $i$. Finally, let
$P_{ij} = P(j|i) = X_{ij}/X_i$be the probability that word $j$ appear in the
context of word $i$.

$$
F(w_i^T \tilde{w}_k) = P_{ik} = \frac{X_{ik}}{X_i}
$$

$$
w_i^T \tilde { w }_k =\log { { P }_{ ik } } =\log ({ X_{ ik })-\log ({ X_{ i } })  }
$$

Page 1535:
> Next, we note that Eqn. (6) would exhibit the exchange
symmetry if not for the $log(X_i)$ on the
right-hand side. 

$$
\log(X_{ik})-\log(X_i) \neq \log(X_{ki})-\log(X_k)
$$

> However, this term is independent
of $k$ so it can be absorbed into a bias $b_i$ for
$w+i$. Finally, adding an additional bias $\tilde{b}_k$ for $\tilde{w}_k$
restores the symmetry,

$$
{ w }_{ i }^{ T }\tilde { { w }_{ k } } +{ b }_{ i }+\tilde { { b }_{ k } } =\log ({ X_{ ik }})
$$

## Building the vocabulary and counting co-occurrence (again)

Today's dataset, an English monolingual corpus, can be found [here](https://drive.google.com/open?id=1__lK0x_k8gtyV27QZqQUGSC4jlaQAZSC).

In [22]:
from collections import defaultdict

FILE = 'ted.en.txt'
WINDOW_SIZE = 10

vocabulary = defaultdict(int)
co_occurrence = defaultdict(int)

with open(FILE) as f:
    sentences = f.readlines()

for sentence in sentences:
    words = sentence.split(' ')
    for i in range(len(words)):
        vocabulary[words[i]] += 1

        for j in range(i + 1, i + WINDOW_SIZE + 1):
            if j >= len(words):
                break
            keys = tuple(sorted([words[i], words[j]]))
            co_occurrence[keys] += 1

Let's see how much words we have gathered.

In [23]:
len(vocabulary)

77599

Show some love!

In [24]:
'love' in vocabulary

True

How much?

In [25]:
vocabulary['love']

2444

Let's convert the dictionary into a Pandas Series for convinience.

In [26]:
import pandas as pd

MIN_OCCURRENCE = 10

vocabulary = pd.Series(vocabulary, dtype='uint16')

And with the help of Pandas, let's set a minimum frequency threshold to trim the vocabulary.

In [27]:
vocabulary = vocabulary[vocabulary >= MIN_OCCURRENCE]
len(vocabulary)

16754

In [28]:
'love' in vocabulary

True

In [29]:
import numpy as np

X_ij = np.zeros((len(vocabulary), len(vocabulary)), dtype='uint16')

for (word_i, word_j), value in co_occurrence.items():
    try:
        i = vocabulary.index.get_loc(word_i)
        j = vocabulary.index.get_loc(word_j)
    except KeyError:
        continue

    X_ij[i][j] = value
    X_ij[j][i] = value

In [30]:
X_ij

array([[  28, 2145,    3, ...,    0,    0,    0],
       [2145, 5716,   49, ...,    4,    0,    4],
       [   3,   49,    2, ...,    0,    0,    0],
       ...,
       [   0,    4,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    4,    0, ...,    0,    0,    0]], dtype=uint16)

$$
{ w }_{ i }^{ T }\tilde { { w }_{ k } } +{ b }_{ i }+\tilde { { b }_{ k } } =\log ({ X_{ ik }})
$$

In [31]:
from itertools import chain
import torch

DIM = 128
ITERATIONS = 32
X_MAX = 100
ALPHA = 3/4
GPU_ID = 2

n_words = X_ij.shape[0]

X = torch.from_numpy(X_ij.astype('float32')).add_(1)
w_main = torch.FloatTensor(n_words, DIM).uniform_(-0.5, 0.5)
w_context = torch.FloatTensor(n_words, DIM).uniform_(-0.5, 0.5)
b_main = torch.FloatTensor(n_words).uniform_(-0.5, 0.5)
b_context = torch.FloatTensor(n_words).uniform_(-0.5, 0.5)

if torch.cuda.is_available():
    X = X.cuda(device=GPU_ID)
    w_main = w_main.cuda(device=GPU_ID)
    w_context = w_context.cuda(device=GPU_ID)
    b_main = b_main.cuda(device=GPU_ID)
    b_context = b_context.cuda(device=GPU_ID)

X.requires_grad_(False)
w_main.requires_grad_(True)
w_context.requires_grad_(True)
b_main.requires_grad_(True)
b_context.requires_grad_(True)

criterion = torch.nn.MSELoss(reduction='none')
optimizer = torch.optim.Adam([w_main, w_context, b_main, b_context],
                             lr=1e-3, weight_decay=1e-15)

with torch.set_grad_enabled(True):
    for iteration in range(ITERATIONS):
        acc_loss = 0
        for j in torch.randperm(n_words):
            output = w_main @ w_context[j]
            output += b_main
            output += b_context[j]
            
            loss = criterion(output, X[:, j].log() + 1e-15)
            
            loss_weight = (X[:, j] / X_MAX) ** ALPHA
            loss_weight[X[:, j] > X_MAX] = 1

            optimizer.zero_grad()
            loss.backward(loss_weight)
            optimizer.step()
            
            acc_loss += float(loss.mean())
        
        print(f'iteration {iteration}, loss {acc_loss/n_words:.4f}')

iteration 0, loss 0.2922
iteration 1, loss 0.1486
iteration 2, loss 0.0908
iteration 3, loss 0.0709
iteration 4, loss 0.0592
iteration 5, loss 0.0532
iteration 6, loss 0.0490
iteration 7, loss 0.0460
iteration 8, loss 0.0436
iteration 9, loss 0.0404
iteration 10, loss 0.0402
iteration 11, loss 0.0377
iteration 12, loss 0.0370
iteration 13, loss 0.0349
iteration 14, loss 0.0342
iteration 15, loss 0.0326
iteration 16, loss 0.0322
iteration 17, loss 0.0319
iteration 18, loss 0.0311
iteration 19, loss 0.0305
iteration 20, loss 0.0299
iteration 21, loss 0.0296
iteration 22, loss 0.0290
iteration 23, loss 0.0287
iteration 24, loss 0.0285
iteration 25, loss 0.0281
iteration 26, loss 0.0278
iteration 27, loss 0.0276
iteration 28, loss 0.0275
iteration 29, loss 0.0270
iteration 30, loss 0.0270
iteration 31, loss 0.0268


In [33]:
if os.path.isfile('glove.pth'):
    print('A checkpoint already exists.')
else:
    torch.save([w_main, w_context, b_main, b_context], 'glove.pth')

In [34]:
w_main, w_context, b_main, b_context = torch.load('glove.pth')

In [57]:
def v(word):
    i = vocabulary.index.get_loc(word)
    return w_main[i].cpu()

def analogy(target, top_k=20):
    target /= target.norm()
    
    with torch.no_grad():
        similarity = (w_main.cpu() @ target) / (w_main.cpu().norm() * target.norm())
        similarity = pd.Series(dict(zip(vocabulary.keys(), similarity.numpy())))
        similarity = similarity[vocabulary < 500]
        similarity = similarity.sort_values(ascending=False)
    
    return similarity.sort_values(ascending=False)[:top_k]

analogy(v('wife') - v('man') + v('woman'))

son           0.013207
wife          0.012656
daughter      0.012159
mom           0.011667
college       0.011257
brother       0.011073
husband       0.011053
named         0.010624
sister        0.010571
teacher       0.010530
dad           0.010478
moved         0.010311
career        0.010187
hospital      0.010160
doctor        0.010071
boy           0.009906
colleagues    0.009823
walked        0.009803
month         0.009781
sat           0.009772
dtype: float64

In [55]:
analogy(v('robots') - v('robot') + v('body'))

bodies       0.011712
develop      0.011325
treat        0.010971
healthy      0.010836
skin         0.010540
genetic      0.010443
protect      0.010405
materials    0.010383
brains       0.010379
measure      0.010312
genes        0.010257
neurons      0.010108
objects      0.010095
models       0.010018
decisions    0.009981
function     0.009876
improve      0.009812
interact     0.009690
machines     0.009677
plants       0.009677
dtype: float64