# Word Embedding

In [1]:
import torch

## Word2Vec (2013)

[Efficient Estimation of Word Representations in Vector Space](https://arxiv.org/pdf/1301.3781.pdf)

by Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean

![](http://i.imgur.com/agTBWiT.png)

### Continuous Bag-Of-Words vs. Skip-gram

* CBOW: guessing the blank
* Skip-gram: guessing the neighbors

![](https://ascelibrary.org/cms/attachment/83d45b70-be2d-4dae-a37a-e3b51af0b7c4/figure3.jpg)

In [2]:
NANO_CORPUS = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""

In [3]:
corpus = NANO_CORPUS.lower().replace(',', ' ').replace('.', ' ').split()
print(corpus)

['we', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process', 'computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers', 'as', 'they', 'evolve', 'processes', 'manipulate', 'other', 'abstract', 'things', 'called', 'data', 'the', 'evolution', 'of', 'a', 'process', 'is', 'directed', 'by', 'a', 'pattern', 'of', 'rules', 'called', 'a', 'program', 'people', 'create', 'programs', 'to', 'direct', 'processes', 'in', 'effect', 'we', 'conjure', 'the', 'spirits', 'of', 'the', 'computer', 'with', 'our', 'spells']


In [4]:
import pandas as pd

vocabulary = list(set(corpus))

Before we begin anything, we need to create a one-hot vector of the words. Pandas is great at this.

In [5]:
one_hot = pd.get_dummies(vocabulary)

In [6]:
EMBEDDING_SIZE = 128

class CBOW(torch.nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        self.embeddings = torch.FloatTensor(len(vocabulary), EMBEDDING_SIZE).uniform_()
        self.linear1 = torch.nn.Linear(EMBEDDING_SIZE, 128)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, len(vocabulary))
    
    def forward(self, x):
        x = torch.sum(self.embeddings * x.sum(dim=0).view(-1, 1), dim=0)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        
        return x.view(1, -1)
    
    def get_word_embedding(self, word):
        return self.embeddings[vocabulary.index(word)].view(1, -1)

cbow = CBOW()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.001)

In [7]:
EPOCHS = 64
WINDOW_SIZE = 2
EMBEDDING_SIZE = 128

def get_context(i, corpus):
    context = []
    
    start = max(i - WINDOW_SIZE, 0)
    end = min(i + WINDOW_SIZE, len(corpus) - 1)
    
    for n in range(start, end):
        if n == i:
            continue
        context.append(corpus[n])
    
    return context

for epoch in range(EPOCHS):
    n_words = 0
    acc_loss = 0
    for i, word in enumerate(corpus):
        context = torch.FloatTensor(
            [one_hot[word] for word in get_context(i, corpus)])
        target = torch.LongTensor([vocabulary.index(word)])

        with torch.set_grad_enabled(True):
            output = cbow(context)
            loss = criterion(output, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            acc_loss += float(loss)
            n_words += 1

    print(f'Epoch {epoch}: loss {acc_loss/n_words:.4f}')

Epoch 0: loss 3.8188
Epoch 1: loss 3.7716
Epoch 2: loss 3.7352
Epoch 3: loss 3.7072
Epoch 4: loss 3.6840
Epoch 5: loss 3.6644
Epoch 6: loss 3.6472
Epoch 7: loss 3.6329
Epoch 8: loss 3.6191
Epoch 9: loss 3.6072
Epoch 10: loss 3.5957
Epoch 11: loss 3.5846
Epoch 12: loss 3.5741
Epoch 13: loss 3.5641
Epoch 14: loss 3.5537
Epoch 15: loss 3.5436
Epoch 16: loss 3.5335
Epoch 17: loss 3.5236
Epoch 18: loss 3.5136
Epoch 19: loss 3.5038
Epoch 20: loss 3.4943
Epoch 21: loss 3.4844
Epoch 22: loss 3.4744
Epoch 23: loss 3.4649
Epoch 24: loss 3.4551
Epoch 25: loss 3.4453
Epoch 26: loss 3.4347
Epoch 27: loss 3.4244
Epoch 28: loss 3.4139
Epoch 29: loss 3.4038
Epoch 30: loss 3.3933
Epoch 31: loss 3.3828
Epoch 32: loss 3.3721
Epoch 33: loss 3.3616
Epoch 34: loss 3.3507
Epoch 35: loss 3.3397
Epoch 36: loss 3.3281
Epoch 37: loss 3.3167
Epoch 38: loss 3.3053
Epoch 39: loss 3.2938
Epoch 40: loss 3.2825
Epoch 41: loss 3.2704
Epoch 42: loss 3.2592
Epoch 43: loss 3.2466
Epoch 44: loss 3.2354
Epoch 45: loss 3.222

Now, remember our corpus?

> We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create **programs** to direct processes. In effect,
we conjure the spirits of the computer with our spells.

Let's see if our model can guess the highlighted word.

In [8]:
quiz = ['people', 'create', 'to', 'direct']
output = cbow(torch.FloatTensor([one_hot[w] for w in quiz]))
_, i = output.max(dim=1)
print(vocabulary[i])

processes


In [9]:
cbow.get_word_embedding('programs')

tensor([[0.8489, 0.9017, 0.1659, 0.9903, 0.4600, 0.0077, 0.8274, 0.8722, 0.4643,
         0.9406, 0.1229, 0.9057, 0.7660, 0.2854, 0.8600, 0.1224, 0.8084, 0.1031,
         0.0346, 0.8578, 0.3230, 0.7069, 0.6619, 0.9268, 0.3734, 0.4252, 0.5809,
         0.6659, 0.0659, 0.7780, 0.2868, 0.2424, 0.4232, 0.5825, 0.5407, 0.5262,
         0.5431, 0.6958, 0.7654, 0.7655, 0.9760, 0.6818, 0.2394, 0.6063, 0.5122,
         0.6355, 0.1387, 0.6136, 0.1484, 0.9226, 0.1001, 0.2557, 0.1510, 0.1334,
         0.8858, 0.8536, 0.8248, 0.1578, 0.3673, 0.1430, 0.4194, 0.4097, 0.2029,
         0.1457, 0.8110, 0.3857, 0.3127, 0.6986, 0.2857, 0.0669, 0.6471, 0.9752,
         0.1169, 0.6310, 0.7778, 0.0909, 0.2148, 0.5374, 0.5005, 0.6790, 0.9398,
         0.9529, 0.0839, 0.6369, 0.1430, 0.5669, 0.7131, 0.8976, 0.0221, 0.3495,
         0.9691, 0.9676, 0.8904, 0.6848, 0.7213, 0.9607, 0.1187, 0.0618, 0.3771,
         0.7712, 0.5259, 0.9896, 0.1445, 0.3457, 0.6096, 0.5555, 0.1200, 0.9508,
         0.9294, 0.4935, 0.3

In [10]:
class Skipgram(torch.nn.Module):
    def __init__(self):
        super(Skipgram, self).__init__()
        self.embeddings = torch.FloatTensor(len(vocabulary), EMBEDDING_SIZE).normal_()
        self.linear1 = torch.nn.Linear(EMBEDDING_SIZE, 128)
        self.relu1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(128, len(vocabulary))
    
    def forward(self, x):
        x = self.embeddings[x]
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        return x.view(1, -1)
    
    def get_word_embedding(self, word):
        return self.embeddings[vocabulary.index(word)].view(1, -1)

skipgram = Skipgram()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(skipgram.parameters(), lr=0.01)

In [11]:
EPOCHS = 64
WINDOW_SIZE = 2
EMBEDDING_SIZE = 128

def get_context(i, corpus):
    context = []
    
    start = max(i - WINDOW_SIZE, 0)
    end = min(i + WINDOW_SIZE, len(corpus) - 1)
    
    for n in range(start, end):
        if n == i:
            continue
        context.append(corpus[n])
    
    return context

for epoch in range(EPOCHS):
    n_words = 0
    acc_loss = 0
    for i, word in enumerate(corpus):
        center = vocabulary.index(word)

        for word in get_context(i, corpus):
            context = torch.LongTensor([vocabulary.index(word)])

            with torch.set_grad_enabled(True):
                output = skipgram(center)
                loss = criterion(output, context)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                acc_loss += float(loss)
                n_words += 1

    print(f'Epoch {epoch}: loss {acc_loss/n_words:.4f}')

Epoch 0: loss 3.7707
Epoch 1: loss 3.4202
Epoch 2: loss 3.1437
Epoch 3: loss 2.8998
Epoch 4: loss 2.6799
Epoch 5: loss 2.4936
Epoch 6: loss 2.3523
Epoch 7: loss 2.2548
Epoch 8: loss 2.1937
Epoch 9: loss 2.1552
Epoch 10: loss 2.1308
Epoch 11: loss 2.1155
Epoch 12: loss 2.1030
Epoch 13: loss 2.0913
Epoch 14: loss 2.0818
Epoch 15: loss 2.0730
Epoch 16: loss 2.0654
Epoch 17: loss 2.0574
Epoch 18: loss 2.0503
Epoch 19: loss 2.0435
Epoch 20: loss 2.0366
Epoch 21: loss 2.0307
Epoch 22: loss 2.0243
Epoch 23: loss 2.0168
Epoch 24: loss 2.0101
Epoch 25: loss 2.0066
Epoch 26: loss 1.9998
Epoch 27: loss 1.9944
Epoch 28: loss 1.9898
Epoch 29: loss 1.9845
Epoch 30: loss 1.9803
Epoch 31: loss 1.9746
Epoch 32: loss 1.9709
Epoch 33: loss 1.9668
Epoch 34: loss 1.9631
Epoch 35: loss 1.9589
Epoch 36: loss 1.9551
Epoch 37: loss 1.9510
Epoch 38: loss 1.9477
Epoch 39: loss 1.9440
Epoch 40: loss 1.9401
Epoch 41: loss 1.9391
Epoch 42: loss 1.9345
Epoch 43: loss 1.9311
Epoch 44: loss 1.9285
Epoch 45: loss 1.925

In [12]:
def get_similar(query, embeddings, top_k=10):
    embeddings = embeddings.cpu()
    query = embeddings[vocabulary.index(query)]
    similarity = (embeddings @ query) / (embeddings.norm() * query.norm())
    similarity = pd.Series(dict(zip(vocabulary, similarity.numpy())))
    similarity = similarity.sort_values(ascending=False)
    
    return similarity[:top_k]

get_similar('people', skipgram.embeddings)

people       0.150772
programs     0.032084
beings       0.029256
spirits      0.021663
abstract     0.020167
process      0.019939
direct       0.018144
data         0.017356
computers    0.014036
inhabit      0.013199
dtype: float64

## GloVe: Global Vectors for Word Representation (2014)

by Jeffrey Pennington, Richard Socher, Christopher D. Manning

https://www.aclweb.org/anthology/D14-1162

On page 1534:

> We begin with a simple example that showcases
how certain aspects of meaning can be extracted
directly from co-occurrence probabilities. Consider
two words $i$ and $j$ that exhibit a particular aspect
of interest; for concreteness, suppose we are
interested in the concept of thermodynamic phase,
for which we might take $i = ice$ and $j = steam$.
The relationship of these words can be examined
by studying the ratio of their co-occurrence probabilities
with various probe words, $k$. For words
$k$ related to $ice$ but not $steam$, say $k = solid$, we
expect the ratio $Pik / Pjk$ will be large. Similarly,
for words $k$ related to $steam$ but not $ice$, say $k =
gas$, the ratio should be small. For words $k$ like
$water$ or $fashion$, that are either related to both $ice$
and $steam$, or to neither, the ratio should be close
to one. Table 1 shows these probabilities and their
ratios for a large corpus, and the numbers confirm
these expectations. Compared to the raw probabilities,
the ratio is better able to distinguish relevant
words ($solid$ and $gas$) from irrelevant words ($water$
and $fashion$) and it is also better able to discriminate
between the two relevant words.

$$
\frac{P_{solid | ice}}{P_{solid | steam}} >
\frac{P_{fashion | ice}}{P_{fashion | steam}} >
\frac{P_{gas | ice}}{P_{gas | steam}}
$$

> The above argument suggests that the appropriate
starting point for word vector learning should
be with ratios of co-occurrence probabilities rather
than the probabilities themselves. Noting that the
ratio $P_{ik} /P_{jk}$ depends on three words $i$, $j$, and $k$,
the most general model takes the form,

$$
F(w_i, w_j, \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> The number of possibilities for $F$ is vast,
but by enforcing a few desiderata we can select a
unique choice. First, we would like $F$ to encode
the information present the ratio $Pik / Pjk$ in the
word vector space. Since vector spaces are inherently
linear structures, the most natural way to do
this is with vector differences.

$$
F(w_i - w_j, \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> Next, we note that the arguments of $F$ in Eqn. (2)
are vectors while the right-hand side is a scalar.
While $F$ could be taken to be a complicated function
parameterized by, e.g., a neural network, doing
so would obfuscate the linear structure we are
trying to capture. To avoid this issue, we can first
take the dot product of the arguments,

$$
F((w_i - w_j)^T \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

> Next, note that for
word-word co-occurrence matrices, the distinction
between a word and a context word is arbitrary and
that we are free to exchange the two roles. To do so
consistently, we must not only exchange $w \leftrightarrow \tilde{w}$
but also $X \leftrightarrow X^T$. Our final model should be invariant
under this relabeling, but Eqn. (3) is not.
However, the symmetry can be restored in two
steps. First, we require that $F$ be a homomorphism
between the groups $(\mathbb{R}, +)$ and $(\mathbb{R}_{>0}, \times)$, i.e.,

$$
F(X-Y)=\frac { F(X) }{ F(Y) }
$$

$$
F(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{P_{ik}}{P_{jk}}
$$

$$
F(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{F(w_i^T \tilde{w}_k)}{F(w_j^T \tilde{w}_k)}
$$

$$
\exp(w_i^T \tilde{w}_k - w_j^T \tilde{w}_k) = \frac{\exp(w_i^T \tilde{w}_k)}{\exp(w_j^T \tilde{w}_k)}
$$

$$F = \exp$$

Page 1533:
> Let the matrix
of word-word co-occurrence counts be denoted by
$X$, whose entries $X_{ij}$ tabulate the number of times
word $j$ occurs in the context of word $i$. Let $X_i = \sum_k X_{ik}$
be the number of times any word appears
in the context of word $i$. Finally, let
$P_{ij} = P(j|i) = X_{ij}/X_i$be the probability that word $j$ appear in the
context of word $i$.

$$
F(w_i^T \tilde{w}_k) = P_{ik} = \frac{X_{ik}}{X_i}
$$

$$
w_i^T \tilde { w }_k =\log { { P }_{ ik } } =\log ({ X_{ ik })-\log ({ X_{ i } })  }
$$

Page 1535:
> Next, we note that Eqn. (6) would exhibit the exchange
symmetry if not for the $log(X_i)$ on the
right-hand side. 

$$
\log(X_{ik})-\log(X_i) \neq \log(X_{ki})-\log(X_k)
$$

> However, this term is independent
of $k$ so it can be absorbed into a bias $b_i$ for
$w+i$. Finally, adding an additional bias $\tilde{b}_k$ for $\tilde{w}_k$
restores the symmetry,

$$
{ w }_{ i }^{ T }\tilde { { w }_{ k } } +{ b }_{ i }+\tilde { { b }_{ k } } =\log ({ X_{ ik }})
$$

## Building the vocabulary and counting co-occurrence (again)

Today's dataset, an English monolingual corpus, can be found [here](https://drive.google.com/open?id=1__lK0x_k8gtyV27QZqQUGSC4jlaQAZSC).

In [13]:
from collections import defaultdict

FILE = 'ted.en.txt'
WINDOW_SIZE = 10

vocabulary = defaultdict(int)
co_occurrence = defaultdict(int)

with open(FILE) as f:
    sentences = f.readlines()

for sentence in sentences:
    words = sentence.split(' ')
    for i in range(len(words)):
        vocabulary[words[i]] += 1

        for j in range(i + 1, i + WINDOW_SIZE + 1):
            if j >= len(words):
                break
            keys = tuple(sorted([words[i], words[j]]))
            co_occurrence[keys] += 1

Let's see how much words we have gathered.

In [14]:
len(vocabulary)

77599

Show some love!

In [15]:
'love' in vocabulary

True

How much?

In [16]:
vocabulary['love']

2444

Let's convert the dictionary into a Pandas Series for convinience.

In [17]:
import pandas as pd

MIN_OCCURRENCE = 10

vocabulary = pd.Series(vocabulary, dtype='uint16')

And with the help of Pandas, let's set a minimum frequency threshold to trim the vocabulary.

In [18]:
vocabulary = vocabulary[vocabulary >= MIN_OCCURRENCE]
len(vocabulary)

16754

In [19]:
'love' in vocabulary

True

In [20]:
import numpy as np

X_ij = np.zeros((len(vocabulary), len(vocabulary)), dtype='uint16')

for (word_i, word_j), value in co_occurrence.items():
    try:
        i = vocabulary.index.get_loc(word_i)
        j = vocabulary.index.get_loc(word_j)
    except KeyError:
        continue

    X_ij[i][j] = value
    X_ij[j][i] = value

$$
{ w }_{ i }^{ T }\tilde { { w }_{ k } } +{ b }_{ i }+\tilde { { b }_{ k } } =\log ({ X_{ ik }})
$$

In [21]:
from itertools import chain
import torch

DIM = 128
ITERATIONS = 32
X_MAX = 100
ALPHA = 3/4
GPU_ID = 2

n_words = X_ij.shape[0]

X = torch.from_numpy(X_ij.astype('float32'))
w_main = torch.FloatTensor(n_words, DIM).uniform_(-0.5, 0.5)
w_context = torch.FloatTensor(n_words, DIM).uniform_(-0.5, 0.5)
b_main = torch.FloatTensor(n_words).uniform_(-0.5, 0.5)
b_context = torch.FloatTensor(n_words).uniform_(-0.5, 0.5)

if torch.cuda.is_available():
    X = X.cuda(device=GPU_ID)
    w_main = w_main.cuda(device=GPU_ID)
    w_context = w_context.cuda(device=GPU_ID)
    b_main = b_main.cuda(device=GPU_ID)
    b_context = b_context.cuda(device=GPU_ID)

X.requires_grad_(False)
w_main.requires_grad_(True)
w_context.requires_grad_(True)
b_main.requires_grad_(True)
b_context.requires_grad_(True)

criterion = torch.nn.MSELoss(reduction='none')
optimizer = torch.optim.Adam([w_main, w_context, b_main, b_context],
                             lr=1e-3, weight_decay=1e-15)

with torch.set_grad_enabled(True):
    for iteration in range(ITERATIONS):
        acc_loss = 0
        for j in torch.randperm(n_words):
            output = w_main @ w_context[j]
            output += b_main
            output += b_context[j]
            
            loss = criterion(output, X[:, j].log() + 1e-15)
            
            loss_weight = (X[:, j] / X_MAX) ** ALPHA
            loss_weight[X[:, j] > X_MAX] = 1

            optimizer.zero_grad()
            loss.backward(loss_weight)
            optimizer.step()
            
            acc_loss += float(loss.mean())
        
        print(f'iteration {iteration}, loss {acc_loss/n_words:.4f}')

iteration 0, loss 0.2815
iteration 1, loss 0.1468
iteration 2, loss 0.0932
iteration 3, loss 0.0726
iteration 4, loss 0.0588
iteration 5, loss 0.0514
iteration 6, loss 0.0481
iteration 7, loss 0.0448
iteration 8, loss 0.0430
iteration 9, loss 0.0412
iteration 10, loss 0.0402
iteration 11, loss 0.0376
iteration 12, loss 0.0369
iteration 13, loss 0.0347
iteration 14, loss 0.0340
iteration 15, loss 0.0331
iteration 16, loss 0.0321
iteration 17, loss 0.0317
iteration 18, loss 0.0306
iteration 19, loss 0.0302
iteration 20, loss 0.0296
iteration 21, loss 0.0295
iteration 22, loss 0.0288
iteration 23, loss 0.0287
iteration 24, loss 0.0280
iteration 25, loss 0.0281
iteration 26, loss 0.0275
iteration 27, loss 0.0281
iteration 28, loss 0.0273
iteration 29, loss 0.0277
iteration 30, loss 0.0271
iteration 31, loss 0.0270


In [22]:
def v(word):
    i = vocabulary.index.get_loc(word)
    return w_main[i].cpu()

def analogy(target, top_k=20):
    target /= target.norm()
    
    with torch.no_grad():
        similarity = (w_main.cpu() @ target) / (w_main.cpu().norm() * target.norm())
        similarity = pd.Series(dict(zip(vocabulary.keys(), similarity.numpy())))
        similarity = similarity[vocabulary < 500]
        similarity = similarity.sort_values(ascending=False)
    
    return similarity.sort_values(ascending=False)[:top_k]

analogy(v('husband') - v('man') + v('woman'))

son            0.012575
wife           0.012556
mom            0.012284
daughter       0.011946
college        0.011625
dad            0.011515
husband        0.011464
sister         0.011447
hospital       0.011309
student        0.011260
brother        0.011109
sat            0.010950
walked         0.010775
grandmother    0.010656
colleagues     0.010607
boy            0.010593
favorite       0.010455
bed            0.010370
University     0.010326
meeting        0.010304
dtype: float64

In [23]:
analogy(v('heaven') - v('good') + v('bad'))

oh             0.005673
Get            0.005164
Come           0.005076
yeah           0.004982
Man            0.004979
Ah             0.004951
title          0.004930
Don            0.004890
sister         0.004881
son            0.004869
threw          0.004859
bag            0.004843
mouth          0.004802
neck           0.004801
click          0.004799
dad            0.004791
finger         0.004739
Audience       0.004720
grandmother    0.004713
mom            0.004695
dtype: float64

## Visualization using PCA and t-SNE

![](https://scontent-icn1-1.xx.fbcdn.net/v/t1.0-9/41425661_1809264752526756_3946431284045152256_n.jpg?_nc_cat=107&oh=e0b118959eaf0d6c7c97ce71b8c1136d&oe=5C20EDF1)

* PCA
  * good for dimensionality reduction
  * not always good for visualization
  * weak against non-linear data
* t-SNE
  * good for visualization
  * not so good for dimensionality reduction
  * strong against non-linear data

[FastText](https://fasttext.cc/): Library for efficient text classification and representation learning

In [None]:
import torchtext
fasttext = torchtext.vocab.FastText(language='simple')

In [None]:
fasttext['love']

In [None]:
fasttext.vectors.size()

In [None]:
import numpy as np
from sklearn.decomposition import PCA

def pca(vocabulary, embeddings, n_points):
    np.random.seed(0)

    frequent = vocabulary[vocabulary < 2000].sort_values(ascending=False).index[:n_points]
    indices = [vocabulary.index.get_loc(word) for word in frequent]
    
    pca = PCA(n_components=2, random_state=0)
    with torch.no_grad():
        results = pca.fit_transform(embeddings[indices])
    
    plt.figure(figsize=(15, 15))
    for i in range(n_points):
        query = vocabulary.index[indices[i]]
        x, y = results[i]
        plt.scatter(x, y, label=query)
        
        # Prevent label overlapping by applying random offsets.
        offset_x = np.random.randint(-35, 12) / 2000
        offset_y = np.random.randint(-30, 15) / 2000
        
        plt.annotate(query, (x + offset_x, y + offset_y))
        
    plt.show()
    
pca(vocabulary, w_main, 100)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
    
def tsne(vocabulary, embeddings, n_points):
    np.random.seed(0)

    frequent = vocabulary[vocabulary < 2000].sort_values(ascending=False).index[:n_points]
    indices = [vocabulary.index.get_loc(word) for word in frequent]

    tsne = TSNE(n_components=2, random_state=0)
    with torch.no_grad():
        results = tsne.fit_transform(embeddings[indices])
    
    plt.figure(figsize=(15, 15))
    for i in range(n_points):
        query = vocabulary.index[indices[i]]
        x, y = results[i]
        plt.scatter(x, y, label=query)
        
        # Prevent label overlapping by applying random offsets.
        offset_x = np.random.randint(-35, 12) / 100
        offset_y = np.random.randint(-30, 15) / 100
        
        plt.annotate(query, (x + offset_x, y + offset_y))
        
    plt.show()

tsne(vocabulary, w_main, 200)