# CS224n Lecture 01
A minimal skip-gram Word2Vec implementation in PyTorch, trained on a small toy corpus.

Date: 2021/01/11

In [8]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F

### Preprocessing

In [9]:
# Toy corpus: one lowercase, whitespace-separated sentence per entry.
corpus = [
    # Statements about people.
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    # Statements about capital cities.
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]

In [10]:
def tokenize(corpus):
    """Split every sentence on whitespace and wrap it in boundary markers.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list holding one token list per sentence; each token list starts
        with 'BOS' and ends with 'EOS'.
    """
    tokens = []
    for sent in corpus:
        tokens.append(['BOS', *sent.split(), 'EOS'])
    return tokens

# Tokenize the whole corpus once; the cells below read tokenized_corpus.
tokenized_corpus = tokenize(corpus)
# Sanity check: the first sentence is split and wrapped in BOS/EOS markers.
assert ['BOS', 'he', 'is', 'a', 'king', 'EOS'] == tokenized_corpus[0]

In [12]:
# Build the vocabulary in first-seen order. dict.fromkeys de-duplicates in a
# single pass (dicts keep insertion order), replacing the previous
# `token not in vocab` list scan, which was quadratic in the vocabulary size.
vocab = list(dict.fromkeys(
    token for sent in tokenized_corpus for token in sent
))

# Two-way lookup between a token and its integer id (its position in vocab).
word2idx = {w: idx for (idx, w) in enumerate(vocab)}
idx2word = dict(enumerate(vocab))

vocab_size = len(vocab)

In [13]:
# Number of context words taken on each side of a centre word.
window_size = 2
# Collected (centre_id, context_id) training pairs.
idx_pairs = []

for sent in tokenized_corpus:
    indices = [word2idx[word] for word in sent]
    print(indices)
    # Only positions with a full window on both sides are used as centres, so
    # the first and last `window_size` tokens of a sentence appear as context
    # only. The bounds follow window_size (they were hard-coded to 2), so a
    # different window can neither wrap around through a negative index nor
    # run past the end of the sentence.
    for c_pos in range(window_size, len(indices) - window_size):
        for watch in range(1, window_size + 1):
            idx_pairs.append((indices[c_pos], indices[c_pos - watch]))
            idx_pairs.append((indices[c_pos], indices[c_pos + watch]))

# One row per pair: column 0 is the centre id, column 1 the context id.
idx_pairs = np.array(idx_pairs)

[0, 1, 2, 3, 4, 5]
[0, 6, 2, 3, 7, 5]
[0, 1, 2, 3, 8, 5]
[0, 6, 2, 3, 9, 5]
[0, 10, 2, 11, 12, 5]
[0, 13, 2, 14, 12, 5]
[0, 15, 2, 16, 12, 5]


### Model

In [14]:
def get_input_layer(word_idx):
    """Return a one-hot float32 vector of length vocab_size for word_idx.

    Args:
        word_idx: position to set to 1.0; every other entry stays 0.0.

    Returns:
        A 1-D float tensor with vocab_size entries.
    """
    one_hot = torch.zeros(vocab_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

In [15]:
# Seed the global RNG so the random initialisation below, and with it the
# reported losses and similarities, is reproducible across kernel restarts.
torch.manual_seed(0)

embed_dims = 8
# W1 turns a one-hot input into an embedding (one column per vocabulary word);
# W2 turns that embedding into one score per vocabulary word (one row per word).
# Plain tensors with requires_grad=True replace the deprecated Variable wrapper.
W1 = torch.randn(embed_dims, vocab_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocab_size, embed_dims, dtype=torch.float32, requires_grad=True)
num_epochs = 1000
learning_rate = 0.001

# range(num_epochs + 1) so that epoch == num_epochs is also trained and reported.
for epoch in range(num_epochs + 1):
    loss_val = 0
    for data, target in idx_pairs:
        # data is the centre-word id, target the context-word id to predict.
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        # nll_loss expects a (batch, classes) input, hence the view to (1, vocab_size).
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.item()
        loss.backward()

        # Manual SGD step. Updating under no_grad keeps the in-place writes out
        # of the autograd graph without going through the legacy `.data` alias.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            # Gradients accumulate across backward() calls, so clear them.
            W1.grad.zero_()
            W2.grad.zero_()

    if epoch % 100 == 0:
        # Mean loss per training pair over this epoch.
        print(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}')

Loss at epoch 0: 4.865460050957544
Loss at epoch 100: 2.899385786482266
Loss at epoch 200: 2.5387552976608276
Loss at epoch 300: 2.380845404097012
Loss at epoch 400: 2.2875186183622906
Loss at epoch 500: 2.227272304041045
Loss at epoch 600: 2.189772814512253
Loss at epoch 700: 2.164765096136502
Loss at epoch 800: 2.1459397737468993
Loss at epoch 900: 2.13125516474247
Loss at epoch 1000: 2.119752045188631


In [21]:
def similarity(v,u):
    """Return the cosine similarity of two 1-D tensors.

    Args:
        v: first vector.
        u: second vector, same length as v.

    Returns:
        A 0-dim tensor: dot(v, u) divided by the product of both norms.
    """
    inner = v.dot(u)
    norm_product = v.norm() * u.norm()
    return inner / norm_product

In [22]:
# Cosine similarity between the W2 rows of "she" and "king".
similarity(
    W2[word2idx["she"]],
    W2[word2idx["king"]],
)

tensor(0.0512, grad_fn=<DivBackward0>)

In [23]:
# Cosine similarity between the W2 rows of "she" and "queen".
similarity(
    W2[word2idx["she"]],
    W2[word2idx["queen"]],
)

tensor(0.5964, grad_fn=<DivBackward0>)

In [24]:
# Cosine similarity between the W2 rows of "he" and "queen".
similarity(
    W2[word2idx["he"]],
    W2[word2idx["queen"]],
)

tensor(0.2505, grad_fn=<DivBackward0>)

In [26]:
# Cosine similarity between the W2 rows of "he" and "king".
similarity(
    W2[word2idx["he"]],
    W2[word2idx["king"]],
)

tensor(0.5350, grad_fn=<DivBackward0>)