In [8]:
#  Word2vec
#  Skip-gram model (target:context=1:1 model, no negative sampling)
#  "he is a ..."
#     target: is
#        => context:[he, a, <next word>] (window_size=2: up to 2 words on each side of the target)

import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

# Toy training corpus: one whitespace-separated, lower-case sentence per entry.
corpus = [
    "he is a king",
    "she is a queen",
    "he is a man",
    "she is a woman",
    "warsaw is poland capital",
    "berlin is germany capital",
    "paris is france capital",
]

def tokenize_corpus(corpus):
    """Split every sentence in ``corpus`` into whitespace-separated tokens.

    Args:
        corpus: iterable of sentences; each one is split with ``.split()``.

    Returns:
        A list with one token list per sentence, in the input order.
    """
    tokenized = []
    for sentence in corpus:
        tokenized.append(sentence.split())
    return tokenized

tokenized_corpus = tokenize_corpus(corpus)

# Build the vocabulary in first-occurrence order together with both lookup
# tables. Membership is tested against the `word2id` dict rather than the
# `vocabulary` list, so each token costs O(1) instead of a linear list scan.
vocabulary = []
word2id = {}
for sentence in tokenized_corpus:
    for word in sentence:
        if word not in word2id:
            # The next free id is the current vocabulary length.
            word2id[word] = len(vocabulary)
            vocabulary.append(word)
vocabulary_size = len(vocabulary)
# Inverse mapping: integer id -> word.
id2word = dict(enumerate(vocabulary))



In [5]:
window_size = 2

# Build every (target, context) id pair: each word is paired with each
# neighbour that lies at most `window_size` positions to its left or right
# inside the same sentence. The word itself (offset 0) is skipped.
idx_pairs = []
for sentence in tokenized_corpus:
    sentence_ids = [word2id[token] for token in sentence]
    last_pos = len(sentence_ids) - 1
    for center_pos, center_id in enumerate(sentence_ids):
        for offset in range(-window_size, window_size + 1):
            context_pos = center_pos + offset
            # Keep only real neighbours that fall inside the sentence.
            if offset != 0 and 0 <= context_pos <= last_pos:
                idx_pairs.append((center_id, sentence_ids[context_pos]))

np_idx_pairs = np.array(idx_pairs)

# Preview the first ten pairs as words.
print("(target,context) pairs:\n")
for target_id, context_id in np_idx_pairs[:10]:
    print('(%s, %s)' % (id2word[target_id], id2word[context_id]))



(target,context) pairs:

(he, is)
(he, a)
(is, he)
(is, a)
(is, king)
(a, he)
(a, is)
(a, king)
(king, is)
(king, a)


In [6]:
def input_layer(word_idx, vocab_size=None):
    """Return the one-hot float32 vector for a word id.

    Args:
        word_idx: index of the entry to set to 1.
        vocab_size: length of the returned vector. When omitted, the
            module-level ``vocabulary_size`` is used, which matches the
            original one-argument behaviour.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``vocab_size`` that is zero
        everywhere except at ``word_idx``.
    """
    if vocab_size is None:
        # Fall back to the notebook-level vocabulary for existing callers.
        vocab_size = vocabulary_size
    x = torch.zeros(vocab_size, dtype=torch.float32)
    x[word_idx] = 1
    return x

embedding_dims = 5
# Trainable weights, created directly as leaf tensors with requires_grad=True
# (the torch.autograd.Variable wrapper is deprecated and no longer needed).
# W1 has shape (embedding_dims, vocabulary_size) and W2 has shape
# (vocabulary_size, embedding_dims). No random seed is set in the visible
# cells, so the initial values differ from run to run.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)



In [7]:
num_epochs = 101
learning_rate = 0.001

# Plain per-sample SGD over every pair in `idx_pairs`: one forward pass,
# one backward pass and one manual weight update per pair.
for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        # `data` is the input word id (one-hot encoded below); `target` is the
        # paired word id, used as the class label.
        x = input_layer(data)
        y_true = torch.tensor([target], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        # Log-probabilities over the vocabulary; nll_loss expects a
        # (batch, classes) input, hence the view to a single-row batch.
        log_softmax = F.log_softmax(z2, dim=0)
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        # Accumulate the scalar loss for the per-epoch average printed below.
        loss_val += loss.item()
        loss.backward()

        # Update the weights outside of autograd tracking, then clear the
        # gradients so they do not accumulate into the next sample.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()

    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')


Loss at epo 0: 5.229343138422284
Loss at epo 10: 4.444653618335724
Loss at epo 20: 3.9810801489012584
Loss at epo 30: 3.6753370591572354
Loss at epo 40: 3.4541127596582686
Loss at epo 50: 3.286831678662981
Loss at epo 60: 3.1577084388051713
Loss at epo 70: 3.0568619574819293
Loss at epo 80: 2.9771582092557636
Loss at epo 90: 2.913068832669939
Loss at epo 100: 2.860338951860155
