# Naive Implementation of Skip-Gram Language Model

Alfan F. Wicaksono (Fasilkom UI)

Di tutorial ini, kita akan belajar membangun model Skip-Gram dari Word2Vec "from scratch". Implementasi yang dilakukan adalah versi naive, di mana output layer masih berupa softmax layer biasa yang berukuran sebesar ukuran vocabulary $|V|$. Pada kenyataannya, arsitektur ini akan sangat lambat jika $|V|$ sangat besar. Implementasi yang jauh lebih efisien adalah dengan Hierarchical Softmax atau dengan menggunakan NCE (Noise-Contrastive Estimation) loss function.

Walaupun implementasi pada tutorial ini adalah versi naive, tutorial ini memberikan gambaran detail tentang bagaimana Skip-Gram dari Word2Vec bekerja (you will look inside the black box). Jika Anda memahami tutorial ini, Anda juga akan bisa memahami bagaimana model-model lain yang masih satu family, seperti model Continuous Bag-of-Words dan model FastText, bekerja. Selain itu, tutorial ini memberikan fondasi untuk memahami language model yang lebih state-of-the-art seperti ELMo, bahkan BERT dan segala variannya.

In [1]:
import torch
import torch.nn.functional as fun
import numpy as np

In [2]:
# Toy training corpus: four whitespace-separated sentences. "tidak" and "gk"
# occur between the same neighbouring words ("saya _ makan", "kamu _ pergi").
corpus = ["saya tidak makan",
          "kamu tidak pergi",
          "saya gk makan",
          "kamu gk pergi"]

In [3]:
def create_vocab(corpus):
    """Build the two vocabulary lookup tables for a corpus.

    Each sentence is split on whitespace. A word receives the next free
    integer id the first time it is seen, so ids start at 0 and follow
    first-occurrence order.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A ``(word_id, id_word)`` tuple: ``word_id`` maps word -> id and
        ``id_word`` is the inverse mapping id -> word.
    """
    word_id = {}
    for sentence in corpus:
        for token in sentence.split():
            # setdefault leaves known tokens untouched; for a new token the
            # current table size is exactly the next unused id.
            word_id.setdefault(token, len(word_id))
    id_word = {index: token for token, index in word_id.items()}
    return word_id, id_word

# Build both lookup tables from the toy corpus and show them.
word_id, id_word = create_vocab(corpus)
for mapping in (word_id, id_word):
    print(mapping)

{'saya': 0, 'tidak': 1, 'makan': 2, 'kamu': 3, 'pergi': 4, 'gk': 5}
{0: 'saya', 1: 'tidak', 2: 'makan', 3: 'kamu', 4: 'pergi', 5: 'gk'}


In [4]:
# dataset generator
def iter_dataset(corpus, context_window=1):
    """Yield ``(center_word, context_word)`` skip-gram pairs from a corpus.

    Each sentence is split on whitespace. For every word position, the words
    up to ``context_window`` positions to the left are yielded first (nearest
    neighbour first), followed by the words up to ``context_window`` positions
    to the right (nearest neighbour first). Windows never cross sentence
    boundaries.

    Args:
        corpus: iterable of sentence strings.
        context_window: maximum distance between a center word and a context
            word.

    Yields:
        Tuples ``(center_word, context_word)`` of strings.
    """
    for sentence in corpus:
        tokens = sentence.split()
        for pos, center_word in enumerate(tokens):
            # Left context: walk outwards from the closest neighbour and stop
            # once the window is exceeded (the range itself stops at index 0).
            for left in range(pos - 1, -1, -1):
                if left < pos - context_window:
                    break
                yield (center_word, tokens[left])
            # Right context: same idea, bounded by the end of the sentence.
            for right in range(pos + 1, len(tokens)):
                if right > pos + context_window:
                    break
                yield (center_word, tokens[right])

# Show every (center, context) pair produced with the default window of 1.
for pair in iter_dataset(corpus):
    print(*pair)

saya tidak
tidak saya
tidak makan
makan tidak
kamu tidak
tidak kamu
tidak pergi
pergi tidak
saya gk
gk saya
gk makan
makan gk
kamu gk
gk kamu
gk pergi
pergi gk


In [5]:
# Build the dataset: integer ids of the center words (X) and of the matching
# context words (Y), in the order the generator produces the pairs.
X = [word_id[center] for center, _ in iter_dataset(corpus)]
Y = [word_id[context] for _, context in iter_dataset(corpus)]

# Index tensors used as model input (centers) and target (contexts).
X_ctr = torch.tensor(X)
Y_ctx = torch.tensor(Y)

print(X_ctr)
print(Y_ctx)

tensor([0, 1, 1, 2, 3, 1, 1, 4, 0, 5, 5, 2, 3, 5, 5, 4])
tensor([1, 0, 2, 1, 1, 3, 4, 1, 5, 0, 2, 5, 5, 3, 4, 5])


In [6]:
# Number of distinct words; this is also the width of every one-hot vector.
vocab_size = len(word_id)


def get_input_tensor(tensor):
    """Return the one-hot encoding of a tensor of word ids.

    Args:
        tensor: integer tensor of word ids.

    Returns:
        The result of ``one_hot`` with ``vocab_size`` classes, i.e. the input
        with one extra trailing dimension of size ``vocab_size``.
    """
    encoded = fun.one_hot(tensor, num_classes=vocab_size)
    return encoded


get_input_tensor(X_ctr)

tensor([[1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0]])

In [7]:
EMBEDDING_DIMS = 5

# Half-width of the interval the initial weights are spread over.
initrange = 0.5 / EMBEDDING_DIMS

# Draw the raw values first. Gradient tracking is left off (the default for
# torch.rand) so that the rescaling below is not recorded by autograd.
C = torch.rand(vocab_size, EMBEDDING_DIMS)
W = torch.rand(EMBEDDING_DIMS, vocab_size)

# Map the raw values so the initial parameters lie between -initrange and
# +initrange. Note that this arithmetic creates new tensors and rebinds the
# names; it does not modify the original tensors in place.
C = initrange + -2 * initrange * C
W = initrange + -2 * initrange * W

# Only now switch gradient tracking on, so C and W are trainable leaf tensors.
C.requires_grad = True
W.requires_grad = True

print(f'C shape is: {C.shape}, W shape is: {W.shape}')
print(C)
print(W)


C shape is: torch.Size([6, 5]), W shape is: torch.Size([5, 6])
tensor([[ 0.0211,  0.0985, -0.0455, -0.0255,  0.0694],
        [-0.0975, -0.0400,  0.0332,  0.0531, -0.0291],
        [-0.0093,  0.0155, -0.0791,  0.0351,  0.0700],
        [-0.0768, -0.0659, -0.0999,  0.0133,  0.0429],
        [-0.0937, -0.0536,  0.0462, -0.0990, -0.0470],
        [ 0.0778,  0.0194,  0.0897, -0.0568, -0.0445]], requires_grad=True)
tensor([[-0.0442,  0.0224, -0.0367,  0.0495,  0.0488, -0.0330],
        [-0.0743, -0.0405, -0.0746,  0.0550, -0.0811,  0.0286],
        [-0.0379, -0.0178, -0.0070, -0.0600, -0.0414,  0.0646],
        [ 0.0775,  0.0605, -0.0092,  0.0653, -0.0594, -0.0311],
        [-0.0429,  0.0704,  0.0887, -0.0201,  0.0433, -0.0915]],
       requires_grad=True)


In [8]:
# Softmax
def softmax(x):
    """Compute the softmax of ``x`` along dimension 1.

    The maximum along dimension 1 is subtracted before exponentiating, which
    keeps ``exp`` from overflowing for large inputs without changing the
    result.

    Args:
        x: tensor with at least two dimensions.

    Returns:
        Tensor of the same shape as ``x`` whose entries along dimension 1 are
        the normalised exponentials.
    """
    row_max = x.max(dim=1, keepdim=True).values
    shifted_exp = (x - row_max).exp()
    normaliser = shifted_exp.sum(dim=1, keepdim=True)
    return shifted_exp / normaliser

# Categorical Cross Entropy Loss
def CCEloss(Y_pred, Y_true, eps=1e-12):
    """Mean categorical cross-entropy between predictions and targets.

    Computes ``-(1 / m) * sum(Y_true * log(Y_pred))`` where ``m`` is the size
    of the first dimension of ``Y_pred``.

    Args:
        Y_pred: tensor of predicted probabilities.
        Y_true: tensor of target weights (e.g. one-hot rows), broadcastable
            against ``Y_pred``.
        eps: lower bound applied to ``Y_pred`` before taking the logarithm.

    Returns:
        A 0-dim tensor holding the loss.
    """
    m = Y_pred.size()[0]
    # A probability that underflows to exactly 0 gives log(0) = -inf, and
    # 0 * -inf is NaN, which would poison the whole sum even where the target
    # weight is 0. Clamping keeps every logarithm finite.
    log_probs = torch.log(torch.clamp(Y_pred, min=eps))
    return -(1 / m) * torch.sum(Y_true * log_probs)

In [9]:
EPOCHS = 200
LEARNING_RATE = 0.2
# NOTE(review): LR_DECAY is not referenced by the loop below, so the learning
# rate stays constant for all epochs -- confirm whether decay was intended.
LR_DECAY = 0.99

# The one-hot encodings do not depend on the trainable parameters, so they are
# built once instead of being recomputed in every epoch.
X = get_input_tensor(X_ctr).float()
Y = get_input_tensor(Y_ctx).float()

for i in range(EPOCHS):
    # Forward pass: multiplying the one-hot rows by C selects the embedding of
    # each center word; W maps it to one score per vocabulary entry.
    h = X.mm(C)
    Y_pred = softmax(h.mm(W))

    loss = CCEloss(Y_pred, Y)
    loss.backward()

    # Plain gradient-descent step on C and W. no_grad keeps the in-place
    # updates (and the diagnostics below) out of the autograd graph.
    with torch.no_grad():
        C -= LEARNING_RATE * C.grad
        W -= LEARNING_RATE * W.grad

        # Gradients accumulate across backward() calls, so reset them.
        C.grad.zero_()
        W.grad.zero_()

        if i % 10 == 0:
            # Track how the cosine similarity between the embeddings of
            # "tidak" and "gk" evolves during training.
            vector_tidak = C[word_id["tidak"]]
            vector_gk = C[word_id["gk"]]
            sim = fun.cosine_similarity(vector_tidak, vector_gk, dim=0)
            print(f'Epoch {i}, loss = {loss}, sim(tidak, gk) = {sim}')


Epoch 0, loss = 1.7935898303985596, sim(tidak, gk) = -0.4133906364440918
Epoch 10, loss = 1.7917194366455078, sim(tidak, gk) = -0.4798831641674042
Epoch 20, loss = 1.7898756265640259, sim(tidak, gk) = -0.4558359980583191
Epoch 30, loss = 1.7871145009994507, sim(tidak, gk) = -0.32872533798217773
Epoch 40, loss = 1.7820773124694824, sim(tidak, gk) = -0.09139911830425262
Epoch 50, loss = 1.7723863124847412, sim(tidak, gk) = 0.21741338074207306
Epoch 60, loss = 1.7537850141525269, sim(tidak, gk) = 0.5089050531387329
Epoch 70, loss = 1.719202995300293, sim(tidak, gk) = 0.7173287272453308
Epoch 80, loss = 1.659062385559082, sim(tidak, gk) = 0.8408175706863403
Epoch 90, loss = 1.5662643909454346, sim(tidak, gk) = 0.9070404767990112
Epoch 100, loss = 1.4478002786636353, sim(tidak, gk) = 0.9411842823028564
Epoch 110, loss = 1.3298448324203491, sim(tidak, gk) = 0.9588562846183777
Epoch 120, loss = 1.2377886772155762, sim(tidak, gk) = 0.968408465385437
Epoch 130, loss = 1.1765074729919434, sim(ti

In [10]:
# vector "tidak" dan "gk"
# Detach the learned embeddings of both words from autograd and convert them
# to NumPy arrays for inspection.
vector_tidak, vector_gk = (
    C[word_id[word]].detach().numpy() for word in ("tidak", "gk")
)

print(vector_tidak)
print(vector_gk)

[ 0.5922387  -0.5567056   0.54271483  0.39649314 -0.5429961 ]
[ 0.7301304  -0.474415    0.5729187   0.27293417 -0.53294003]
