In [1]:
## using pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
## Skip-Gram 모델 정의
import torch
import torch.nn as nn
import torch.nn.functional as F

embedding_dim = 128
embedding_max_norm = 1   # Embedding Layer의 weight가 너무 커지지 않도록 제한

class SkipGram(nn.Module):
    def __init__(self, embedding_dim, vocab_size: int, context_size=4):
        super(SkipGram, self).__init__()
        self.context_size = context_size
        self.vocab_size = vocab_size

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.layer = nn.Linear(embedding_dim, vocab_size*context_size)

    def forward(self, input):
        x = self.embeddings(input)                                      
        x = self.layer(x)                                              
        output = F.softmax(x, dim=-1)
        output = output.view(self.context_size, self.vocab_size)        
        return x

In [3]:
example_sentence = "In the case of CBOW, one word is eliminated, and the word is predicted from surrounding words. Therefore, it takes multiple input vectors as inputs to the model and creates one output vector. In contrast, Skip-Gram learns by removing all words except one word and predicting the surrounding words in the context through one word.  So, it takes a vector as input and produces multiple output vectors. CBOW and Skip-Gram are different." 
#-----------------------------------------------------------------------
## Data Preprocessing

# Tokenization (just split)
tokenized_sentence = example_sentence.split()

# Make Vocabulary
vocab = set(tokenized_sentence)    # 중복제거
#-----------------------------------------------------------------------
## Make Dictionary _ word2index{word: index}, index2word{index: word}

word2index = {word:index for index, word in enumerate(vocab)}
index2word = {index:word for index, word in enumerate(vocab)}
#-----------------------------------------------------------------------

In [4]:
vocab

{'CBOW',
 'CBOW,',
 'In',
 'Skip-Gram',
 'So,',
 'Therefore,',
 'a',
 'all',
 'and',
 'are',
 'as',
 'by',
 'case',
 'context',
 'contrast,',
 'creates',
 'different.',
 'eliminated,',
 'except',
 'from',
 'in',
 'input',
 'inputs',
 'is',
 'it',
 'learns',
 'model',
 'multiple',
 'of',
 'one',
 'output',
 'predicted',
 'predicting',
 'produces',
 'removing',
 'surrounding',
 'takes',
 'the',
 'through',
 'to',
 'vector',
 'vector.',
 'vectors',
 'vectors.',
 'word',
 'word.',
 'words',
 'words.'}

In [5]:
def make_data(sentence):
  data = []
  for i in range(2, len(tokenized_sentence)-2):
    input = tokenized_sentence[i]
    target = [tokenized_sentence[i-2],tokenized_sentence[i-1],
              tokenized_sentence[i+1],tokenized_sentence[i+2]]
    data.append((input, target))
  return data    # input, target pair

data = make_data(example_sentence.split())
data[:10]

[('case', ['In', 'the', 'of', 'CBOW,']),
 ('of', ['the', 'case', 'CBOW,', 'one']),
 ('CBOW,', ['case', 'of', 'one', 'word']),
 ('one', ['of', 'CBOW,', 'word', 'is']),
 ('word', ['CBOW,', 'one', 'is', 'eliminated,']),
 ('is', ['one', 'word', 'eliminated,', 'and']),
 ('eliminated,', ['word', 'is', 'and', 'the']),
 ('and', ['is', 'eliminated,', 'the', 'word']),
 ('the', ['eliminated,', 'and', 'word', 'is']),
 ('word', ['and', 'the', 'is', 'predicted'])]

In [6]:
# Convert Context to Index Vector

def make_index_vector(context, word_to_ix):
  idxs = [word_to_ix[w] for w in context]
  return torch.tensor(idxs)

make_index_vector(['CBOW'], word2index)

tensor([22])

In [7]:
model = SkipGram(embedding_dim, len(vocab))
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

EPOCHS=300
for epoch in range(EPOCHS):
    total_loss = 0
    for input, target in data:
      input_ = make_index_vector([input], word2index) 
      output = model(input_)
      total_loss += loss_function(output, make_index_vector(target, word2index))
    if epoch % 50 == 0:
      print('epoch = ',epoch, ', loss = ',total_loss)
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

ValueError: Expected input batch_size (1) to match target batch_size (4).

In [None]:
test_data = 'Skip-Gram'

test_vector = make_index_vector([test_data], word2index)
result = model(test_vector)
print('Prediction : ', [index2word[torch.argmax(r).item()] for r in result])
