# <center>Word2Vec Tutorial</center>
<center>Kelas Pengolahan Bahasa Manusia 2017/2018</center>
<center>Lintang Adyuta Sutawika</center>

In [1]:
import os
import sys
import itertools
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.autograd import Variable
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm_notebook as tqdm

In [2]:
class Dictionary(object):
    """Two-way vocabulary: word -> integer id and integer id -> word.

    Ids are assigned in order of first appearance, starting at 0.
    """

    def __init__(self):
        # Forward lookup (word -> id) and its inverse (list indexed by id).
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        """Register ``word`` if it is new and return its id."""
        known_id = self.word2idx.get(word)
        if known_id is not None:
            return known_id
        # The next free id equals the current vocabulary size.
        fresh_id = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = fresh_id
        return fresh_id

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.idx2word)

class Corpus(object):
    """Load a text corpus from disk and encode it as a sequence of word ids.

    Attributes set by the constructor:
        path: directory that contains ``ptb_dataset.txt``.
        dictionary: ``Dictionary`` filled with every token found in the file.
        dataset: 1-D ``torch.LongTensor`` holding the word ids in file order.
    """

    def __init__(self, path='dataset'):
        self.path = path
        self.dictionary = Dictionary()
        self.dataset = self.tokenize(os.path.join(path, 'ptb_dataset.txt'))

    def tokenize(self, path):
        """Tokenize a text file and return its content as word ids.

        Tokens are maximal runs of non-whitespace characters. Every token is
        registered in ``self.dictionary`` as it is encountered, so ids follow
        first-appearance order.

        Args:
            path: path of the text file to read.

        Returns:
            A 1-D ``torch.LongTensor`` with one id per token (empty when the
            file contains no tokens).

        Raises:
            OSError: if the file cannot be opened (``FileNotFoundError`` when
                it does not exist).
        """
        self.tokenizer = RegexpTokenizer(r'\S+')

        # A single pass is enough: ``add_word`` both registers unseen words
        # and returns the id, so the file does not need to be read twice.
        ids = []
        with open(path, 'r') as f:
            for line in f:
                for word in self.tokenizer.tokenize(line):
                    ids.append(self.dictionary.add_word(word))

        return torch.LongTensor(ids)

In [3]:
# Load the corpus once and keep short aliases to both vocabulary lookups.
corpus = Corpus()
word2idx, idx2word = corpus.dictionary.word2idx, corpus.dictionary.idx2word

In [4]:
# Prepare (context, target) pairs for training.
# Every target word is paired with the (num_context - 1) words directly
# before it and the (num_context - 1) words directly after it, so each
# context holds 2 * num_context - 2 word ids.
data = []
num_context = 10
for i in range(num_context, len(corpus.dataset) - num_context):
    # The slice end is exclusive: stopping at ``i`` keeps the word right
    # before the target (the previous ``i - 1`` bound silently dropped it).
    past_context = corpus.dataset[i - num_context + 1:i]
    future_context = corpus.dataset[i + 1:i + num_context]
    target = corpus.dataset[i]
    context = torch.cat((past_context, future_context))
    data.append((context, target))

In [5]:
# Group the training pairs into fixed-size batches.
batch_size = 1000
# Only complete batches are built; a trailing partial batch is dropped.
# The count must be derived from ``batch_size`` so both stay in sync.
nbatch = len(data) // batch_size
data_batch = []
for i in range(nbatch):
    _context, _target = zip(*data[i * batch_size:(i + 1) * batch_size])
    _context = list(_context)
    _target = list(_target)
    # Convert to the form PyTorch understands: one stacked context tensor
    # per batch plus a LongTensor of the matching targets.
    data_batch.append((torch.stack(_context), torch.LongTensor(_target)))

<img src="Images/Dot_Product.png">

In [6]:
def make_context_vector(context, word2idx):
    """Map each word in ``context`` to its id and wrap the ids for the model.

    Args:
        context: iterable of words.
        word2idx: mapping from word to integer id; unknown words raise
            ``KeyError``.

    Returns:
        A ``Variable`` wrapping a ``torch.LongTensor`` of the ids, in order.
    """
    word_ids = []
    for token in context:
        word_ids.append(word2idx[token])
    return Variable(torch.LongTensor(word_ids))

def get_index_of_max(input):
    """Return the position of the largest element of ``input``.

    Ties resolve to the earliest position. Sequences with fewer than two
    elements (including empty ones) yield 0.
    """
    size = len(input)
    if size < 2:
        return 0

    best_position = 0
    best_value = input[0]
    for candidate in range(1, size):
        value = input[candidate]
        # Strict comparison keeps the first occurrence on ties.
        if value > best_value:
            best_position, best_value = candidate, value
    return best_position

def get_max_prob_result(input, idx2word):
    """Return the word whose score in ``input`` is the highest.

    ``idx2word`` is indexed with the position returned by
    ``get_index_of_max(input)``.
    """
    winning_index = get_index_of_max(input)
    return idx2word[winning_index]

def get_similarity(w1, w2, model):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        w1: index tensor holding the id of the first word.
        w2: index tensor holding the id of the second word.
        model: object exposing ``word_embedding(indices)``.

    Returns:
        The cosine similarity as a NumPy scalar.
    """
    # The embedding lookup of a one-element index tensor has shape (1, dim);
    # flatten it because ``dot`` only accepts 1-D operands.
    e1 = model.word_embedding(w1).contiguous().view(-1)
    e2 = model.word_embedding(w2).contiguous().view(-1)
    cosine = e1.dot(e2) / (torch.norm(e1) * torch.norm(e2))
    # ``cosine`` may be 0-d or 1-element depending on the PyTorch version;
    # flattening before indexing handles both.
    return cosine.data.numpy().reshape(-1)[0]

In [7]:
# Sanity check: inspect the type returned by slicing the encoded corpus.
type(corpus.dataset[0:2])

torch.LongTensor

<img src="Images/CBOW_Skip-gram.png" style="width: 800px;">

In [8]:
# Continuous Bag-of-Words implementation
class cbow(nn.Module):
    """Predict a target word from the ids of its surrounding context words.

    The context embeddings are concatenated, passed through one hidden layer
    of 128 ReLU units, and projected to log-probabilities over the vocabulary.

    Args:
        vocabulary_size: number of distinct words (rows of the embedding
            table and size of the output layer).
        embedding_dim: size of each word embedding.
        context_size: number of context word ids per example. Defaults to 18,
            the value the network was originally hard-wired for.
    """

    def __init__(self, vocabulary_size, embedding_dim, context_size=10 * 2 - 2):
        super(cbow, self).__init__()

        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim * context_size, 128)
        self.linear2 = nn.Linear(128, vocabulary_size)

    def forward(self, inputs):
        """Return log-probabilities of shape ``(batch, vocabulary_size)``.

        Args:
            inputs: tensor of word ids with shape ``(batch, context_size)``.
        """
        # Projection layer: the context word embeddings are concatenated into
        # one row per example. Use the module's own weights and the actual
        # batch size rather than notebook globals and a fixed 1000.
        embeds = self.embedding(inputs).view(inputs.size(0), -1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # Normalise over the vocabulary axis explicitly.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

    def word_embedding(self, inputs):
        """Return the embedding vectors for the given word ids."""
        return self.embedding(inputs)

In [None]:
# Train the CBOW model on the prepared batches.
corpus = Corpus()
model = cbow(len(corpus.dictionary), 300)
num_epoch = 100
loss_function = nn.NLLLoss()

# Build the optimizer once: Adagrad accumulates per-parameter statistics
# across steps, which are lost if it is recreated for every batch.
lr = 0.001
optimizer = optim.Adagrad(model.parameters(), lr=lr)

for epoch in range(num_epoch):
    total_loss = 0

    # Step decay: divide the learning rate by 10 every 25 epochs. This is
    # done once per epoch (not per batch) and skips epoch 0 so training
    # starts at the configured rate.
    if epoch > 0 and epoch % 25 == 0:
        lr = lr / 10.0
        for group in optimizer.param_groups:
            group['lr'] = lr

    pbar = tqdm(data_batch, total=len(data_batch))
    for batch in pbar:
        context, target = batch
        input_context = Variable(context)
        output_target = Variable(target)

        # Gradients accumulate across backward() calls unless cleared.
        optimizer.zero_grad()
        log_prob = model(input_context)

        running_loss = loss_function(log_prob, output_target)
        running_loss.backward()
        optimizer.step()

        pbar.set_description("{}".format(running_loss.data))
        total_loss += running_loss.data

  from ipykernel import kernelapp as app


In [None]:
# Debug view: embed the most recently bound ``input_context`` and reshape the
# result to 1000 rows (the hard-coded 1000 presumably mirrors the batch size
# used above -- confirm before reusing with other batch sizes).
model.embedding(input_context).view(1000,-1)

In [16]:
# Compare the learned embeddings of two country names.
france = Variable(torch.LongTensor([word2idx['france']]))
germany = Variable(torch.LongTensor([word2idx['germany']]))
# Look up the raw embedding vector through the model (``word_embedding`` is a
# method of the model, not a free function).
france_embedding = model.word_embedding(france)
# Keep the similarity as the last expression so the notebook displays it.
get_similarity(france, germany, model)

0.09736606

Paper: Mikolov et al., "Efficient Estimation of Word Representations in Vector Space" (2013) — https://arxiv.org/pdf/1301.3781.pdf

<img src="Images/Word_pair_relationships.png" style="width: 1000px;"/>

<img src="Images/training_data.png">

# Referensi
1) http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
