# Word2vec Algorithm

<img src="image/word2vec.png",width=800,height=800>

In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import numpy as np

In [13]:
import progressbar
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
progressbar.streams.wrap_stderr()

CONTEXT_SIZE = 2
EMBEDDING_DIM = 2
NUM_EPOCHS = 5

In [14]:
def gather_word_freqs(split_text, subsampling = False, sampling_rate = 0.001):
    vocab = {}
    word_to_ix = {}
    total = 0.0
    for word in split_text:
        if word not in vocab:
            vocab[word] = 0
            word_to_ix[word] = len(word_to_ix)
        vocab[word] += 1.0
        total += 1.0
    if subsampling:
        for i, word in enumerate(split_text):
            val = np.sqrt(total / vocab[word])
            prob = val * (1 + val)
            sampling = np.random.sample()
            if (sampling <= prob):
                del [split_text[i]]
                i -= 1
    return split_text, vocab, word_to_ix

In [15]:
def gather_training_data(split_text, word_to_ix, context_size, model_type = "skipgram"):
    training_data = []
    for i, word in enumerate(split_text):
        if (model_type == "skipgram"):
            back_i = i - 1
            back_c = 0
            forw_i = i + 1
            forw_c = 0
            while (back_i >= 0 and back_c < context_size):
                training_data.append(([word_to_ix[word]], word_to_ix[split_text[back_i]]))
                back_i -= 1
                back_c += 1
            while (forw_i < len(split_text) and forw_c < context_size):
                training_data.append(([word_to_ix[word]], word_to_ix[split_text[forw_i]]))
                forw_i += 1
                forw_c += 1
        elif (model_type == "cbow"):
            point = []
            back_i = i - 1
            back_c = 0
            forw_i = i + 1
            forw_c = 0
            while (back_i >= 0 and back_c < context_size):
                point.append(word_to_ix[split_text[back_i]])
                back_i -= 1
                back_c += 1
            while (forw_i < len(split_text) and forw_c < context_size):
                point.append(word_to_ix[split_text[forw_i]])
                forw_i += 1
                forw_c += 1
            training_data.append((point, word_to_ix[word]))
        else:
            raise ValueError("Inappropriate argument value for model_type - either `skipgram` or `cbow`.")
    return training_data

In [16]:
def skipgram(context_size, model_type = "skipgram", subsampling = False, sampling_rate = 0.001):
    
    processed_text = """superonline bir milyon evin kapisina fiber internet goturdu bizde niye 
    yok bize niye getirmiyorlar bir evde olmamasi gereken   şeyturkcell   digiturk keriz anten çiçeği
    en ucuz iphone s fiyatını hangi operatör sunuyor ohasın""".split()
    
    print("processed_text : \n",processed_text,"\n")
    
    new_processed_text, vocab, word_to_ix = gather_word_freqs(processed_text,subsampling = subsampling, sampling_rate = sampling_rate)
    
    
    print("new processed_text : \n",new_processed_text,"\n")
    print("vocab : \n",vocab,"\n")
    print("word_to_ix : \n",word_to_ix,"\n")
    
    training_data = gather_training_data(new_processed_text, word_to_ix, context_size ,model_type = model_type)
    print("training_data : \n",training_data,"\n")
    
    return new_processed_text, vocab, word_to_ix, training_data

In [22]:
def cbow(context_size, model_type = "cbow", subsampling = False, sampling_rate = 0.001):
    
    processed_text = """superonline bir milyon evin kapisina fiber internet goturdu bizde niye 
    yok bize niye getirmiyorlar bir evde olmamasi gereken   şeyturkcell   digiturk keriz anten çiçeği
    en ucuz iphone s fiyatını hangi operatör sunuyor ohasın""".split()
    
    print("processed_text : \n",processed_text,"\n")
    
    new_processed_text, vocab, word_to_ix = gather_word_freqs(processed_text,subsampling = subsampling, sampling_rate = sampling_rate)
    print("new_processed_text : \n",new_processed_text,"\n")
    print("vocab : \n",vocab,"\n")
    print("word_to_ix : \n",word_to_ix,"\n")
    
    training_data = gather_training_data(new_processed_text, word_to_ix, context_size ,model_type = model_type)
    print("training_data : \n",training_data,"\n")
    
    return new_processed_text, vocab, word_to_ix, training_data

In [18]:
#model

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

''' Continuous bag-of-words model for word2vec.

Parameters:
    vocab_size: number of defined words in the vocab
    embedding_dim: desired embedded vector dimension
    context_size: number of context words used

'''
class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = torch.mean(self.embeddings(inputs), dim=0).view((1, -1))
        out = self.linear(embeds)
        log_probs = F.log_softmax(out)
        return log_probs


''' Skip-gram bag-of-words model for word2vec.

Parameters:
    vocab_size: number of defined words in the vocab
    embedding_dim: desired embedded vector dimension

'''
class SkipGram(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(SkipGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = self.linear(embeds)
        log_probs = F.log_softmax(out)
        return log_probs

In [23]:
#skipgram

processed_text, vocab, word_to_ix, training_data = skipgram(CONTEXT_SIZE, model_type="skipgram", subsampling=True, sampling_rate=0.001)

losses = []
loss_function = nn.NLLLoss()  #negative log likelihood loss
model = SkipGram(len(vocab), EMBEDDING_DIM)
print(model)

processed_text : 
 ['superonline', 'bir', 'milyon', 'evin', 'kapisina', 'fiber', 'internet', 'goturdu', 'bizde', 'niye', 'yok', 'bize', 'niye', 'getirmiyorlar', 'bir', 'evde', 'olmamasi', 'gereken', 'şeyturkcell', 'digiturk', 'keriz', 'anten', 'çiçeği', 'en', 'ucuz', 'iphone', 's', 'fiyatını', 'hangi', 'operatör', 'sunuyor', 'ohasın'] 

new processed_text : 
 ['bir', 'evin', 'fiber', 'goturdu', 'niye', 'bize', 'getirmiyorlar', 'evde', 'gereken', 'digiturk', 'anten', 'en', 'iphone', 'fiyatını', 'operatör', 'ohasın'] 

vocab : 
 {'superonline': 1.0, 'bir': 2.0, 'milyon': 1.0, 'evin': 1.0, 'kapisina': 1.0, 'fiber': 1.0, 'internet': 1.0, 'goturdu': 1.0, 'bizde': 1.0, 'niye': 2.0, 'yok': 1.0, 'bize': 1.0, 'getirmiyorlar': 1.0, 'evde': 1.0, 'olmamasi': 1.0, 'gereken': 1.0, 'şeyturkcell': 1.0, 'digiturk': 1.0, 'keriz': 1.0, 'anten': 1.0, 'çiçeği': 1.0, 'en': 1.0, 'ucuz': 1.0, 'iphone': 1.0, 's': 1.0, 'fiyatını': 1.0, 'hangi': 1.0, 'operatör': 1.0, 'sunuyor': 1.0, 'ohasın': 1.0} 

word_to_ix :

In [24]:
#skipgram
optimizer = optim.SGD(model.parameters(), lr=0.001)  #Stochastic gradient descent 

In [21]:
#skipgram
print ("Starting training")
for epoch in range(NUM_EPOCHS):
    total_loss = torch.Tensor([0])
    print ("Beginning epoch %d" % epoch)
    progress_bar = progressbar.ProgressBar()
    for context, target in progress_bar(training_data):
        print("context",context)
        print("target",target)
        print("\n","*"*50,"\n")
        context_var = autograd.Variable(torch.LongTensor(context))
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, autograd.Variable(
            torch.LongTensor([target])))
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    print ("Epoch %d Loss: %.5f" % (epoch, total_loss[0]))
    losses.append(total_loss)

Starting training
Beginning epoch 0
context [1]
target 3

 ************************************************** 

context [1]
target 5

 ************************************************** 

context [3]
target 1

 ************************************************** 

context [3]
target 5

 ************************************************** 

context [3]
target 7

 ************************************************** 

context [5]
target 3

 ************************************************** 

context [5]
target 1

 ************************************************** 

context [5]
target 7

 ************************************************** 

context [5]
target 9

 ************************************************** 

context [7]
target 5

 ************************************************** 

context [7]
target 3

 ************************************************** 

context [7]
target 9

 ************************************************** 

context [7]
target 11

 ****************************

target 25

 ************************************************** 

context [23]
target 27

 ************************************************** 

context [25]
target 23

 ************************************************** 

context [25]
target 21

 ************************************************** 

context [25]
target 27

 ************************************************** 

context [25]
target 29

 ************************************************** 

context [27]
target 25

 ************************************************** 

context [27]
target 23

 ************************************************** 

context [27]
target 29

 ************************************************** 

context [29]
target 27

 ************************************************** 

context [29]
target 25

 ************************************************** 

Epoch 2 Loss: 213.60249
Beginning epoch 3
context [1]
target 3

 ************************************************** 

context [1]
target 5

 **************

In [25]:
#cbow
processed_text, vocab, word_to_ix, training_data = cbow(CONTEXT_SIZE, model_type="cbow", subsampling=True, sampling_rate=0.001)

losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
print(model)

processed_text : 
 ['superonline', 'bir', 'milyon', 'evin', 'kapisina', 'fiber', 'internet', 'goturdu', 'bizde', 'niye', 'yok', 'bize', 'niye', 'getirmiyorlar', 'bir', 'evde', 'olmamasi', 'gereken', 'şeyturkcell', 'digiturk', 'keriz', 'anten', 'çiçeği', 'en', 'ucuz', 'iphone', 's', 'fiyatını', 'hangi', 'operatör', 'sunuyor', 'ohasın'] 

new_processed_text : 
 ['bir', 'evin', 'fiber', 'goturdu', 'niye', 'bize', 'getirmiyorlar', 'evde', 'gereken', 'digiturk', 'anten', 'en', 'iphone', 'fiyatını', 'operatör', 'ohasın'] 

vocab : 
 {'superonline': 1.0, 'bir': 2.0, 'milyon': 1.0, 'evin': 1.0, 'kapisina': 1.0, 'fiber': 1.0, 'internet': 1.0, 'goturdu': 1.0, 'bizde': 1.0, 'niye': 2.0, 'yok': 1.0, 'bize': 1.0, 'getirmiyorlar': 1.0, 'evde': 1.0, 'olmamasi': 1.0, 'gereken': 1.0, 'şeyturkcell': 1.0, 'digiturk': 1.0, 'keriz': 1.0, 'anten': 1.0, 'çiçeği': 1.0, 'en': 1.0, 'ucuz': 1.0, 'iphone': 1.0, 's': 1.0, 'fiyatını': 1.0, 'hangi': 1.0, 'operatör': 1.0, 'sunuyor': 1.0, 'ohasın': 1.0} 

word_to_ix :

In [26]:
#cbow
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [27]:
#cbow
print ("Starting training")
for epoch in range(NUM_EPOCHS):
    total_loss = torch.Tensor([0])
    print ("Beginning epoch %d" % epoch)
    progress_bar = progressbar.ProgressBar()
    for context, target in progress_bar(training_data):
        context_var = autograd.Variable(torch.LongTensor(context))
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, autograd.Variable(
            torch.LongTensor([target])))
        loss.backward()
        optimizer.step()
        total_loss += loss.data
    print ("Epoch %d Loss: %.5f" % (epoch, total_loss[0]))
    losses.append(total_loss)

Starting training
Beginning epoch 0
Epoch 0 Loss: 57.29324
Beginning epoch 1
Epoch 1 Loss: 57.27409
Beginning epoch 2
Epoch 2 Loss: 57.25497
Beginning epoch 3
Epoch 3 Loss: 57.23587
Beginning epoch 4
Epoch 4 Loss: 57.21682
