### Word2Vec CBOW

Adapted from: <br>
https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#an-example-n-gram-language-modeling <br>
https://gist.github.com/GavinXing/9954ea846072e115bb07d9758892382c

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

In [2]:
random_seed = 77777

#### Our toy dataset with a few sentences

In [3]:
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

In [4]:
print(raw_text)

['We', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process.', 'Computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers.', 'As', 'they', 'evolve,', 'processes', 'manipulate', 'other', 'abstract', 'things', 'called', 'data.', 'The', 'evolution', 'of', 'a', 'process', 'is', 'directed', 'by', 'a', 'pattern', 'of', 'rules', 'called', 'a', 'program.', 'People', 'create', 'programs', 'to', 'direct', 'processes.', 'In', 'effect,', 'we', 'conjure', 'the', 'spirits', 'of', 'the', 'computer', 'with', 'our', 'spells.']


#### Tokenize the dataset and map each word to a unique index

In [5]:
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

In [6]:
print(str(vocab_size) + " words in the vocabulary")
print()
print(word_to_ix)

49 words in the vocabulary

{'computational': 0, 'Computational': 1, 'the': 2, 'create': 3, 'we': 4, 'The': 5, 'processes': 6, 'abstract': 7, 'pattern': 8, 'process': 9, 'by': 10, 'to': 11, 'As': 12, 'rules': 13, 'is': 14, 'with': 15, 'things': 16, 'are': 17, 'evolve,': 18, 'spirits': 19, 'they': 20, 'study': 21, 'about': 22, 'process.': 23, 'beings': 24, 'idea': 25, 'computer': 26, 'conjure': 27, 'programs': 28, 'We': 29, 'In': 30, 'other': 31, 'directed': 32, 'a': 33, 'program.': 34, 'inhabit': 35, 'manipulate': 36, 'effect,': 37, 'People': 38, 'evolution': 39, 'of': 40, 'called': 41, 'that': 42, 'data.': 43, 'processes.': 44, 'direct': 45, 'spells.': 46, 'computers.': 47, 'our': 48}


#### Create the context - target observations to input into model

In [7]:
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


#### Helper function to transform input observations into vectors with corresponding vocabulary indexes

In [8]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.LongTensor(idxs)

In [9]:
# First observation
data[0][0]

['We', 'are', 'to', 'study']

In [10]:
make_context_vector(data[0][0], word_to_ix)

tensor([ 29,  17,  11,  21])

#### Setup the model

In [11]:
torch.manual_seed(random_seed)

<torch._C.Generator at 0x1103f78b0>

In [12]:
class CBOW(nn.Module):

    def __init__(self, context_size, embedding_size, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def embed(self, inputs):
        embedded = self.embeddings(inputs)
        return embedded
                
    def forward(self, inputs):        
        embeds_sum = self.embed(inputs).sum(dim=0)
        out = self.linear1(embeds_sum)
        return out

In [13]:
learning_rate = 0.01
num_epochs = 500

vocab_size=vocab_size
context_size=2
embedding_size=10

#### Loss function, model, optimizer

In [14]:
loss_func = nn.CrossEntropyLoss()
model = CBOW(context_size=context_size, embedding_size=embedding_size, vocab_size=vocab_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

#### Train the model

In [15]:
for epoch in range(num_epochs+1):
    total_loss = 0
    for context, target in data:
        context_var = make_context_vector(context, word_to_ix)
        
        model.zero_grad()
        probs = model(context_var).view(1,-1)
        loss = loss_func(probs, torch.LongTensor([word_to_ix[target]]))
        loss.backward()
        optimizer.step()

        total_loss += loss.data
    
    if epoch % 50 == 0:
        print('Train Epoch: {} - Loss: {:.6f}'.format(
                    epoch, total_loss.item() ))

Train Epoch: 0 - Loss: 260.169891
Train Epoch: 50 - Loss: 21.289560
Train Epoch: 100 - Loss: 6.683471
Train Epoch: 150 - Loss: 3.555195
Train Epoch: 200 - Loss: 2.336178
Train Epoch: 250 - Loss: 1.710035
Train Epoch: 300 - Loss: 1.335281
Train Epoch: 350 - Loss: 1.088280
Train Epoch: 400 - Loss: 0.914324
Train Epoch: 450 - Loss: 0.785756
Train Epoch: 500 - Loss: 0.687183


#### Inspect a few examples

In [16]:
def get_prediction(data, i):
    
    v = make_context_vector(data[i][0], word_to_ix)
    #print(v)
    
    output=model(v)    
    #print(output)
    
    _, predicted = torch.max(output, 0)
    
    print("Context: "+str(data[i][0]))
    print("Output for the word with highest likelihood: "+str(_.item()))
    print("Predicted word: "+str(ix_to_word[predicted.item()]))
    print("True word: "+str(data[i][1]))
    print()
    
    return

In [17]:
for i in range(4):
    get_prediction(data, i)

Context: ['We', 'are', 'to', 'study']
Output for the word with highest likelihood: 12.355534553527832
Predicted word: about
True word: about

Context: ['are', 'about', 'study', 'the']
Output for the word with highest likelihood: 13.095701217651367
Predicted word: to
True word: to

Context: ['about', 'to', 'the', 'idea']
Output for the word with highest likelihood: 12.992199897766113
Predicted word: study
True word: study

Context: ['to', 'study', 'idea', 'of']
Output for the word with highest likelihood: 13.817288398742676
Predicted word: the
True word: the



#### Check embedding vectors for the first observation

In [18]:
data[0][0]

['We', 'are', 'to', 'study']

In [177]:
vector = make_context_vector(data[0][0], word_to_ix)
embedded_vector = model.embed(vector).data.numpy()
pd.DataFrame(embedded_vector, index=data[0][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
We,0.126473,-0.678771,0.486892,-1.115773,1.58951,-0.743425,1.538858,1.566961,0.499367,-0.511873
are,-1.034676,-0.252601,-1.139089,-0.681244,0.09622,-0.094143,1.211157,2.939287,-0.892874,0.790775
to,-2.353026,0.742168,-0.135938,1.189165,0.391911,0.499691,1.386475,-1.27311,-0.858032,0.206108
study,0.415411,-1.001889,0.987945,0.937073,0.71247,0.521906,-1.27296,0.491454,-0.371379,-1.117039
