### Word2Vec CBOW

Adapted from: <br>
https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html#an-example-n-gram-language-modeling <br>
https://gist.github.com/GavinXing/9954ea846072e115bb07d9758892382c

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F

#### Our toy dataset with a few sentences

In [2]:
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

In [3]:
print(raw_text)

['We', 'are', 'about', 'to', 'study', 'the', 'idea', 'of', 'a', 'computational', 'process.', 'Computational', 'processes', 'are', 'abstract', 'beings', 'that', 'inhabit', 'computers.', 'As', 'they', 'evolve,', 'processes', 'manipulate', 'other', 'abstract', 'things', 'called', 'data.', 'The', 'evolution', 'of', 'a', 'process', 'is', 'directed', 'by', 'a', 'pattern', 'of', 'rules', 'called', 'a', 'program.', 'People', 'create', 'programs', 'to', 'direct', 'processes.', 'In', 'effect,', 'we', 'conjure', 'the', 'spirits', 'of', 'the', 'computer', 'with', 'our', 'spells.']


#### Tokenize the dataset and map each word to a unique index

In [4]:
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for i, word in enumerate(vocab)}

In [5]:
print(str(vocab_size) + " words in the vocabulary")
print()
print(word_to_ix)

49 words in the vocabulary

{'computers.': 0, 'abstract': 1, 'process': 2, 'with': 3, 'of': 4, 'they': 5, 'evolution': 6, 'manipulate': 7, 'evolve,': 8, 'processes': 9, 'create': 10, 'other': 11, 'is': 12, 'In': 13, 'inhabit': 14, 'by': 15, 'directed': 16, 'spirits': 17, 'As': 18, 'We': 19, 'program.': 20, 'conjure': 21, 'pattern': 22, 'study': 23, 'spells.': 24, 'beings': 25, 'about': 26, 'things': 27, 'direct': 28, 'our': 29, 'process.': 30, 'to': 31, 'that': 32, 'processes.': 33, 'The': 34, 'rules': 35, 'Computational': 36, 'idea': 37, 'a': 38, 'programs': 39, 'are': 40, 'data.': 41, 'effect,': 42, 'the': 43, 'computer': 44, 'People': 45, 'called': 46, 'computational': 47, 'we': 48}


#### Create the context - target observations to input into model

In [6]:
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


#### Helper function to transform input observations into vectors with corresponding vocabulary indexes

In [7]:
def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    return torch.LongTensor(idxs)

In [10]:
# First observation
data[0][0]

['We', 'are', 'to', 'study']

In [12]:
make_context_vector(data[0][0], word_to_ix)

tensor([ 19,  40,  31,  23])

#### Setup the model

In [142]:
torch.manual_seed(random_seed)

<torch._C.Generator at 0x107f5ca50>

In [143]:
class CBOW(nn.Module):

    def __init__(self, context_size, embedding_size, vocab_size=None):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear1 = nn.Linear(embedding_size, vocab_size)

    def embed(self, inputs):
        embedded = self.embeddings(inputs)
        return embedded
                
    def forward(self, inputs):        
        embeds_sum = self.embed(inputs).sum(dim=0)
        out = self.linear1(embeds_sum)
        return out

In [144]:
learning_rate = 0.01
num_epochs = 500

vocab_size=vocab_size
context_size=2
embedding_size=10

random_seed = 77777

#### Loss function, model, optimizer

In [145]:
loss_func = nn.CrossEntropyLoss()
model = CBOW(context_size=context_size, embedding_size=embedding_size, vocab_size=vocab_size)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

#### Train the model

In [146]:
for epoch in range(num_epochs+1):
    total_loss = 0
    for context, target in data:
        context_var = make_context_vector(context, word_to_ix)
        
        model.zero_grad()
        probs = model(context_var).view(1,-1)
        loss = loss_func(probs, torch.LongTensor([word_to_ix[target]]))
        loss.backward()
        optimizer.step()

        total_loss += loss.data
        
    print('Train Epoch: {} - Loss: {:.6f}'.format(
                epoch, total_loss.item() ))

Train Epoch: 0 - Loss: 253.784637
Train Epoch: 1 - Loss: 229.940887
Train Epoch: 2 - Loss: 210.646942
Train Epoch: 3 - Loss: 194.869553
Train Epoch: 4 - Loss: 181.499847
Train Epoch: 5 - Loss: 169.792404
Train Epoch: 6 - Loss: 159.379303
Train Epoch: 7 - Loss: 150.042801
Train Epoch: 8 - Loss: 141.616898
Train Epoch: 9 - Loss: 133.963028
Train Epoch: 10 - Loss: 126.963890
Train Epoch: 11 - Loss: 120.521767
Train Epoch: 12 - Loss: 114.556618
Train Epoch: 13 - Loss: 109.004013
Train Epoch: 14 - Loss: 103.812393
Train Epoch: 15 - Loss: 98.940247
Train Epoch: 16 - Loss: 94.353813
Train Epoch: 17 - Loss: 90.025375
Train Epoch: 18 - Loss: 85.931755
Train Epoch: 19 - Loss: 82.053535
Train Epoch: 20 - Loss: 78.374191
Train Epoch: 21 - Loss: 74.879715
Train Epoch: 22 - Loss: 71.558121
Train Epoch: 23 - Loss: 68.399033
Train Epoch: 24 - Loss: 65.393440
Train Epoch: 25 - Loss: 62.533272
Train Epoch: 26 - Loss: 59.811218
Train Epoch: 27 - Loss: 57.220417
Train Epoch: 28 - Loss: 54.754356
Train Epo

Train Epoch: 249 - Loss: 1.610709
Train Epoch: 250 - Loss: 1.602067
Train Epoch: 251 - Loss: 1.593511
Train Epoch: 252 - Loss: 1.585040
Train Epoch: 253 - Loss: 1.576652
Train Epoch: 254 - Loss: 1.568346
Train Epoch: 255 - Loss: 1.560121
Train Epoch: 256 - Loss: 1.551976
Train Epoch: 257 - Loss: 1.543910
Train Epoch: 258 - Loss: 1.535921
Train Epoch: 259 - Loss: 1.528009
Train Epoch: 260 - Loss: 1.520172
Train Epoch: 261 - Loss: 1.512410
Train Epoch: 262 - Loss: 1.504721
Train Epoch: 263 - Loss: 1.497103
Train Epoch: 264 - Loss: 1.489559
Train Epoch: 265 - Loss: 1.482083
Train Epoch: 266 - Loss: 1.474678
Train Epoch: 267 - Loss: 1.467341
Train Epoch: 268 - Loss: 1.460072
Train Epoch: 269 - Loss: 1.452869
Train Epoch: 270 - Loss: 1.445732
Train Epoch: 271 - Loss: 1.438660
Train Epoch: 272 - Loss: 1.431652
Train Epoch: 273 - Loss: 1.424708
Train Epoch: 274 - Loss: 1.417826
Train Epoch: 275 - Loss: 1.411005
Train Epoch: 276 - Loss: 1.404245
Train Epoch: 277 - Loss: 1.397546
Train Epoch: 2

#### Inspect a few examples

In [175]:
def get_prediction(data, i):
    
    v = make_context_vector(data[i][0], word_to_ix)
    #print(v)
    
    output=model(v)    
    #print(output)
    
    _, predicted = torch.max(output, 0)
    
    print("Context: "+str(data[i][0]))
    print("Output for the word with highest likelihood: "+str(_.item()))
    print("Predicted word: "+str(ix_to_word[predicted.item()]))
    print("True word: "+str(data[i][1]))
    print()
    
    return

In [176]:
for i in range(4):
    get_prediction(data, i)

Context: ['We', 'are', 'to', 'study']
Output for the word with highest likelihood: 12.126791000366211
Predicted word: about
True word: about

Context: ['are', 'about', 'study', 'the']
Output for the word with highest likelihood: 12.91408634185791
Predicted word: to
True word: to

Context: ['about', 'to', 'the', 'idea']
Output for the word with highest likelihood: 10.502835273742676
Predicted word: study
True word: study

Context: ['to', 'study', 'idea', 'of']
Output for the word with highest likelihood: 12.92435359954834
Predicted word: the
True word: the



#### Check embedding vectors for the first observation

In [171]:
data[0][0]

['We', 'are', 'to', 'study']

In [177]:
vector = make_context_vector(data[0][0], word_to_ix)
embedded_vector = model.embed(vector).data.numpy()
pd.DataFrame(embedded_vector, index=data[0][0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
We,0.126473,-0.678771,0.486892,-1.115773,1.58951,-0.743425,1.538858,1.566961,0.499367,-0.511873
are,-1.034676,-0.252601,-1.139089,-0.681244,0.09622,-0.094143,1.211157,2.939287,-0.892874,0.790775
to,-2.353026,0.742168,-0.135938,1.189165,0.391911,0.499691,1.386475,-1.27311,-0.858032,0.206108
study,0.415411,-1.001889,0.987945,0.937073,0.71247,0.521906,-1.27296,0.491454,-0.371379,-1.117039
