In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.nn.functional import cosine_similarity
from early_stopping_pytorch.pytorchtools import EarlyStopping
import numpy as np
torch.manual_seed(1)

<torch._C.Generator at 0x1f302658230>

# Generate raw corpus for various models

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# vocab set and vocab size
vocab = set(raw_text)
vocab_size = len(vocab)

# construct dictionary to lookup 
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
# construct training data: (context, target) pair
raw_data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    raw_data.append((context, target))
print(raw_data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [3]:
context, target = raw_data[0]
context
[word_to_ix[word] for word in context]

[48, 13, 30, 11]

# 1.  CBOW

## CBOW data loader

In [42]:
class cbow_dataset(Dataset):
    def __init__(self, raw_dataset, transform=None):
        # raw_dataset is a list of (context, target) pair
        self.dataset = raw_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        context, target = self.dataset[idx]
        return {"data":torch.tensor([word_to_ix[word] for word in context]), "target":torch.tensor(word_to_ix[target])}

In [43]:
dataset = cbow_dataset(raw_data)
dataloader = DataLoader(dataset,batch_size=4)

## CBOW model

In [5]:
class CBOW(nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        # parameter of shape (vocab_size, 3)
        self.embedding = nn.Embedding(vocab_size, 3)
        # matrix of shape (3, vocab_size)
        self.linear = nn.Linear(3, vocab_size, bias=False)
    def forward(self, x):
        # for batch this would be (B, 3)
        context_embed = self.embedding(x).sum(1)
        x = self.linear(context_embed)
        return x

In [6]:
model = CBOW()

In [7]:
criterion = nn.CrossEntropyLoss()

In [8]:
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

In [44]:
def train(model, dataloader, epochs=1, early_stopping=False, retrain=False):
    # if retraining
    if retrain:
        for parameter in model.parameters():
            nn.init.normal_(parameter)
    else:
        pass
    
    # if early stopping
    if early_stopping:
        early_stopper=EarlyStopping()
    else:
        pass
    
    for epoch in range(epochs):
        average_loss = 0
        losses = []
        for i, data in enumerate(dataloader, 0):
            context = data["data"]
            target = data["target"]
            optimizer.zero_grad()
            outputs = model(context)
            loss = criterion(outputs, target)
            loss.backward()
            optimizer.step()
            losses.append(loss.detach().numpy())
        # average loss of this epoch
        average_loss = np.average(losses)
        print("average loss of epoch", epoch, ":", average_loss)
        
        early_stopper(average_loss, model)
        if early_stopper.early_stop:
            print("early stopping")
            break
    
    model.load_state_dict(torch.load('checkpoint.pt'))

    return model

In [45]:
model = train(model, dataloader,epochs=500, early_stopping=True)

average loss of epoch 0 : 0.0071551003
average loss of epoch 1 : 0.0071500777
average loss of epoch 2 : 0.0071450234
average loss of epoch 3 : 0.007140128
average loss of epoch 4 : 0.007135423
average loss of epoch 5 : 0.0071307183
average loss of epoch 6 : 0.0071251234
average loss of epoch 7 : 0.007120228
average loss of epoch 8 : 0.0071159047
average loss of epoch 9 : 0.007110564
average loss of epoch 10 : 0.007105732
average loss of epoch 11 : 0.007100741
average loss of epoch 12 : 0.0070961635
average loss of epoch 13 : 0.007091268
average loss of epoch 14 : 0.007086118
average loss of epoch 15 : 0.0070815403
average loss of epoch 16 : 0.0070766765
average loss of epoch 17 : 0.007071972
average loss of epoch 18 : 0.007066695
average loss of epoch 19 : 0.0070616724
average loss of epoch 20 : 0.007057667
average loss of epoch 21 : 0.0070526125
average loss of epoch 22 : 0.0070475894
average loss of epoch 23 : 0.007042694
average loss of epoch 24 : 0.0070379255
average loss of epoch 

average loss of epoch 206 : 0.0062523526
average loss of epoch 207 : 0.006248506
average loss of epoch 208 : 0.006244564
average loss of epoch 209 : 0.006240781
average loss of epoch 210 : 0.006237189
average loss of epoch 211 : 0.0062329927
average loss of epoch 212 : 0.0062292735
average loss of epoch 213 : 0.006225459
average loss of epoch 214 : 0.0062212306
average loss of epoch 215 : 0.0062177023
average loss of epoch 216 : 0.006213951
average loss of epoch 217 : 0.006210041
average loss of epoch 218 : 0.0062060994
average loss of epoch 219 : 0.0062023164
average loss of epoch 220 : 0.006198756
average loss of epoch 221 : 0.006194846
average loss of epoch 222 : 0.0061908723
average loss of epoch 223 : 0.006187439
average loss of epoch 224 : 0.0061834017
average loss of epoch 225 : 0.006179587
average loss of epoch 226 : 0.006176249
average loss of epoch 227 : 0.00617218
average loss of epoch 228 : 0.006168143
average loss of epoch 229 : 0.0061647096
average loss of epoch 230 : 0.0

average loss of epoch 418 : 0.005520153
average loss of epoch 419 : 0.005516847
average loss of epoch 420 : 0.0055143037
average loss of epoch 421 : 0.0055110296
average loss of epoch 422 : 0.0055080415
average loss of epoch 423 : 0.0055049895
average loss of epoch 424 : 0.0055015883
average loss of epoch 425 : 0.0054987273
average loss of epoch 426 : 0.005495294
average loss of epoch 427 : 0.005492274
average loss of epoch 428 : 0.0054894765
average loss of epoch 429 : 0.0054866155
average loss of epoch 430 : 0.00548385
average loss of epoch 431 : 0.0054802895
average loss of epoch 432 : 0.005477174
average loss of epoch 433 : 0.005474599
average loss of epoch 434 : 0.0054712933
average loss of epoch 435 : 0.0054679234
average loss of epoch 436 : 0.0054649035
average loss of epoch 437 : 0.005462424
average loss of epoch 438 : 0.005459277
average loss of epoch 439 : 0.0054559708
average loss of epoch 440 : 0.0054531097
average loss of epoch 441 : 0.00545009
average loss of epoch 442 : 

In [30]:
word_embedding = None
for submodule in model.children():
    if type(submodule)== nn.Linear:
        print(submodule.parameters())
        word_embedding = submodule.weight

<generator object Module.parameters at 0x000001F303A6A728>


In [31]:
word_embedding = word_embedding.data

In [63]:
word_embedding;

In [32]:
def similarity(word1, word2):
    return cosine_similarity(word_embedding[word_to_ix[word1]], word_embedding[word_to_ix[word2]], dim=0).numpy()

In [33]:
def similarity_topn(word, n):
    words = list(vocab)
    words.sort(key=lambda w: similarity(w, word), reverse=True)
    return words[0:n]

In [41]:
similarity_topn("idea", 10)

['idea',
 'computational',
 'evolution',
 'pattern',
 'is',
 'by',
 'As',
 'the',
 'process.',
 'computers.']

# 2. Skip-gram

In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.nn import Sequential
from torch.nn.functional import cosine_similarity
torch.manual_seed(1)

<torch._C.Generator at 0x1f302658230>

## Form Skip-gram dataset

In [46]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# vocab set and vocab size
vocab = set(raw_text)
vocab_size = len(vocab)

# construct dictionary to lookup 
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {ix: word for word, ix in word_to_ix.items()}

raw_skip_gram_data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    center = raw_text[i]
    for word in context: 
        raw_skip_gram_data.append((center,word))
print(raw_skip_gram_data[:5])

[('about', 'We'), ('about', 'are'), ('about', 'to'), ('about', 'study'), ('to', 'are')]


In [47]:
class skipgram_dataset(Dataset):
    def __init__(self, raw_dataset, transform=None):
        # raw_dataset is a list of (context, target) pair
        self.dataset = raw_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        center, context = self.dataset[idx]
        
        return {"data":torch.tensor(word_to_ix[center]), "target":torch.tensor(word_to_ix[context])}

In [49]:
dataset_skipgram = skipgram_dataset(raw_skip_gram_data)
dataloader_skipgram = DataLoader(dataset_skipgram, batch_size=4)

In [52]:
model = Sequential(nn.Embedding(vocab_size, 3),
                  nn.Linear(3,vocab_size))

In [53]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.01)

In [26]:
model = train(model,dataloader=dataloader_skipgram, early_stopping=True,)

epoch 0 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 1 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 2 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 3 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 4 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 5 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 6 loss tensor(1.8673, grad_fn=<NllLossBackward>)
best model saved to model.pytorch
min loss: tensor(1.8673, grad_fn=<NllLossBackward>)
epoch 7 loss 

epoch 119 loss tensor(1.8679, grad_fn=<NllLossBackward>)
epoch 120 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 121 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 122 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 123 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 124 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 125 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 126 loss tensor(1.8680, grad_fn=<NllLossBackward>)
epoch 127 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 128 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 129 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 130 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 131 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 132 loss tensor(1.8681, grad_fn=<NllLossBackward>)
epoch 133 loss tensor(1.8682, grad_fn=<NllLossBackward>)
epoch 134 loss tensor(1.8682, grad_fn=<NllLossBackward>)
epoch 135 loss tensor(1.8682, grad_fn=<NllLossBackward>)
epoch 136 loss tensor(1.8682, g

epoch 266 loss tensor(1.8714, grad_fn=<NllLossBackward>)
epoch 267 loss tensor(1.8714, grad_fn=<NllLossBackward>)
epoch 268 loss tensor(1.8714, grad_fn=<NllLossBackward>)
epoch 269 loss tensor(1.8715, grad_fn=<NllLossBackward>)
epoch 270 loss tensor(1.8715, grad_fn=<NllLossBackward>)
epoch 271 loss tensor(1.8715, grad_fn=<NllLossBackward>)
epoch 272 loss tensor(1.8716, grad_fn=<NllLossBackward>)
epoch 273 loss tensor(1.8716, grad_fn=<NllLossBackward>)
epoch 274 loss tensor(1.8716, grad_fn=<NllLossBackward>)
epoch 275 loss tensor(1.8717, grad_fn=<NllLossBackward>)
epoch 276 loss tensor(1.8717, grad_fn=<NllLossBackward>)
epoch 277 loss tensor(1.8717, grad_fn=<NllLossBackward>)
epoch 278 loss tensor(1.8718, grad_fn=<NllLossBackward>)
epoch 279 loss tensor(1.8718, grad_fn=<NllLossBackward>)
epoch 280 loss tensor(1.8718, grad_fn=<NllLossBackward>)
epoch 281 loss tensor(1.8719, grad_fn=<NllLossBackward>)
epoch 282 loss tensor(1.8719, grad_fn=<NllLossBackward>)
epoch 283 loss tensor(1.8719, g

epoch 410 loss tensor(1.8770, grad_fn=<NllLossBackward>)
epoch 411 loss tensor(1.8771, grad_fn=<NllLossBackward>)
epoch 412 loss tensor(1.8771, grad_fn=<NllLossBackward>)
epoch 413 loss tensor(1.8772, grad_fn=<NllLossBackward>)
epoch 414 loss tensor(1.8772, grad_fn=<NllLossBackward>)
epoch 415 loss tensor(1.8772, grad_fn=<NllLossBackward>)
epoch 416 loss tensor(1.8773, grad_fn=<NllLossBackward>)
epoch 417 loss tensor(1.8773, grad_fn=<NllLossBackward>)
epoch 418 loss tensor(1.8774, grad_fn=<NllLossBackward>)
epoch 419 loss tensor(1.8774, grad_fn=<NllLossBackward>)
epoch 420 loss tensor(1.8775, grad_fn=<NllLossBackward>)
epoch 421 loss tensor(1.8775, grad_fn=<NllLossBackward>)
epoch 422 loss tensor(1.8776, grad_fn=<NllLossBackward>)
epoch 423 loss tensor(1.8776, grad_fn=<NllLossBackward>)
epoch 424 loss tensor(1.8777, grad_fn=<NllLossBackward>)
epoch 425 loss tensor(1.8777, grad_fn=<NllLossBackward>)
epoch 426 loss tensor(1.8778, grad_fn=<NllLossBackward>)
epoch 427 loss tensor(1.8778, g

epoch 558 loss tensor(1.8844, grad_fn=<NllLossBackward>)
epoch 559 loss tensor(1.8844, grad_fn=<NllLossBackward>)
epoch 560 loss tensor(1.8845, grad_fn=<NllLossBackward>)
epoch 561 loss tensor(1.8845, grad_fn=<NllLossBackward>)
epoch 562 loss tensor(1.8846, grad_fn=<NllLossBackward>)
epoch 563 loss tensor(1.8846, grad_fn=<NllLossBackward>)
epoch 564 loss tensor(1.8847, grad_fn=<NllLossBackward>)
epoch 565 loss tensor(1.8847, grad_fn=<NllLossBackward>)
epoch 566 loss tensor(1.8848, grad_fn=<NllLossBackward>)
epoch 567 loss tensor(1.8848, grad_fn=<NllLossBackward>)
epoch 568 loss tensor(1.8849, grad_fn=<NllLossBackward>)
epoch 569 loss tensor(1.8850, grad_fn=<NllLossBackward>)
epoch 570 loss tensor(1.8850, grad_fn=<NllLossBackward>)
epoch 571 loss tensor(1.8851, grad_fn=<NllLossBackward>)
epoch 572 loss tensor(1.8851, grad_fn=<NllLossBackward>)
epoch 573 loss tensor(1.8852, grad_fn=<NllLossBackward>)
epoch 574 loss tensor(1.8852, grad_fn=<NllLossBackward>)
epoch 575 loss tensor(1.8853, g

epoch 703 loss tensor(1.8923, grad_fn=<NllLossBackward>)
epoch 704 loss tensor(1.8924, grad_fn=<NllLossBackward>)
epoch 705 loss tensor(1.8924, grad_fn=<NllLossBackward>)
epoch 706 loss tensor(1.8925, grad_fn=<NllLossBackward>)
epoch 707 loss tensor(1.8925, grad_fn=<NllLossBackward>)
epoch 708 loss tensor(1.8926, grad_fn=<NllLossBackward>)
epoch 709 loss tensor(1.8927, grad_fn=<NllLossBackward>)
epoch 710 loss tensor(1.8927, grad_fn=<NllLossBackward>)
epoch 711 loss tensor(1.8928, grad_fn=<NllLossBackward>)
epoch 712 loss tensor(1.8928, grad_fn=<NllLossBackward>)
epoch 713 loss tensor(1.8929, grad_fn=<NllLossBackward>)
epoch 714 loss tensor(1.8929, grad_fn=<NllLossBackward>)
epoch 715 loss tensor(1.8930, grad_fn=<NllLossBackward>)
epoch 716 loss tensor(1.8930, grad_fn=<NllLossBackward>)
epoch 717 loss tensor(1.8931, grad_fn=<NllLossBackward>)
epoch 718 loss tensor(1.8932, grad_fn=<NllLossBackward>)
epoch 719 loss tensor(1.8932, grad_fn=<NllLossBackward>)
epoch 720 loss tensor(1.8933, g

In [28]:
model = torch.load("model.pytorch")

In [39]:
word_embedding = model[0].weight.data

In [40]:
def similarity(word1, word2):
    return cosine_similarity(word_embedding[word_to_ix[word1]], word_embedding[word_to_ix[word2]], dim=0).numpy()

In [41]:
def similarity_topn(word, n):
    words = list(vocab)
    words.sort(key=lambda w: similarity(w, word), reverse=True)
    return words[0:n]

In [44]:
similarity_topn("We", 10)

['We',
 'is',
 'directed',
 'by',
 'process',
 'a',
 'are',
 'process.',
 'inhabit',
 'pattern']

## Skipgram with hierarchical softmax