In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.nn.functional import cosine_similarity
torch.manual_seed(1)

<torch._C.Generator at 0x28f85ade070>

# Generate raw corpus for various models

In [2]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# vocab set and vocab size
vocab = set(raw_text)
vocab_size = len(vocab)

# construct dictionary to lookup 
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
# construct training data: (context, target) pair
raw_data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    raw_data.append((context, target))
print(raw_data[:5])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]


In [3]:
context, target = raw_data[0]
context
[word_to_ix[word] for word in context]

[48, 13, 30, 11]

# 1.  CBOW

## CBOW data loader

In [3]:
class cbow_dataset(Dataset):
    def __init__(self, raw_dataset, transform=None):
        # raw_dataset is a list of (context, target) pair
        self.dataset = raw_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        context, target = self.dataset[idx]
        return {"context":torch.tensor([word_to_ix[word] for word in context]), "target":torch.tensor(word_to_ix[target])}

In [31]:
dataset = cbow_dataset(raw_data)
dataloader = DataLoader(dataset,batch_size=4)

## CBOW model

In [42]:
class CBOW(nn.Module):
    def __init__(self):
        super(CBOW, self).__init__()
        # parameter of shape (vocab_size, 3)
        self.embedding = nn.Embedding(vocab_size, 3)
        # matrix of shape (3, vocab_size)
        self.linear = nn.Linear(3, vocab_size, bias=False)
    def forward(self, x):
        # for batch this would be (B, 3)
        context_embed = self.embedding(x).sum(1)
        x = self.linear(context_embed)
        return x

In [47]:
model = CBOW()

In [48]:
criterion = nn.CrossEntropyLoss()

In [49]:
optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)

In [60]:
# intialize parameters
for parameter in model.parameters():
    nn.init.normal_(parameter)

# train
for epoch in range(400):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        context = data["context"]
        target = data["target"]
        optimizer.zero_grad()
        outputs = model(context)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if i % 10 == 0:
            print('[%d, %5d] loss: %.8f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
print('Finished Training')

[1,     1] loss: 0.00402282
[1,    11] loss: 0.04147724
[2,     1] loss: 0.00342317
[2,    11] loss: 0.03446168
[3,     1] loss: 0.00285365
[3,    11] loss: 0.02923987
[4,     1] loss: 0.00249494
[4,    11] loss: 0.02584207
[5,     1] loss: 0.00226498
[5,    11] loss: 0.02343580
[6,     1] loss: 0.00210213
[6,    11] loss: 0.02161346
[7,     1] loss: 0.00197792
[7,    11] loss: 0.02017254
[8,     1] loss: 0.00187943
[8,    11] loss: 0.01899862
[9,     1] loss: 0.00179976
[9,    11] loss: 0.01802174
[10,     1] loss: 0.00173438
[10,    11] loss: 0.01719609
[11,     1] loss: 0.00167990
[11,    11] loss: 0.01648965
[12,     1] loss: 0.00163375
[12,    11] loss: 0.01587869
[13,     1] loss: 0.00159396
[13,    11] loss: 0.01534478
[14,     1] loss: 0.00155908
[14,    11] loss: 0.01487314
[15,     1] loss: 0.00152801
[15,    11] loss: 0.01445185
[16,     1] loss: 0.00149994
[16,    11] loss: 0.01407127
[17,     1] loss: 0.00147425
[17,    11] loss: 0.01372370
[18,     1] loss: 0.00145047
[18

[156,     1] loss: 0.00047775
[156,    11] loss: 0.00355843
[157,     1] loss: 0.00047519
[157,    11] loss: 0.00353744
[158,     1] loss: 0.00047262
[158,    11] loss: 0.00351661
[159,     1] loss: 0.00047005
[159,    11] loss: 0.00349594
[160,     1] loss: 0.00046747
[160,    11] loss: 0.00347543
[161,     1] loss: 0.00046488
[161,    11] loss: 0.00345508
[162,     1] loss: 0.00046228
[162,    11] loss: 0.00343488
[163,     1] loss: 0.00045967
[163,    11] loss: 0.00341484
[164,     1] loss: 0.00045705
[164,    11] loss: 0.00339496
[165,     1] loss: 0.00045442
[165,    11] loss: 0.00337523
[166,     1] loss: 0.00045178
[166,    11] loss: 0.00335565
[167,     1] loss: 0.00044913
[167,    11] loss: 0.00333623
[168,     1] loss: 0.00044648
[168,    11] loss: 0.00331696
[169,     1] loss: 0.00044381
[169,    11] loss: 0.00329784
[170,     1] loss: 0.00044114
[170,    11] loss: 0.00327888
[171,     1] loss: 0.00043846
[171,    11] loss: 0.00326007
[172,     1] loss: 0.00043577
[172,    1

[296,    11] loss: 0.00175805
[297,     1] loss: 0.00019359
[297,    11] loss: 0.00175035
[298,     1] loss: 0.00019247
[298,    11] loss: 0.00174269
[299,     1] loss: 0.00019134
[299,    11] loss: 0.00173507
[300,     1] loss: 0.00019023
[300,    11] loss: 0.00172749
[301,     1] loss: 0.00018911
[301,    11] loss: 0.00171995
[302,     1] loss: 0.00018801
[302,    11] loss: 0.00171244
[303,     1] loss: 0.00018690
[303,    11] loss: 0.00170497
[304,     1] loss: 0.00018580
[304,    11] loss: 0.00169754
[305,     1] loss: 0.00018471
[305,    11] loss: 0.00169015
[306,     1] loss: 0.00018361
[306,    11] loss: 0.00168279
[307,     1] loss: 0.00018252
[307,    11] loss: 0.00167548
[308,     1] loss: 0.00018144
[308,    11] loss: 0.00166819
[309,     1] loss: 0.00018036
[309,    11] loss: 0.00166094
[310,     1] loss: 0.00017928
[310,    11] loss: 0.00165373
[311,     1] loss: 0.00017821
[311,    11] loss: 0.00164655
[312,     1] loss: 0.00017714
[312,    11] loss: 0.00163940
[313,     

In [61]:
word_embedding = None
for submodule in model.children():
    if type(submodule)== nn.Linear:
        print(submodule.parameters())
        word_embedding = submodule.weight

<generator object Module.parameters at 0x0000028F86421EB8>


In [62]:
word_embedding = word_embedding.data

In [63]:
word_embedding;

In [64]:
def similarity(word1, word2):
    return cosine_similarity(word_embedding[word_to_ix[word1]], word_embedding[word_to_ix[word2]], dim=0).numpy()

In [65]:
def similarity_topn(word, n):
    words = list(vocab)
    words.sort(key=lambda w: similarity(w, word), reverse=True)
    return words[0:n]

In [66]:
similarity_topn("We", 10)

['We',
 'evolve,',
 'program.',
 'they',
 'inhabit',
 'spirits',
 'abstract',
 'programs',
 'rules',
 'directed']

# 2. Skip-gram

In [75]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.nn import Sequential
from torch.nn.functional import cosine_similarity
torch.manual_seed(1)

<torch._C.Generator at 0x28f85ade070>

## Form Skip-gram dataset

In [69]:
raw_skip_gram_data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    center = raw_text[i]
    for word in context: 
        raw_skip_gram_data.append((center,word))
print(raw_skip_gram_data[:5])

[('about', 'We'), ('about', 'are'), ('about', 'to'), ('about', 'study'), ('to', 'are')]


In [70]:
class skipgram_dataset(Dataset):
    def __init__(self, raw_dataset, transform=None):
        # raw_dataset is a list of (context, target) pair
        self.dataset = raw_dataset
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        center, context = self.dataset[idx]
        
        return {"center":torch.tensor([word_to_ix[center]]), "context":torch.tensor(word_to_ix[context])}

In [79]:
dataset_skipgram = skipgram_dataset(raw_skip_gram_data)
dataloader_skipgram = DataLoader(dataset_skipgram, batch_size=4)

In [76]:
model = Sequential(nn.Embedding(vocab_size, 3),
                  nn.Linear(3,vocab_size))

In [78]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [89]:
center = torch.tensor([39,39,39,41])
layer = nn.Embedding(vocab_size, 3)
layer(center)

tensor([[-1.0304, -1.2977,  0.9390],
        [-1.0304, -1.2977,  0.9390],
        [-1.0304, -1.2977,  0.9390],
        [ 2.4358,  0.7929,  0.9873]], grad_fn=<EmbeddingBackward>)

In [83]:
# initialize parameters
for parameter in model.parameters():
    nn.init.normal_(parameter)

for epoch in range(300):
    for idx, data in enumerate(dataloader_skipgram):
        # center is of shape (B, 1)
        center = data["center"]
        print(center)
        # target is of shape ()
        target= data["context"]
        print(target)
        output = model.forward(center)
        print(output)
        print(target)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if idx % 10 == 0:
            print("epoch:", epoch, "iter:", idx, "loss:", loss)

tensor([[39],
        [39],
        [39],
        [39]])
tensor([17, 18,  3, 12])
tensor([[[ 5.1001, -1.3883, -0.4741,  2.1008,  0.0644,  2.5013,  2.7036,
           0.6809, -1.6003, -2.5444, -0.9272, -3.2729,  2.3253, -0.2955,
          -1.8054,  1.2215,  1.5943, -2.8442, -0.8601,  1.9945, -3.5406,
          -0.6422, -0.5790,  1.4110, -0.3164, -0.2242,  2.8887, -1.6497,
           1.3752, -0.9250, -2.2290,  2.1204,  3.1357,  5.3337,  0.9207,
          -0.4299,  1.2942,  0.2996,  3.6706, -2.3519, -2.7258,  0.6672,
           1.2590,  2.1013,  0.3003,  0.9130, -0.2308, -0.1417,  2.2458]],

        [[ 5.1001, -1.3883, -0.4741,  2.1008,  0.0644,  2.5013,  2.7036,
           0.6809, -1.6003, -2.5444, -0.9272, -3.2729,  2.3253, -0.2955,
          -1.8054,  1.2215,  1.5943, -2.8442, -0.8601,  1.9945, -3.5406,
          -0.6422, -0.5790,  1.4110, -0.3164, -0.2242,  2.8887, -1.6497,
           1.3752, -0.9250, -2.2290,  2.1204,  3.1357,  5.3337,  0.9207,
          -0.4299,  1.2942,  0.2996,  3

ValueError: Expected target size (4, 49), got torch.Size([4])