# Implement word2vec 

1. You are given the text corpus 'bible.txt' as a list of sentences. Obtain the unique words in the text. As a pre-processing step convert all the words into lower case. You can consider the vocabulary as a list of words

In [1]:
from nltk.corpus import gutenberg

In [2]:
sents = gutenberg.sents('bible-kjv.txt')

In [4]:
len(sents)
sents[1]

['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible']

In [4]:
sents[0]

['[', 'The', 'King', 'James', 'Bible', ']']

In [17]:
from collections import Counter

vocab = Counter()
for sent in sents:
    for word in sent:
#         v = Counter(word)
        vocab[word.lower()] = 1

In [18]:
vocab = list(vocab.keys())
vocab.index('the')

1

2. Write a function that takes a word in the vocabulary as input and encodes it into a one-hot vector

In [15]:
import numpy as np

In [16]:
def encodingWord(word, vocab):
    encoding = np.zeros(len(vocab))
    encoding[vocab.index(word)] = 1
    return encoding

In [18]:
encodingWord('james',vocab)[:5]

array([0., 0., 0., 1., 0.])

3. Write a function to generate a list of center word and context word pairs for each unique in the vocabulary with a given skip-gram window. You can represent a pair as a tuple.

In [22]:
def obtainPairs(sents, window=5):
    pairs = []
    for sent in sents:
        for i,w in enumerate(sent):
            for j in range(i-window,i+window):
                if j>=0 and j<len(sent) and j!=i:
                    pairs.append((w.lower(),sent[j].lower()))
    return pairs                

In [23]:
pairs = obtainPairs(sents)

In [29]:
pairs = list(set(pairs))

In [30]:
len(pairs)

1018981

4. Write a dataset class which genertes a batch of context word center word pair. You have to return the one hot encoding of the center word and the index of the context word in the vocabulary as label

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

In [12]:
class word2vecDataset(Dataset):
    def __init__(self, encoding, vocab, pairs):
        super(word2vecDataset,self).__init__()
        self.encoding = encoding
        self.vocab = vocab
        self.pairs = pairs
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, index):
        w_1, w_2 = self.pairs[index]
        w_1_e = self.encoding(w_1, self.vocab)
        label = self.vocab.index(w_2)
        return torch.tensor(w_1_e, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

In [94]:
train_dataset = word2vecDataset(encodingWord, vocab, pairs)

5. Write a model class which implements word2vec model. You will need two parameters W_1 and W_2 which represent the center word and the context word matrices. You can use nn.Parameters to initialize the two parameters. A sigmoid layer follows them. 

In [95]:
import torch.nn as nn

In [96]:
class Word2vec(nn.Module):
    def __init__(self, vocab_size, encoding_size):
        super(Word2vec, self).__init__()
        self.vocab_size = vocab_size
        self.encoding_size = encoding_size
        
        self.W_1 = nn.Parameter(torch.randn(self.vocab_size, self.encoding_size, requires_grad=True))
        self.W_2 = nn.Parameter(torch.randn(self.encoding_size, self.vocab_size, requires_grad=True))
        self.softmax = nn.Softmax(dim=0)
        
    def forward(self, inp):
        x = torch.matmul(inp, self.W_1)
        y = torch.matmul(x, self.W_2)
        y = self.softmax(y)
        return y

In [97]:
model = Word2vec(len(vocab), 200)

6. Write a function to train the model with batches of context-center word pairs. You can use Adam for optimization and cross-entropy as loss function. 

In [1]:
from torch.optim import Adam
from tqdm import tqdm

In [2]:
def train(model, train_dataset, batch_size, epochs=1, learning_rate=0.0001):
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()
    
    for _ in range(epochs):
        with tqdm(total=int(len(train_dataset)/batch_size)) as pbar:
            for inp,labels in train_dataloader:
                out = model(inp)
                loss = criterion(out, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                pbar.update(1)

7. Create a instance of the model and train it with batch size of 32 and train the model.

In [3]:
train(model, train_dataset, 32)

NameError: name 'model' is not defined

8. Write a function which when given a word generates the its embedding based on the trained model.

In [108]:
def getEmbedding(model, word, vocab):
    ind = vocab.index(word)
    params = model.state_dict()
    return params['W_1'][ind]

In [112]:
getEmbedding(model, 'the', vocab)

tensor([-0.0159, -0.5141,  1.7470,  0.6286,  0.3453, -0.4378, -0.9434, -3.3841,
        -0.5757, -0.6783, -1.4207, -0.2772, -1.7561,  1.3408, -0.2505, -0.6332,
         1.2268,  1.0868,  2.5314,  0.7935, -0.2361,  0.9152, -1.5618, -1.6053,
        -3.3672, -1.3475,  0.6621,  0.4263, -0.3119,  0.3514,  0.0674,  0.4105,
         1.5939, -0.0116, -0.3430, -0.7105,  0.0786, -1.7043,  0.6105,  3.1155,
        -0.4178, -2.0308,  1.2899,  1.0814, -0.5028,  0.9320, -0.7964,  1.7560,
         0.4185, -0.5805,  0.8947,  0.0873, -0.7984,  1.1038, -0.6423, -0.2161,
         1.5204, -0.3924, -0.9964, -0.6141,  1.0638,  0.4743, -0.9689, -0.3962,
        -1.0791,  0.8241, -0.2425, -0.5392,  1.1395, -0.0305, -1.3192,  0.2712,
        -1.4570,  0.3039, -1.9089,  0.1500,  0.7335,  0.7298, -0.6969,  0.5375,
        -0.5575, -0.8123, -1.4375, -1.2186, -0.2160,  0.0677,  0.5877,  1.6579,
         0.9000,  2.9497, -1.8123, -0.4993, -0.7505,  1.3070, -2.7847, -0.3067,
        -0.0383, -0.0071,  1.0467, -0.15