<a href="https://colab.research.google.com/github/hissain/mlworks/blob/main/codes/RNN_CBOW_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

In [17]:
# Constant definition
RANDOM_SEED = 1  # passed to torch.manual_seed so the run is repeatable
CONTEXT_SIZE = 3  # number of consecutive tokens used as context for predicting the next token
EMBEDDING_DIM = 10  # output width of the model's hidden layer

In [18]:
# Seed PyTorch's global RNG so that parameter initialisation is repeatable.
# The trailing semicolon stops the notebook from echoing the returned
# Generator object, whose repr contains a memory address that differs on
# every run.
torch.manual_seed(RANDOM_SEED);

<torch._C.Generator at 0x118e5d990>

In [19]:
# word level tokenization
class Tokenizer:
    """Whitespace word-level tokenizer that builds its vocabulary on the fly.

    Two mirrored dictionaries are maintained: ``mapping`` (word -> id) and
    ``reverse_mapping`` (id -> word). Ids are assigned in order of first
    appearance, starting at 0.
    """

    def __init__(self):
        self.mapping = {}
        self.reverse_mapping = {}

    def encode(self, text, update_vocab=True):
        """Convert ``text`` into a list of integer token ids.

        Args:
            text: String to tokenize; it is split on whitespace.
            update_vocab: When True (the default), words not seen before are
                added to the vocabulary and given the next free id. Pass
                False to keep the vocabulary frozen, e.g. once a model has
                been sized from ``len(mapping)``.

        Returns:
            List of token ids, one per whitespace-separated word.

        Raises:
            KeyError: If ``update_vocab`` is False and ``text`` contains a
                word that is not in the vocabulary.
        """
        tokens = []
        for word in text.split():
            token = self.mapping.get(word)
            if token is None:
                if not update_vocab:
                    raise KeyError(f"unknown word: {word!r}")
                # Next free id is the current vocabulary size.
                token = len(self.mapping)
                self.mapping[word] = token
                self.reverse_mapping[token] = word
            tokens.append(token)
        return tokens

    def decode(self, tokens):
        """Convert an iterable of token ids back into a space-joined string.

        Raises:
            KeyError: If any id was never assigned by ``encode``.
        """
        return " ".join(self.reverse_mapping[token] for token in tokens)


In [20]:
# Single tokenizer instance shared by the notebook; its vocabulary grows
# whenever encode() meets a word it has not seen before
tokenizer = Tokenizer()

In [21]:
# Toy training corpus. Punctuation is already separated by spaces, so the
# tokenizer's whitespace split yields "." as its own token.
text = "We are living in an AI era . One day AI will take all the Human jobs ."

In [22]:
# Build (context, target) pairs by sliding a CONTEXT_SIZE-wide window over
# the token ids: each window is the context, the token right after it is
# the target.
tokens = tokenizer.encode(text)
dataset = [
    (tokens[start:start + CONTEXT_SIZE], tokens[start + CONTEXT_SIZE])
    for start in range(len(tokens) - CONTEXT_SIZE)
]

In [23]:
# Number of distinct words the tokenizer has seen so far; used as the
# one-hot width in get_one_hot and as the model's vocabulary size
vocab_size = len(tokenizer.mapping)
print(vocab_size)

16


In [24]:
# One-hot encoding of token ids (used for both model inputs and loss targets)
def get_one_hot(tokens, num_classes=None):
    """Return the flattened float one-hot encoding of ``tokens``.

    Args:
        tokens: A single token id or a (nested) list of token ids; anything
            ``torch.tensor`` turns into an integer tensor.
        num_classes: Width of each one-hot vector. Defaults to the
            notebook-level ``vocab_size`` when omitted.

    Returns:
        A 1-D ``torch.float`` tensor of length
        ``number_of_tokens * num_classes``; the one-hot vectors of the
        individual tokens are concatenated in order.
    """
    if num_classes is None:
        # Resolved at call time so the default tracks the current vocabulary.
        num_classes = vocab_size
    return F.one_hot(
        torch.tensor(tokens),
        num_classes=num_classes
    ).flatten().float()

In [25]:
# Define the CBOW model
class CBOW(nn.Module):
    """Two stacked linear layers mapping a one-hot context to vocabulary logits.

    ``hidden`` projects the concatenated one-hot context
    (``vocab_size * context_size`` features) down to ``embedding_dim``;
    ``output`` projects that back up to ``vocab_size`` scores. There is no
    non-linearity between the two layers.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        flattened_width = vocab_size * context_size
        # Layers are created in this order so seeded initialisation is stable.
        self.hidden = nn.Linear(flattened_width, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input):
        """Return unnormalised scores over the vocabulary for one context.

        ``input`` holds token ids, not a tensor of features: it is one-hot
        encoded here through the notebook-level ``get_one_hot`` helper,
        whose width comes from the global ``vocab_size``.
        """
        one_hot_context = get_one_hot(input)
        embedded = self.hidden(one_hot_context)
        return self.output(embedded)

In [26]:
# Instantiate the network from the notebook-level hyperparameters
model = CBOW(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    context_size=CONTEXT_SIZE,
)

# Cross-entropy on the vocabulary logits, optimised with Adam
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [27]:
# Number of (context, target) training pairs
len(dataset)

15

In [28]:
# Training: one optimizer step per (context, target) pair, NUM_EPOCHS passes
# over the dataset; the summed loss of each epoch is printed.
NUM_EPOCHS = 100

for epoch in range(NUM_EPOCHS):
    # Accumulate a plain Python float. Summing the loss tensors themselves
    # would keep each step's autograd history attached to the running total.
    total_loss = 0.0
    # `context` rather than `input`, so the builtin input() is not shadowed
    # at notebook scope after this cell runs.
    for context, label in dataset:
        optimizer.zero_grad()
        predictions = model(context)
        # The target is passed as a one-hot (class-probability) vector.
        loss = loss_function(predictions, get_one_hot(label))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)


41.66307067871094
41.2083854675293
40.83362579345703
40.462249755859375
40.088836669921875
39.71091842651367
39.32661056518555
38.934146881103516
38.53190612792969
38.1184196472168
37.69240951538086


37.252830505371094
36.798851013183594
36.329830169677734
35.84531784057617
35.34503936767578
34.82884216308594
34.2967529296875
33.748931884765625
33.18568420410156
32.60744094848633
32.0147590637207
31.408281326293945
30.788738250732422
30.15692710876465
29.513708114624023
28.859996795654297
28.19676971435547
27.525053024291992
26.845914840698242
26.16048812866211
25.4699649810791
24.775577545166016
24.07860565185547
23.380393981933594
22.68229866027832
21.985736846923828
21.2921085357666
20.602827072143555
19.9193058013916
19.242897033691406
18.574926376342773
17.916641235351562
17.26921844482422
16.633743286132812
16.011194229125977
15.40245246887207
14.808279991149902
14.229337692260742
13.666165351867676
13.119205474853516
12.588802337646484
12.07519817352295
11.57855224609375
11.098942756652832
10.636371612548828
10.190774917602539
9.7620267868042
9.349943161010742
8.95429515838623
8.57480239868164
8.21114444732666
7.862966537475586
7.529881954193115
7.211475372314453
6.907311916

In [29]:
# Testing: ask the model for the word that follows a context taken from the
# training text
input_text = "One day AI"
correct_output = "will"

with torch.no_grad():
    prediction = model(tokenizer.encode(input_text))
    # Index of the highest-scoring vocabulary entry, computed once and reused
    predicted_token = prediction.argmax().item()
    print(predicted_token)
    print(tokenizer.reverse_mapping[predicted_token])

10
will


In [30]:
# Embedding of a word: the row of the output layer's weight matrix selected
# by the word's token id (the printed tensor still carries autograd
# metadata because it is read outside torch.no_grad())
token = tokenizer.mapping["will"]
print(model.output.weight[token])

tensor([-0.6185,  0.7577,  0.4743,  0.5531, -0.6016,  0.5008,  0.0481, -0.6179,
         0.3598,  0.2874], grad_fn=<SelectBackward0>)
