<a href="https://colab.research.google.com/github/hissain/mlworks/blob/main/codes/RNN_CBOW_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install torchtext torchdata portalocker>=2.0.0

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

In [None]:
# Constant definition
RANDOM_SEED = 1  # seed handed to torch.manual_seed
CONTEXT_SIZE = 3  # number of consecutive tokens used as model input; the token after them is the target
EMBEDDING_DIM = 10  # output width of the model's first (hidden) linear layer

In [None]:
# Seed PyTorch's random number generator with the notebook-level constant
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fe5041e2790>

In [None]:
# word level tokenization
class Tokenizer:
    """Whitespace-based word tokenizer whose vocabulary grows while encoding.

    ``mapping`` goes from word to integer id and ``reverse_mapping`` from id
    back to word. Ids are handed out in order of first appearance, from 0.
    """

    def __init__(self):
        self.mapping = {}
        self.reverse_mapping = {}

    def encode(self, text):
        """Split ``text`` on whitespace and return the list of word ids.

        Words not seen before are registered in both lookup tables and
        receive the next free id.
        """
        ids = []
        for piece in text.split():
            idx = self.mapping.get(piece)
            if idx is None:
                # First occurrence: the next id equals the current vocabulary size.
                idx = len(self.mapping)
                self.mapping[piece] = idx
                self.reverse_mapping[idx] = piece
            ids.append(idx)
        return ids

    def decode(self, tokens):
        """Turn a sequence of ids back into a space-joined string.

        Raises ``KeyError`` for an id that was never assigned.
        """
        return " ".join(map(self.reverse_mapping.__getitem__, tokens))


In [None]:
# Single tokenizer instance shared by the notebook; its vocabulary grows as text is encoded
tokenizer = Tokenizer()

In [None]:
# Loading IMDB dataset
# Source: http://ai.stanford.edu/~amaas/data/sentiment/
# Only the training split is requested; the dataset-generation cell iterates
# over it as (label, text) pairs.
from torchtext.datasets import IMDB
train_iter = IMDB(split='train')

In [None]:
# Generating dataset
# Slide a window over every review: CONTEXT_SIZE consecutive token ids form the
# input, and the token id immediately after the window is the prediction target.
dataset = []
for _, review in train_iter:
    token_ids = tokenizer.encode(review)
    # A review with CONTEXT_SIZE tokens or fewer yields an empty range and
    # therefore contributes no samples.
    window_starts = range(len(token_ids) - CONTEXT_SIZE)
    dataset.extend(
        (token_ids[start:start + CONTEXT_SIZE], token_ids[start + CONTEXT_SIZE])
        for start in window_starts
    )

In [None]:
# Number of distinct words the tokenizer has registered so far; used as the
# one-hot width and as the size of the model's output layer.
vocab_size = len(tokenizer.mapping)
print(vocab_size)

280617


In [None]:
# One-hot encode token ids into a single flat float vector
def get_one_hot(tokens, num_classes=None, device=None):
    """Return the concatenated one-hot encoding of ``tokens`` as a flat tensor.

    Args:
        tokens: A single token id or a sequence of token ids.
        num_classes: Width of each one-hot vector. Defaults to the
            notebook-level ``vocab_size``.
        device: Device on which the result is created. Defaults to CUDA when
            it is available and to the CPU otherwise.

    Returns:
        A 1-D ``torch.float`` tensor holding ``num_classes`` entries per token.
    """
    if num_classes is None:
        num_classes = vocab_size
    if device is None:
        # Fall back to the CPU instead of failing on machines without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    # Create the indices on the target device so the one-hot tensor is
    # allocated there directly rather than built on the host and copied over.
    indices = torch.as_tensor(tokens, dtype=torch.long, device=device)
    return F.one_hot(indices, num_classes=num_classes).flatten().to(torch.float)

In [None]:
# Define the CBOW model
class CBOW(nn.Module):
    """Two-layer linear model mapping a fixed-size context to vocabulary scores.

    The context's token ids are one-hot encoded and concatenated into a vector
    of ``vocab_size * context_size`` entries, projected to ``embedding_dim``
    by ``hidden``, then projected to ``vocab_size`` scores by ``output``.

    Args:
        vocab_size: Number of distinct tokens; also the width of each one-hot.
        embedding_dim: Output width of the ``hidden`` layer.
        context_size: Number of token ids expected by ``forward``.
        device: Device for the parameters. Defaults to CUDA when available
            and to the CPU otherwise.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, device=None):
        super().__init__()
        if device is None:
            # Fall back to the CPU instead of failing on machines without a GPU.
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # Kept so forward() sizes its one-hot from the model itself rather than
        # from a notebook-level global that could disagree with the layers.
        self.vocab_size = vocab_size
        self.hidden = nn.Linear(vocab_size * context_size, embedding_dim, device=device)
        self.output = nn.Linear(embedding_dim, vocab_size, device=device)

    def forward(self, input):
        """Return unnormalised scores over the vocabulary for one context.

        Args:
            input: Sequence of ``context_size`` token ids.

        Returns:
            A 1-D tensor with ``vocab_size`` entries.
        """
        # Build the encoding on whichever device the parameters live on.
        indices = torch.as_tensor(input, dtype=torch.long, device=self.hidden.weight.device)
        one_hot = F.one_hot(indices, num_classes=self.vocab_size).flatten().to(torch.float)
        return self.output(self.hidden(one_hot))

In [None]:
# Instantiate the network with the notebook-level sizes
model = CBOW(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    context_size=CONTEXT_SIZE,
)

# Cross-entropy objective, optimised with Adam over all model parameters
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Total number of (context, target) training pairs
len(dataset)

5769680

In [None]:
# Training
NUM_EPOCHS = 100
NUM_TRAIN_SAMPLES = 100  # only this many leading samples of the dataset are used

for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    for context, target in dataset[:NUM_TRAIN_SAMPLES]:
        predictions = model(context)
        # CrossEntropyLoss accepts the class index directly, so there is no
        # need to materialise a dense vocabulary-sized one-hot target.
        target_index = torch.tensor(target, device=predictions.device)
        loss = loss_function(predictions, target_index)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Accumulate a plain Python float: adding the loss tensor itself would
        # keep every step's autograd graph alive until the end of the epoch.
        total_loss += loss.item()
    print(total_loss)

638.3038330078125
579.9682006835938
557.775146484375
542.259765625
528.4616088867188
515.0277709960938
501.28912353515625
486.8854675292969
471.597900390625
455.286376953125
437.9103698730469
419.5715637207031
400.56365966796875
381.3873291015625
362.6524963378906
344.8702087402344
328.2801208496094
312.8523254394531
298.4093017578125
284.74456787109375
271.68359375
259.10125732421875
246.91754150390625
235.08901977539062
223.59815979003906
212.444580078125
201.63699340820312
191.1876220703125
181.10838317871094
171.40745544433594
162.08926391601562
153.1543426513672
144.5998992919922
136.4208526611328
128.6103515625
121.16064453125
114.06331634521484
107.30990600585938
100.89163208007812
94.79962158203125
89.02440643310547
83.55644226074219
78.3853530883789
73.50056457519531
68.89105987548828
64.54551696777344
60.452491760253906
56.600494384765625
52.97810363769531
49.5739860534668
46.37704849243164
43.37643814086914
40.56161117553711
37.922279357910156
35.44866943359375
33.1312942504

In [None]:
# Testing
# Greedy generation: predict the next word from the current context, append it,
# drop the oldest word, and repeat.
NUM_GENERATED_WORDS = 5

with torch.no_grad():
    input_text = "movie was good"
    for _ in range(NUM_GENERATED_WORDS):
        words = input_text.split()
        # Look words up directly instead of calling tokenizer.encode():
        # encode() would add unseen words to the vocabulary and hand back an
        # id the trained model has no one-hot slot for.
        unknown = [word for word in words if word not in tokenizer.mapping]
        if unknown:
            raise KeyError(f"words not in the training vocabulary: {unknown}")
        prediction = model([tokenizer.mapping[word] for word in words])
        next_word = tokenizer.reverse_mapping[torch.argmax(prediction).item()]
        print(input_text, next_word)
        # Slide the context window forward by one word.
        input_text = " ".join(words[1:] + [next_word])

movie was good released
was good released in
good released in 1967.
released in 1967. I
in 1967. I also


In [None]:
# Embedding of a word
# Reads the row of the output layer's weight matrix indexed by the word's
# token id; raises KeyError if the word was never seen by the tokenizer.
token = tokenizer.mapping["movie"]
print(model.output.weight[token])

tensor([ 0.3852, -2.0231, -0.8462, -0.8344,  1.2325,  1.1313, -0.8794, -0.9071,
         0.1915,  0.6741], device='cuda:0', grad_fn=<SelectBackward0>)
