<a href="https://colab.research.google.com/github/hissain/mlworks/blob/main/codes/RNN_Skip_Gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

In [None]:
# Constant definition
RANDOM_SEED = 1  # seed handed to torch.manual_seed
CONTEXT_SIZE = 3  # number of preceding words paired with each input word
EMBEDDING_DIM = 10  # width of the model's hidden layer, i.e. length of each word vector

In [None]:
# Seed PyTorch's global random generator so that runs are repeatable
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7da1343959f0>

In [None]:
# word level tokenization
class Tokenizer:
    """Whitespace tokenizer that assigns integer ids to words on first sight.

    Attributes:
        mapping: word -> id; ids are handed out sequentially starting at 0.
        reverse_mapping: id -> word, the inverse of ``mapping``.
    """

    def __init__(self):
        self.mapping = {}
        self.reverse_mapping = {}

    def encode(self, text):
        """Split ``text`` on whitespace and return the id of every word.

        Words that have not been seen before are added to the vocabulary
        with the next free id, so encoding also grows the vocabulary.
        """
        return [self._lookup_or_add(word) for word in text.split()]

    def decode(self, tokens):
        """Turn a sequence of ids back into a space-separated string.

        Raises KeyError if an id was never assigned by ``encode``.
        """
        return " ".join(self.reverse_mapping[token] for token in tokens)

    def _lookup_or_add(self, word):
        """Return the id of ``word``, registering it first if it is new."""
        try:
            return self.mapping[word]
        except KeyError:
            new_id = len(self.mapping)
            self.mapping[word] = new_id
            self.reverse_mapping[new_id] = word
            return new_id


In [None]:
# Single shared tokenizer; its vocabulary grows as text is encoded
tokenizer = Tokenizer()

In [None]:
# Toy training corpus. Punctuation is already separated by spaces, so the
# whitespace-splitting tokenizer treats "." as a word of its own.
text = "We are living in an AI era . One day AI will take all the Human jobs ."

In [None]:
# Generating dataset
# Each sample is (input word id, ids of the CONTEXT_SIZE words right before it).
tokens = tokenizer.encode(text)
dataset = [
    (tokens[end], tokens[end - CONTEXT_SIZE:end])
    for end in range(CONTEXT_SIZE, len(tokens))
]

In [None]:
# Vocabulary size = number of distinct words the tokenizer has seen so far
vocab_size = len(tokenizer.mapping)
print(vocab_size)

16


In [None]:
# One-hot encoding of token ids
def get_one_hot(tokens):
    """Return the one-hot encoding of ``tokens`` as a flat float tensor.

    ``tokens`` may be a single id or a list of ids. Each id becomes a
    ``vocab_size``-long one-hot row (``vocab_size`` is the notebook-level
    variable), and the rows are concatenated in order into one 1-D tensor.
    """
    indices = torch.tensor(tokens)
    encoded = F.one_hot(indices, num_classes=vocab_size)
    return encoded.flatten().float()

In [None]:
# Define the SkipGram model
class SkipGram(nn.Module):
    """Two linear layers mapping a one-hot word to logits for its context.

    ``hidden`` projects a ``vocab_size`` one-hot vector to ``embedding_dim``
    values; ``output`` maps those to ``vocab_size * context_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        super().__init__()
        # Layers are created in this order so seeded initialisation is stable.
        self.hidden = nn.Linear(vocab_size, embedding_dim)
        self.output = nn.Linear(embedding_dim, vocab_size * context_size)

    def forward(self, input):
        """One-hot encode ``input`` (token id(s)) and run it through both layers."""
        one_hot = get_one_hot(input)
        embedded = self.hidden(one_hot)
        return self.output(embedded)

In [None]:
# Create the model
model = SkipGram(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    context_size=CONTEXT_SIZE,
)

# Define loss function and optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Training
# The model returns one flat vector of CONTEXT_SIZE * vocab_size logits.
# Reshape it to (CONTEXT_SIZE, vocab_size) and pass the context token ids as
# class-index targets, so CrossEntropyLoss applies a separate softmax to each
# context position. Passing the flat logits with a flattened multi-hot float
# target would instead be read as ONE distribution over all outputs, with a
# "probability" target that sums to CONTEXT_SIZE rather than 1.
for epoch in range(100):
    # `center` / `context` rather than `input` / `label`: a loop variable named
    # `input` would shadow the builtin for the rest of the notebook.
    for center, context in dataset:
        logits = model(center).view(CONTEXT_SIZE, vocab_size)
        targets = torch.tensor(context)
        loss = loss_function(logits, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Testing
input_text = "will"
correct_output = "One day AI"  # expected context words (not checked programmatically)

with torch.no_grad():
    prediction = model(tokenizer.encode(input_text))
    # One row of vocab_size logits per context position, instead of slice
    # bounds hard-coded for vocab_size == 16 and CONTEXT_SIZE == 3.
    per_position = prediction.view(CONTEXT_SIZE, vocab_size)
    for token_id in per_position.argmax(dim=1).tolist():
        print(token_id, tokenizer.reverse_mapping[token_id])

8 One
9 day
5 AI


In [None]:
# Embedding of a word
# nn.Linear keeps its weight as (out_features, in_features), here
# (EMBEDDING_DIM, vocab_size), so column `token` holds the weights that a
# one-hot input for that word selects (the layer's bias is not included).
token = tokenizer.mapping["will"]
word_embedding = model.hidden.weight[:, token]
print(word_embedding)

tensor([ 0.5661,  0.6565,  0.6039,  0.5392, -0.2362,  0.5543,  0.7447,  0.7718,
        -0.6273, -0.6929], grad_fn=<SelectBackward0>)
