<a href="https://colab.research.google.com/github/hissain/mlworks/blob/main/codes/RNN_Converting_Raw_Text_To_Sequential_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install torchtext torchdata portalocker>=2.0.0

In [None]:
# Importing Libraries
import collections
import random
import re
import torch
from tabulate import tabulate

In [None]:
# Print dictionary in table
def print_in_table(dictionary):
    """Print ``dictionary`` as a two-column text table.

    The first column (header "keys") lists the dictionary's keys and the
    second column (header "values") lists the matching values, rendered
    with tabulate's "outline" format.

    Args:
        dictionary: Any mapping exposing ``keys()`` and ``values()``.

    Returns:
        None. The table is written to standard output.
    """
    # Column-oriented input: header name -> column contents.
    columns = {
        "keys": dictionary.keys(),
        "values": dictionary.values(),
    }
    rendered = tabulate(columns, headers="keys", tablefmt="outline")
    print(rendered)

In [None]:
# Sample text used by the tokenizer demos below.
# The full stops are already separated by spaces, so a plain whitespace
# split yields "." as a token of its own.
text = "We are living in an AI era . One day AI will take all the Human jobs ."

In [None]:
# character level tokenization
class CharTokenizer:
    """Character-level tokenizer built on Unicode code points.

    Every character is represented by the integer ``ord`` returns for it,
    so no vocabulary has to be learned and ``decode`` inverts ``encode``.
    """

    def __init__(self):
        # Illustrative hand-written table only; encode/decode do not read it.
        self.mapping = {
            'a': 1,
            'b': 2,
            # ....
        }

    def encode(self, text):
        """Return the list of code points, one per character of ``text``."""
        return list(map(ord, text))

    def decode(self, tokens):
        """Return the string whose characters have the code points in ``tokens``."""
        return "".join(map(chr, tokens))



In [None]:
# Encode the text to tokens, then decode the tokens back to text
tokenizer = CharTokenizer()
# Each character becomes its Unicode code point (encode uses ord()).
tokens = tokenizer.encode(text)
print(tokens)
# decode applies chr() to every token, which reverses ord().
decoded_text = tokenizer.decode(tokens)
print(decoded_text)

[87, 101, 32, 97, 114, 101, 32, 108, 105, 118, 105, 110, 103, 32, 105, 110, 32, 97, 110, 32, 65, 73, 32, 101, 114, 97, 32, 46, 32, 79, 110, 101, 32, 100, 97, 121, 32, 65, 73, 32, 119, 105, 108, 108, 32, 116, 97, 107, 101, 32, 97, 108, 108, 32, 116, 104, 101, 32, 72, 117, 109, 97, 110, 32, 106, 111, 98, 115, 32, 46]
We are living in an AI era . One day AI will take all the Human jobs .


In [None]:
# word level tokenization
class Tokenizer:
    """Whitespace word tokenizer that grows its vocabulary while encoding.

    Ids are assigned in order of first appearance, starting at 0.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # word -> integer id
        self.mapping = {}
        # integer id -> word (inverse of ``mapping``)
        self.reverse_mapping = {}

    def encode(self, text):
        """Split ``text`` on whitespace and return one id per word.

        Words not seen before are added to both lookup tables with the
        next free id (the current vocabulary size).
        """
        ids = []
        for piece in text.split():
            idx = self.mapping.get(piece)
            if idx is None:
                # First occurrence: register the word under the next id.
                idx = len(self.mapping)
                self.mapping[piece] = idx
                self.reverse_mapping[idx] = piece
            ids.append(idx)
        return ids

    def decode(self, tokens):
        """Return the words for ``tokens`` joined by single spaces.

        Raises:
            KeyError: if a token id was never produced by ``encode``.
        """
        return " ".join([self.reverse_mapping[tok_id] for tok_id in tokens])


In [None]:
# Encode the text to tokens, then decode the tokens back to text
# Rebinds `tokenizer` (a CharTokenizer until now) to the word-level Tokenizer.
tokenizer = Tokenizer()
# Builds the vocabulary as a side effect: each new word gets the next id.
tokens = tokenizer.encode(text)
print(tokens)
# Words are re-joined with single spaces.
decoded_text = tokenizer.decode(tokens)
print(decoded_text)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 5, 10, 11, 12, 13, 14, 15, 7]
We are living in an AI era . One day AI will take all the Human jobs .


In [None]:
# Show the vocabulary built so far: word -> id first ...
print_in_table(tokenizer.mapping)
print()
# ... then the inverse table, id -> word.
print_in_table(tokenizer.reverse_mapping)

+--------+----------+
| keys   |   values |
| We     |        0 |
| are    |        1 |
| living |        2 |
| in     |        3 |
| an     |        4 |
| AI     |        5 |
| era    |        6 |
| .      |        7 |
| One    |        8 |
| day    |        9 |
| will   |       10 |
| take   |       11 |
| all    |       12 |
| the    |       13 |
| Human  |       14 |
| jobs   |       15 |
+--------+----------+

+--------+----------+
|   keys | values   |
|      0 | We       |
|      1 | are      |
|      2 | living   |
|      3 | in       |
|      4 | an       |
|      5 | AI       |
|      6 | era      |
|      7 | .        |
|      8 | One      |
|      9 | day      |
|     10 | will     |
|     11 | take     |
|     12 | all      |
|     13 | the      |
|     14 | Human    |
|     15 | jobs     |
+--------+----------+


In [None]:
# Loading IMDB dataset
# Source: http://ai.stanford.edu/~amaas/data/sentiment/
from torchtext.datasets import IMDB
# Training split only; it is iterated below as (label, line) pairs.
train_iter = IMDB(split='train')

In [None]:
# Converting all to tokens
# Start from a fresh list so this cell is idempotent: extending the existing
# `tokens` in place would keep whatever an earlier cell stored there and
# would grow the list again on every re-run.
# `tokenizer` is reused deliberately, so its vocabulary keeps the ids it has
# already assigned and only gains entries for unseen words.
tokens = []
for label, line in train_iter:
    # The label is not needed here; only the review text is tokenized.
    tokens.extend(tokenizer.encode(line))

In [None]:
# Vocabulary size: number of distinct whitespace-separated words the
# tokenizer has encoded so far (one `mapping` entry per word).
len(tokenizer.mapping)

280617

In [None]:
# One-hot representation using PyTorch
import torch.nn.functional as F

# Five consecutive class indices: 0, 1, 2, 3, 4.
data = torch.arange(5)
print(data)
# Without num_classes the width is inferred from the largest index (5 columns here).
print(F.one_hot(data))
# An explicit num_classes pads every row out to that many columns.
print(F.one_hot(data, num_classes=10))

tensor([0, 1, 2, 3, 4])
tensor([[1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])
