<a href="https://colab.research.google.com/github/ismoil27/jaydariGPT/blob/main/tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
print('Hello world!') # token
# 1 word => 1 token
#

Hello world!


In [39]:
text = 'Hello world this is a simple example to show how tokenization works in NLP and a sentence' # 17 token
print(text)

Hello world this is a simple example to show how tokenization works in NLP and a sentence


In [40]:
tokens = text.split()
print(tokens)

['Hello', 'world', 'this', 'is', 'a', 'simple', 'example', 'to', 'show', 'how', 'tokenization', 'works', 'in', 'NLP', 'and', 'a', 'sentence']


In [41]:
vocab = {}
current_id = 0

for token in tokens:
  if token not in vocab:
    vocab[token] = current_id
    current_id +=1

print(vocab)

{'Hello': 0, 'world': 1, 'this': 2, 'is': 3, 'a': 4, 'simple': 5, 'example': 6, 'to': 7, 'show': 8, 'how': 9, 'tokenization': 10, 'works': 11, 'in': 12, 'NLP': 13, 'and': 14, 'sentence': 15}


In [42]:
encoded = [vocab[token] for token in tokens]
print(encoded)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 4, 15]


In [43]:
reverse_vocab = {id: token for token, id in vocab.items()}
decoded = " ".join([reverse_vocab[id] for id in encoded])
print(decoded)

Hello world this is a simple example to show how tokenization works in NLP and a sentence


In [44]:
# Embedding vector => dense numerical array | Vector Database
# 0 => [0.41, 0.22, -0.77, 0.56.........]
# 1 => [0.34, 0.72, 0.77, 0.82.........]

In [45]:
import torch
import torch.nn as nn

embedding_dim = 8
vocab_size = len(vocab)

embedding_layer = nn.Embedding(vocab_size, embedding_dim)

input_ids = torch.tensor([encoded])
embeddings = embedding_layer(input_ids)


print(embeddings)
print(embeddings.shape)


tensor([[[-1.1258,  0.2460,  1.3143, -1.2784, -1.4513,  0.7249, -0.4719,
           0.6487],
         [ 0.6209,  1.4133,  1.0931,  0.2682, -0.0119,  2.3800, -0.8791,
           1.9438],
         [ 0.1230, -0.1511,  0.7573,  1.1208,  0.1305, -1.7984,  0.9725,
          -0.3203],
         [-0.8356,  0.8769, -0.4962, -0.9548, -1.9509,  0.9479, -1.5003,
          -0.8702],
         [-1.4420,  1.4476,  1.5378,  1.1561,  1.4788,  1.6431,  0.6851,
          -1.2843],
         [-0.2397, -0.9280, -0.8575, -0.4059, -0.8115, -1.1991, -1.0120,
          -0.1601],
         [-0.3742,  0.7698, -0.6001,  0.9918, -0.8302, -0.7771, -1.0741,
           0.1270],
         [-0.5015, -0.7514, -0.9778,  1.1783, -0.3227,  0.5824,  0.5978,
          -0.3365],
         [ 0.7993,  1.5874,  0.1401, -0.0121, -0.6334, -0.0721,  0.4345,
           0.6102],
         [-0.6092,  0.5344, -2.0814, -1.1128,  0.5209,  0.4883,  1.1735,
          -1.2967],
         [ 1.7202, -1.4463,  1.6445, -2.3525, -0.5788, -1.7072, -0.573

In [46]:
!pip install transformers



In [47]:
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [48]:
print('Vocab size:', tokenizer.vocab_size)
print('Special tokens:', tokenizer.special_tokens_map)

Vocab size: 32000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}


In [49]:
text = "Hello how are you doing?"
print('text:', text)

text: Hello how are you doing?


In [50]:
tokens = tokenizer.tokenize(text)
print('tokens:', tokens)

tokens: ['▁Hello', '▁how', '▁are', '▁you', '▁doing', '?']


In [51]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print('token IDs:', ids)

token IDs: [15043, 920, 526, 366, 2599, 29973]


In [52]:
encoded = tokenizer(text)
print('encoded:',encoded)

encoded: {'input_ids': [1, 15043, 920, 526, 366, 2599, 29973], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}


In [53]:
print('Input IDs:', encoded['input_ids'])

Input IDs: [1, 15043, 920, 526, 366, 2599, 29973]


In [54]:
"Hi" #=> [1, 345, 2, 2, 2] // length 5
"How are you?" #=> [1, 45, 657, 234, 23453] // length 5
# 'attention_mask': [1, 1, 0, 0, 0]

'How are you?'

In [55]:
decoded_text = tokenizer.decode(encoded["input_ids"])
print('decoded_text:', decoded_text)

decoded_text: <s> Hello how are you doing?


In [56]:
texts = [
    "Hello, how are you?",
    "Today we are learning about tokenizers.",
    "TinyLlama is a small but powerful language model."
]

batch = tokenizer(texts, padding=True, return_tensors="pt")
print(batch.keys())

print("Input IDs shape:", batch["input_ids"].shape)
print('Attention mask shape:', batch["attention_mask"].shape)

KeysView({'input_ids': tensor([[    1, 15043, 29892,   920,   526,   366, 29973,     2,     2,     2,
             2,     2,     2,     2],
        [    1, 20628,   591,   526,  6509,  1048,  5993, 19427, 29889,     2,
             2,     2,     2,     2],
        [    1,   323,  4901, 29931, 29880,  3304,   338,   263,  2319,   541,
         13988,  4086,  1904, 29889]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])})
Input IDs shape: torch.Size([3, 14])
Attention mask shape: torch.Size([3, 14])


In [57]:
# [    1, 15043, 29892,   920,   526,   366, 29973,     2,     2,     2, 2,     2,     2,     2]
# [    1, 20628,   591,   526,  6509,  1048,  5993, 19427, 29889,     2, 2,     2,     2,     2]
# [    1,   323,  4901, 29931, 29880,  3304,   338,   263,  2319,   541, 13988,  4086,  1904, 29889]

In [58]:
print('special tokens map:', tokenizer.special_tokens_map)

special tokens map: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}


In [59]:
word = "headerandshoulder" # sub word
print("word:", word)
print("Tokens:", tokenizer.tokenize(word))
print("IDs:", tokenizer.encode(word))

word: headerandshoulder
Tokens: ['▁header', 'and', 'should', 'er']
IDs: [1, 4839, 392, 9344, 261]
