### Creating tokens

In [1]:
import re
class Vocabulary:
    """Build a token -> integer-id vocabulary from a UTF-8 text file.

    The text is split on punctuation, double dashes and whitespace; every
    distinct non-empty token gets an id in sorted order, and the two special
    tokens (UNKNOWN, END_OF_TEXT) are appended last.
    """

    # The capturing group makes re.split keep the delimiters as tokens.
    SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    END_OF_TEXT = '<|endoftext|>'
    UNKNOWN = '<|unk|>'

    def __init__(self, filename):
        """Remember the path of the corpus file; nothing is read yet.

        Args:
            filename: path of the text file the vocabulary is built from.
        """
        self.filename = filename

    def __read_content__(self):
        """Return the whole corpus file as one string (decoded as UTF-8)."""
        with open(self.filename, 'r', encoding='utf-8') as f:
            return f.read()

    def create_vocabulary(self):
        """Tokenise the corpus and map every distinct token to an id.

        Returns:
            dict mapping token -> id. Ids are contiguous, starting at 0:
            corpus tokens in sorted order, then UNKNOWN, then END_OF_TEXT.
        """
        content = self.__read_content__()
        stripped = (piece.strip() for piece in self.SPLIT_PATTERN.split(content))
        special_tokens = (self.UNKNOWN, self.END_OF_TEXT)

        # Drop special tokens that already occur in the corpus: otherwise they
        # would be listed twice and the dict would skip one id, leaving a gap.
        corpus_tokens = {piece for piece in stripped if piece} - set(special_tokens)

        unique_tokens = sorted(corpus_tokens)
        unique_tokens.extend(special_tokens)  # unknown + end-of-text get the highest ids
        return {token: idx for idx, token in enumerate(unique_tokens)}


In [2]:
class TokenizerV1:
    """Word-level tokenizer backed by a token -> id dictionary.

    The vocabulary is either supplied directly as a dict or built from a
    text file through ``Vocabulary``.
    """

    ENCODE_SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    DECODE_SUB_PATTERN = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocabulary_or_file_name):
        """Set up the forward and reverse lookup tables.

        Args:
            vocabulary_or_file_name: a ready-made ``{token: id}`` dict, or the
                path (str) of a text file to build the vocabulary from.

        Raises:
            ValueError: if the argument is neither a str nor a dict.
        """
        source = vocabulary_or_file_name
        if not isinstance(source, (str, dict)):
            raise ValueError("Vocabulary must be a filename or a dictionary.")
        if isinstance(source, str):
            source = Vocabulary(source).create_vocabulary()
        self.vocabulary = source

        # id -> token table used by decode().
        self.reverse_vocabulary = {}
        for token, idx in self.vocabulary.items():
            self.reverse_vocabulary[idx] = token

    def encode(self, text):
        """Turn ``text`` into a list of token ids.

        Tokens missing from the vocabulary are replaced by the unknown token;
        if that token is itself missing from the vocabulary the id is -1.
        """
        ids = []
        for piece in self.ENCODE_SPLIT_PATTERN.split(text):
            word = piece.strip()
            if not word:
                continue  # whitespace separators and empty split artefacts
            if word not in self.vocabulary:
                word = Vocabulary.UNKNOWN
            ids.append(self.vocabulary.get(word, -1))
        return ids

    def decode(self, ids):
        """Turn token ids back into text.

        Ids without a vocabulary entry contribute an empty string. Whitespace
        directly before the punctuation listed in DECODE_SUB_PATTERN is removed.
        """
        words = [self.reverse_vocabulary.get(idx, '') for idx in ids]
        joined = ' '.join(words)
        return self.DECODE_SUB_PATTERN.sub(r'\1', joined)
    

In [3]:
class TokenizerTeset:
    """Small helper that prints an encode/decode round trip of TokenizerV1."""

    def __init__(self, filename):
        """Build a TokenizerV1 whose vocabulary comes from ``filename``."""
        self.tokenizer = TokenizerV1(filename)

    def test_tokenizer(self, test_texts):
        """Encode then decode ``test_texts`` and print all three stages.

        Args:
            test_texts: a single string, or an iterable of strings that are
                joined with the end-of-text token (surrounded by spaces).
        """
        if isinstance(test_texts, str):
            test_text = test_texts
        else:
            separator = f' {Vocabulary.END_OF_TEXT} '
            test_text = separator.join(test_texts)

        encoded = self.tokenizer.encode(test_text)
        decoded = self.tokenizer.decode(encoded)

        print(f"Original text: {test_text}")
        print(f"Encoded tokens: {encoded}")
        print(f"Decoded text: {decoded}\n\n")

# Round-trip three inputs through the tokenizer built from verdict.txt:
# one plain string and two lists of strings.
tester = TokenizerTeset('verdict.txt')
sample_inputs = (
    """It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.""",
    ["It's the last he painted, you know,", "Mrs. Gisburn said with pardonable pride."],
    ['Dawud says Hello', 'In the sunlit terraces of the palace.'],
)
for sample in sample_inputs:
    tester.test_tokenizer(sample)

Original text: It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Original text: It's the last he painted, you know, <|endoftext|> Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1131, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know, <|endoftext|> Mrs. Gisburn said with pardonable pride.


Original text: Dawud says Hello <|endoftext|> In the sunlit terraces of the palace.
Encoded tokens: [1130, 858, 1130, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]
Decoded text: <|unk|> says <|unk|> <|endoftext|> In the sunlit terraces of the <|unk|>.




## BYTE PAIR ENCODING (BPE)

In [4]:
# Install the tiktoken package (imported by the next cell) through the shell.
# NOTE(review): no version is pinned here; the recorded run printed 0.9.0.
! pip3 install tiktoken



In [5]:
import tiktoken
print(f"tiktoken version: {tiktoken.__version__}\n")

# Supported encodings - https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
# examples: gpt2, o200k_base, cl100k_base, r50k_base, p50k_base, p50k_edit, r50k_edit
tokenizer = tiktoken.get_encoding("gpt2")

# Sample text for the round trip below.
# NOTE(review): the adjacent string literals are concatenated with no separator,
# so sentences run together (e.g. "models.Tokenization") - confirm this is intended.
text =  ( "Hello, world! This is a test of the tiktoken library. <|endoftext|>"
            "It is designed to tokenize text efficiently for use with OpenAI's GPT models."
            "Tokenization is the process of converting text into tokens, which are the basic units of meaning. <|endoftext|>"
            "This library supports various encodings, including GPT-2 and GPT-3."
            "I'm from SomeunkownPlace, and I love coding!"
)

# Encode the text; '<|endoftext|>' is explicitly allowed as a special token.
tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(f'Encoding:\nNumber of tokens: {len(tokens)}\nTokens: {tokens}\n')

# Decode the ids back to text and check that the round trip is lossless.
decoded_t =  tokenizer.decode(tokens)
print(f'Decoding:\nDecoded text: {decoded_t}\nDecoded text matches original: {decoded_t == text}\n')

tiktoken version: 0.9.0



Encoding:
Number of tokens: 88
Tokens: [15496, 11, 995, 0, 770, 318, 257, 1332, 286, 262, 256, 1134, 30001, 5888, 13, 220, 50256, 1026, 318, 3562, 284, 11241, 1096, 2420, 18306, 329, 779, 351, 4946, 20185, 338, 402, 11571, 4981, 13, 30642, 1634, 318, 262, 1429, 286, 23202, 2420, 656, 16326, 11, 543, 389, 262, 4096, 4991, 286, 3616, 13, 220, 50256, 1212, 5888, 6971, 2972, 2207, 375, 654, 11, 1390, 402, 11571, 12, 17, 290, 402, 11571, 12, 18, 13, 40, 1101, 422, 2773, 2954, 593, 27271, 11, 290, 314, 1842, 19617, 0]

Decoding:
Decoded text: Hello, world! This is a test of the tiktoken library. <|endoftext|>It is designed to tokenize text efficiently for use with OpenAI's GPT models.Tokenization is the process of converting text into tokens, which are the basic units of meaning. <|endoftext|>This library supports various encodings, including GPT-2 and GPT-3.I'm from SomeunkownPlace, and I love coding!
Decoded text matches original: True



## Input-Target data pairs

In [6]:
def read_file_content(filename):
    """Return the full contents of ``filename`` as a string, decoded as UTF-8."""
    with open(filename, encoding='utf-8') as handle:
        return handle.read()

# Encode the whole of verdict.txt with the tokenizer created in the BPE cell.
# NOTE(review): the print below dumps every token id (5145 in the recorded run);
# consider printing only a slice.
enc_token = tokenizer.encode(read_file_content('verdict.txt'), allowed_special={'<|endoftext|>'})
print(f'Encoding verdict.txt:\nNumber of tokens: {len(enc_token)}\nTokens: {enc_token}\n')

context_size = 4 # Number of tokens to use as context
# Shifted-by-one view of the same input/target idea, kept for reference:
# x = enc_token[:context_size]
# y = enc_token[1:context_size+1]
# print(f'x: {x}')
# print(f'y: {y}')

# Show growing contexts (the first i tokens) next to the token that follows each one.
for i in range(1, context_size + 1):
    context = enc_token[:i]
    target = enc_token[i]
    print(f'Context: {tokenizer.decode(context)} ----> Target: {tokenizer.decode([target])}')

Encoding verdict.txt:
Number of tokens: 5145
Tokens: [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606, 536, 5469, 438, 14363, 938, 4842, 1650, 353, 438, 2934, 489, 3255, 465, 48422, 540, 450, 67, 3299, 13, 366, 5189, 1781, 340, 338, 1016, 284, 3758, 262, 1988, 286, 616, 4286, 705, 1014, 510, 26, 475, 314, 836, 470, 892, 286, 326, 11, 1770, 13, 8759, 2763, 438, 1169, 2994, 284, 943, 17034, 318, 477, 314, 892, 286, 526, 383, 1573, 11, 319, 9074, 13, 536, 5469, 338, 11914, 11, 33096, 663, 4808, 3808, 62, 355, 996

In [7]:
# Install torch, torchvision and torchaudio from the PyTorch CPU wheel index.
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu


In [8]:
from torch.utils.data import Dataset, DataLoader
import torch

class GPTDatasetv1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Each target window is the matching input window shifted one token to
    the right.
    """

    def __init__(self, text, tokenizer, max_lenth, stride):
        """Tokenise ``text`` and cut it into overlapping windows.

        Args:
            text: raw text to tokenise.
            tokenizer: object whose ``encode(text, allowed_special=...)``
                returns a sequence of token ids.
            max_lenth: number of tokens in every window.
            stride: distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})

        # Stop early enough that every target window is full-length.
        window_starts = range(0, len(token_ids) - max_lenth, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_lenth])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_lenth + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) tensor pair."""
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader(text,
                      batch_size=4,
                      max_length=25,
                      stride=128,
                      shuffle=True,
                      drop_last=True,
                      num_workers=2):
    """Wrap ``text`` in a GPTDatasetv1 and return a DataLoader over it.

    The text is tokenised with tiktoken's "gpt2" encoding.

    Args:
        text: raw text to tokenise.
        batch_size: passed through to DataLoader.
        max_length: window length handed to GPTDatasetv1.
        stride: distance between window starts handed to GPTDatasetv1.
        shuffle: passed through to DataLoader.
        drop_last: passed through to DataLoader.
        num_workers: passed through to DataLoader.

    Returns:
        A DataLoader over the (input, target) windows.

    NOTE(review): with the defaults, stride (128) is larger than max_length
    (25), so consecutive windows do not cover the tokens in between - confirm
    that this is intended.
    """
    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'drop_last': drop_last,
        'num_workers': num_workers,
    }
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetv1(text, gpt2_tokenizer, max_length, stride)
    return DataLoader(windows, **loader_options)
         

In [9]:
import torch
# Window length 4, stride 1, one window per batch, original order.
raw_text = read_file_content('verdict.txt')
dataloader = create_dataloader(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Pull the first two batches to see how consecutive windows overlap.
data_iter = iter(dataloader)
first_batch, second_batch = next(data_iter), next(data_iter)
print(f'first batch: {first_batch}')
print(f'second batch: {second_batch}')

first batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
second batch: [tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


# Token embedding

In [31]:
# Install gensim, then reinstall NumPy at a pinned version.
# NOTE(review): presumably the pin is needed because the installed gensim build
# does not work with a newer NumPy - confirm before changing the version.
!pip3 install gensim

# First, uninstall any existing NumPy version
!pip3 uninstall numpy -y # The -y flag will automatically confirm the uninstall

# Then, install a specific compatible version of NumPy (e.g., 1.26.4)
!pip3 install numpy==1.26.4

# After running this cell, you MUST restart the kernel so the new NumPy is loaded.
# In Jupyter, go to Kernel -> Restart.
# In Google Colab, you can sometimes just run the next cell, but a restart is safer.
print("NumPy downgrade complete. Please restart your kernel now (Kernel -> Restart).")

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4
NumPy downgrade complete. Please restart your kernel now (Kernel -> Restart).


In [None]:
import numpy as np
# Print the NumPy version before importing gensim, so it is visible even if
# the gensim import fails.
print(f'Numpy version: {np.__version__}\n')
import gensim.downloader as api

# Load the pre-trained word2vec vectors through gensim's downloader.
model = api.load("word2vec-google-news-300") # https://huggingface.co/small-text/word2vec-google-news-300
word_vector = model

# Double quotes outside, single quotes inside: reusing the same quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
print(f"word vector for computer: {word_vector['computer']}")


Numpy version: 1.26.4

word vector for computer: [ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777

In [3]:
# Shape of a single embedding vector; the recorded output is (300,).
# NOTE(review): the key 'cta' may be a typo for 'cat' - confirm.
print(word_vector['cta'].shape)

(300,)


In [4]:
# Word analogy query: the 5 most similar words, with 'king' and 'woman' as
# positive terms and 'man' as the negative term.
print(word_vector.most_similar(positive=['king', 'woman'], negative=['man'], topn=5))

[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]


In [5]:
# Similarity score for a handful of word pairs, one line per pair.
word_pairs = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('computer', 'laptop'),
    ('boy', 'girl'),
    ('nephew', 'niece'),
    ('paper', 'water'),
]
for p in word_pairs:
    print(f"Similarity between {p[0]} and {p[1]}: {word_vector.similarity(p[0], p[1])}")

Similarity between woman and man: 0.7664012312889099
Similarity between king and queen: 0.6510956883430481
Similarity between computer and laptop: 0.6640492677688599
Similarity between boy and girl: 0.8543272018432617
Similarity between nephew and niece: 0.7594367265701294
Similarity between paper and water: 0.11408083885908127
