### Creating tokens

In [5]:
import re
class VocabularyCreator:
    """Build a token -> integer-id vocabulary from a UTF-8 text file.

    The text is split on punctuation, double dashes and whitespace. The
    distinct non-empty pieces are sorted and numbered from 0, and the two
    special tokens ``UNKNOWN`` and ``END_OF_TEXT`` receive the last two ids.
    """

    # The capture group makes re.split() keep the delimiters, so punctuation
    # marks become tokens of their own; whitespace pieces are dropped later.
    SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    END_OF_TEXT = '<|endoftext|>'
    UNKNOWN = '<|unk|>'

    def __init__(self, filename):
        """Store the path of the source text; the file is read lazily.

        Args:
            filename: Path of the text file the vocabulary is built from.
        """
        self.filename = filename

    def __read_content__(self):
        """Return the complete file content as one string (decoded as UTF-8)."""
        with open(self.filename, 'r', encoding='utf-8') as f:
            return f.read()

    def create_vocabulary(self):
        """Tokenize the file and map every distinct token to a unique id.

        Returns:
            dict: token -> id, with ids covering ``0 .. len(vocabulary) - 1``
            exactly once. Regular tokens come first in sorted order, followed
            by ``UNKNOWN`` and then ``END_OF_TEXT``.
        """
        content = self.__read_content__()
        special_tokens = (self.UNKNOWN, self.END_OF_TEXT)

        # Strip every piece and drop the ones that were only whitespace/empty.
        stripped = (piece.strip() for piece in self.SPLIT_PATTERN.split(content))
        distinct = {piece for piece in stripped if piece}

        # A corpus that already contains a special token must not list it
        # twice: the duplicate would leave an unused id in the middle of the
        # vocabulary because the later entry overrides the earlier one.
        distinct.difference_update(special_tokens)

        ordered = sorted(distinct)
        ordered.extend(special_tokens)  # Append the unknown and end-of-text tokens
        return {token: idx for idx, token in enumerate(ordered)}


In [6]:
class TokenizerV1:
    """Word/punctuation tokenizer backed by a token -> id dictionary.

    ``encode`` turns text into a list of ids and ``decode`` turns ids back
    into text, joining tokens with single spaces and re-attaching punctuation
    to the preceding token.
    """

    # Splits on punctuation, double dashes and whitespace; the capture group
    # keeps the delimiters so punctuation marks become tokens themselves.
    ENCODE_SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    # Matches the whitespace that ' '.join() puts in front of a punctuation
    # token. ':' and ';' are listed as well because encode() splits on them;
    # without them "a: b" would decode as "a : b".
    DECODE_SUB_PATTERN = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, volcalOrFilename):
        """Create the tokenizer from a ready vocabulary or from a text file.

        Args:
            volcalOrFilename: Either a ``dict`` mapping token -> id, used
                as-is, or a ``str`` path that is handed to
                ``VocabularyCreator`` to build the vocabulary.

        Raises:
            ValueError: If the argument is neither a ``str`` nor a ``dict``.
        """
        if isinstance(volcalOrFilename, str):
            creator = VocabularyCreator(volcalOrFilename)
            self.vocabulary = creator.create_vocabulary()
        elif isinstance(volcalOrFilename, dict):
            self.vocabulary = volcalOrFilename
        else:
            raise ValueError("Vocabulary must be a filename or a dictionary.")
        # Inverse mapping (id -> token) used by decode().
        self.reverse_vocabulary = {idx: token for token, idx in self.vocabulary.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Tokens missing from the vocabulary are looked up as
        ``VocabularyCreator.UNKNOWN``; if that entry is missing too (possible
        with a caller-supplied dictionary) the id ``-1`` is emitted.
        """
        ids = []
        for piece in self.ENCODE_SPLIT_PATTERN.split(text):
            token = piece.strip()
            if not token:
                continue  # whitespace-only or empty piece produced by the split
            key = token if token in self.vocabulary else VocabularyCreator.UNKNOWN
            ids.append(self.vocabulary.get(key, -1))
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into text.

        Ids that are not in the vocabulary contribute an empty string. Tokens
        are joined with single spaces, then the space in front of each
        punctuation token is removed.
        """
        text = ' '.join(self.reverse_vocabulary.get(idx, '') for idx in ids)
        return self.DECODE_SUB_PATTERN.sub(r'\1', text)
    

In [7]:
class TokenizerTeset:
    """Small harness that round-trips sample text through a ``TokenizerV1``."""

    def __init__(self, filename):
        """Create the tokenizer under test from ``filename``."""
        self.tokenizer = TokenizerV1(filename)

    def test_tokenizer(self, test_texts):
        """Encode and decode the input, then print all three stages.

        Args:
            test_texts: A single string, or an iterable of strings that is
                first concatenated with '<|endoftext|> ' between the items.
        """
        if isinstance(test_texts, str):
            sample = test_texts
        else:
            sample = '<|endoftext|> '.join(test_texts)

        token_ids = self.tokenizer.encode(sample)
        round_trip = self.tokenizer.decode(token_ids)

        print(f"Original text: {sample}")
        print(f"Encoded tokens: {token_ids}")
        print(f"Decoded text: {round_trip}\n\n")

# Build the tokenizer from verdict.txt and round-trip three samples:
# a plain string, a pair of sentences joined with the end-of-text marker,
# and a pair of sentences that includes a name not used by the other samples.
tester = TokenizerTeset('verdict.txt')
samples = (
    """It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.""",
    [  "It's the last he painted, you know,","Mrs. Gisburn said with pardonable pride."],
    [ 'Dawud says Hello',  'In the sunlit terraces of the palace.'],
)
for sample in samples:
    tester.test_tokenizer(sample)

Original text: It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Original text: It's the last he painted, you know,<|endoftext|> Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1131, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know, <|endoftext|> Mrs. Gisburn said with pardonable pride.


Original text: Dawud says Hello<|endoftext|> In the sunlit terraces of the palace.
Encoded tokens: [1130, 858, 1130, 55, 988, 956, 984, 722, 988, 1130, 7]
Decoded text: <|unk|> says <|unk|> In the sunlit terraces of the <|unk|>.


