In [27]:
from collections.abc import Callable

def read_dataset(filename: str, visualize: bool = False) -> str:
    ''' Read a text file and return the contents as a string.

        The file is decoded explicitly as UTF-8 so the result does not depend
        on the default locale encoding of the machine running the code.

        Args:
            filename - Path of the text file.
            visualize - Whether to print a short summary of the file contents
                (path, first 100 characters and total length).

        Returns:
            Content of the file as a string.

        Raises:
            OSError - If the file cannot be opened or read.
            UnicodeDecodeError - If the file is not valid UTF-8.
    '''
    with open(filename, 'r', encoding='utf-8') as reader:
        data = reader.read()

    if visualize:
        print(f'Visualizing dataset at path {filename}.')
        print(f'First 100 characters:\n{data[0:100]}.')
        print(f'Length: {len(data)}.')
    return data
        
def create_vocabulary(
    data: str, visualize: bool = False
) -> tuple[
    list[str],
    dict[str, int],
    dict[int, str],
    Callable[[str], list[int]],
    Callable[[list[int]], str],
]:
    ''' Build a character-level vocabulary and matching encoder/decoder.

        Every distinct character of `data` becomes one token. Tokens are
        sorted, and the index of a token is its position in that sorted list.

        Args:
            data - Text from which the vocabulary is built.
            visualize - Whether to print the vocabulary and both lookup maps.

        Returns:
            A tuple (vocabulary, token_to_index_map, index_to_token_map,
            encoder, decoder) where `encoder` maps a string to a list of token
            indices and `decoder` maps a list of token indices back to a string.
    '''
    # sorted() accepts any iterable, so the set needs no intermediate list.
    vocabulary = sorted(set(data))
    token_to_index_map = {token: index for index, token in enumerate(vocabulary)}
    index_to_token_map = dict(enumerate(vocabulary))

    if visualize:
        print('Visualizing vocabulary.')
        print(f'Length of vocabulary: {len(vocabulary)}.')
        print(f'Vocabulary is {"".join(vocabulary)}.')
        print(f'Token to index map sorted is {token_to_index_map}')
        print(f'Index to token map sorted is {index_to_token_map}')

    def encoder(input: str) -> list[int]:
        ''' Encodes the input string.

            Args:
                input: string of text to be encoded.

            Returns:
                List of indices of the tokens in the input string.

            Raises:
                KeyError: if the input contains a character that is not in
                    the vocabulary.
        '''
        return [token_to_index_map[token] for token in input]

    def decoder(input: list[int]) -> str:
        ''' Decodes the input token index into text.

            Args:
                input: List of indices of tokens in the text to be decoded.

            Returns:
                String corresponding to the decoded text.

            Raises:
                KeyError: if an index is not present in the vocabulary.
        '''
        return ''.join(index_to_token_map[index] for index in input)

    return (vocabulary, token_to_index_map, index_to_token_map, encoder, decoder)
    
# Load the corpus and print a short summary of it.
filename = 'data/tinyshakespeare.txt'
data = read_dataset(filename, visualize=True)

# Build the character-level vocabulary together with its encoder/decoder pair.
vocabulary, token_to_index_map, index_to_token_map, encoder, decoder = create_vocabulary(
    data, visualize=True
)

# Round-trip a sample sentence through the tokenizer to show it is lossless.
print('Tokenizer example.')
input_text = 'Hello, how are you?'
print(f'Index is {token_to_index_map["H"]}.')
tokenized_text = encoder(input_text)
decoded_text = decoder(tokenized_text)
print(f'{input_text=}, {tokenized_text=}, {decoded_text=}.')


Visualizing dataset at path data/tinyshakespeare.txt.
First 100 characters:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You.
Length: 1115394.
Visualizing vocabulary.
Length of vocabulary: 65.
Vocabulary is 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.
Token to index map sorted is {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
Index to token map sorted is {0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&