In [8]:
import unicodedata
import re

In [9]:
# normalize
def normalize_text(text):
    """Normalize text into a lowercase, punctuation-separated, "▁"-delimited string.

    Steps:
      1. Apply Unicode NFKC normalization and lowercase the result.
      2. Surround each of the punctuation marks , . ! ? ; : with spaces so
         every mark becomes its own whitespace-delimited token.
      3. Collapse every run of whitespace into one space and drop leading and
         trailing whitespace.
      4. Replace each remaining space with the word-boundary marker "▁".

    Example: "Hello, world!" -> "hello▁,▁world▁!"

    Args:
        text: The string to normalize.

    Returns:
        The normalized string. Empty or whitespace-only input yields "".
    """
    normalized = unicodedata.normalize('NFKC', text).lower()
    # r'...' is a raw string, so backslashes reach the regex engine untouched.
    # ([,.!?;:]) is a capturing group around a character class matching one
    # punctuation mark; \1 in the replacement is whatever that group matched,
    # so r' \1 ' re-inserts the mark wrapped in spaces (e.g. "," -> " , ").
    normalized = re.sub(r'([,.!?;:])', r' \1 ', normalized)
    # The padding above leaves a space after sentence-final punctuation; strip
    # it so the result never starts or ends with a stray "▁".
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    # Mark word boundaries with the special token "▁".
    return normalized.replace(" ", "▁")

In [10]:
# split text into sentences
def split_into_sentences(text):
    """Split text into sentences at whitespace that follows ".", "!" or "?".

    The terminating punctuation stays attached to its sentence because the
    split point is matched with a lookbehind rather than consumed.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings, in order. Empty or
        whitespace-only input yields an empty list.
    """
    # Trim first: text that ends with a newline or space after the final
    # punctuation mark would otherwise produce an empty trailing "sentence".
    trimmed = text.strip()
    if not trimmed:
        return []
    return re.split(r'(?<=[.!?])\s+', trimmed)

In [11]:
# extract unique characters
def extract_unique_characters(normalized_file):
    """Return the distinct characters found in a UTF-8 text file, sorted.

    Each line has its leading and trailing whitespace (including the newline)
    removed before its characters are collected.

    Args:
        normalized_file: Path of the file to scan.

    Returns:
        A sorted list of single-character strings; empty if the file has no
        non-whitespace content.
    """
    with open(normalized_file, "r", encoding="utf-8") as handle:
        seen = {ch for row in handle for ch in row.strip()}
    return sorted(seen)

In [12]:
def prepare_and_extract(input_file, output_file):
    """Normalize a text file sentence by sentence and build a character vocabulary.

    Reads ``input_file`` (UTF-8), splits it with ``split_into_sentences``,
    normalizes each sentence with ``normalize_text`` and writes the results to
    ``output_file`` (UTF-8, overwritten), one sentence per line. The vocabulary
    is then built from the characters present in ``output_file``.

    Args:
        input_file: Path of the raw text file to read.
        output_file: Path of the normalized file to write.

    Returns:
        A dict mapping each token to its integer index. The special tokens
        "<unk>", "<s>", "</s>", "<pad>" take indices 0-3, followed by the
        sorted characters found in the normalized file. The mapping is also
        printed.
    """
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        for sentence in split_into_sentences(infile.read()):
            normalized_sentence = normalize_text(sentence.strip())
            # Skip empty sentences (e.g. from trailing whitespace in the
            # input) so the output file contains no blank lines.
            if not normalized_sentence:
                continue
            outfile.write(normalized_sentence + '\n')

    # The output file is closed (and flushed) before it is read back here.
    vocab_chars = extract_unique_characters(output_file)

    # Special tokens come first so their indices are fixed regardless of the
    # characters in the corpus.
    special_tokens = ["<unk>", "<s>", "</s>", "<pad>"]
    final_vocab = special_tokens + vocab_chars

    token2idx = {token: idx for idx, token in enumerate(final_vocab)}
    print(token2idx)
    return token2idx

In [15]:
# Raw corpus to read, and destination for the normalized text
# (one sentence per line). Both paths are relative to the working directory.
input_file = 'viterbi.txt'
output_file = 'output.txt' 

# Writes output_file, prints the token -> index mapping, and binds the
# returned mapping to token_dictionary.
token_dictionary = prepare_and_extract(input_file, output_file)


{'<unk>': 0, '<s>': 1, '</s>': 2, '<pad>': 3, '!': 4, ',': 5, '-': 6, '.': 7, ';': 8, '?': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'q': 26, 'r': 27, 's': 28, 't': 29, 'u': 30, 'v': 31, 'w': 32, 'x': 33, 'y': 34, 'z': 35, '“': 36, '”': 37, '▁': 38}
