In [1]:
import os
import pickle
import numpy as np

In [2]:
def save_pickle(pickle_path: str, data_to_save):
    with open(pickle_path, 'wb') as pickle_handle:
        pickle.dump(data_to_save, pickle_handle, protocol=-1)

def load_pickle(pickle_path: str):
    if not os.path.exists(pickle_path):
        return None
    
    with open(pickle_path, 'rb') as pickle_handle:
        data = pickle.load(pickle_handle)
        return data

In [6]:
full_data_path = os.path.join('..', 'data', 'post-ocr-correction', 'full')
vocabulary_path = os.path.join('..', 'data', 'post-ocr-correction', 'pickles', 'vocabulary.pickle')

vocabulary_data = load_pickle(vocabulary_path)
if not vocabulary_data:
    data_characters = set([])
    all_files_count = len(os.listdir(full_data_path))
    for i, filename in enumerate(os.listdir(full_data_path)):
        print(f'{i}/{all_files_count}               \r', end='')
        file_path = os.path.join(full_data_path, filename)
        with open(file_path, 'r', encoding='utf-8') as data_file:
            file_characters = set(data_file.read())
            data_characters = data_characters.union(file_characters)

    data_characters = list(data_characters)
    data_characters.insert(0, '[PAD]')
    data_characters.insert(1, '[UNK]')
    data_characters.insert(2, '[CLS]')
    data_characters.insert(3, '[EOS]')
    
    # use enumeration to give the characters integer values
    int2char = dict(enumerate(data_characters))

    # create the look up dictionary from characters to the assigned integers
    char2int = {char: index for index, char in int2char.items()}
    
    vocabulary_data = {
        'characters-set': data_characters,
        'int2char': int2char,
        'char2int': char2int
    }
    
    save_pickle(vocabulary_path, vocabulary_data)

In [7]:
print(vocabulary_data['int2char'])

{0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[EOS]', 4: '“', 5: '¿', 6: 'Ù', 7: '`', 8: 'Q', 9: '!', 10: '″', 11: 'ẽ', 12: 'О', 13: 'ſ', 14: 'x', 15: '\ue781', 16: '₤', 17: '¡', 18: '©', 19: '✓', 20: 'ō', 21: 'Y', 22: '-', 23: '€', 24: 'ɩ', 25: '<', 26: '\\', 27: 'B', 28: 'y', 29: '▲', 30: '"', 31: '@', 32: '\ueba6', 33: '*', 34: '1', 35: 'û', 36: 'õ', 37: 'u', 38: '±', 39: 'J', 40: 'Λ', 41: '$', 42: 'Ú', 43: 'ﬄ', 44: 'Î', 45: 'b', 46: 'Ñ', 47: 'ꝰ', 48: 'N', 49: '℔', 50: '°', 51: 'М', 52: 'Ç', 53: 'é', 54: '‐', 55: 'V', 56: ';', 57: '¶', 58: 'о', 59: 'κ', 60: 'A', 61: '—', 62: '«', 63: '¦', 64: 'C', 65: '3', 66: '»', 67: 'Z', 68: ')', 69: 'L', 70: 'ﬁ', 71: 'M', 72: 'T', 73: '®', 74: '?', 75: '+', 76: '>', 77: '}', 78: 'S', 79: '¼', 80: 'γ', 81: 'η', 82: 'χ', 83: '·', 84: 'r', 85: '′', 86: '⅞', 87: '◊', 88: 'a', 89: 'ē', 90: 'á', 91: '5', 92: 'g', 93: 'â', 94: '(', 95: 'È', 96: 'ß', 97: ':', 98: 'Φ', 99: '\uf219', 100: 'φ', 101: 'ﬂ', 102: 'G', 103: 'Е', 104: '{', 105: '…', 106: '&', 107: '\