In [2]:
import os
import pickle
import numpy as np

In [3]:
def save_pickle(pickle_path: str, data_to_save):
    with open(pickle_path, 'wb') as pickle_handle:
        pickle.dump(data_to_save, pickle_handle, protocol=-1)

def load_pickle(pickle_path: str):
    if not os.path.exists(pickle_path):
        return None
    
    with open(pickle_path, 'rb') as pickle_handle:
        data = pickle.load(pickle_handle)
        return data

In [36]:
full_data_path = os.path.join('..', 'data', 'ocr', 'full')
vocabulary_path = os.path.join('..', 'data', 'ocr', 'pickles', 'vocabulary.pickle')

vocabulary_data = load_pickle(vocabulary_path)
if not vocabulary_data:
    data_characters = set([])
    all_files_count = len(os.listdir(full_data_path))
    for i, filename in enumerate(os.listdir(full_data_path)):
        print(f'{i}/{all_files_count}               \r', end='')
        file_path = os.path.join(full_data_path, filename)
        with open(file_path, 'r', encoding='utf-8') as data_file:
            file_characters = set(data_file.read())
            data_characters = data_characters.union(file_characters)

    data_characters = list(data_characters)
    data_characters.insert(0, '[PAD]')
    data_characters.insert(1, '[UNK]')
    data_characters.insert(2, '[CLS]')
    data_characters.insert(3, '[EOS]')
    
    # use enumeration to give the characters integer values
    int2char = dict(enumerate(data_characters))

    # create the look up dictionary from characters to the assigned integers
    char2int = {char: index for index, char in int2char.items()}
    
    vocabulary_data = {
        'characters-set': data_characters,
        'int2char': int2char,
        'char2int': char2int
    }
    
    save_pickle(vocabulary_path, vocabulary_data)

In [37]:
print(vocabulary_data['int2char'])

{0: '[PAD]', 1: '[UNK]', 2: '[CLS]', 3: '[EOS]', 4: '.', 5: '½', 6: 'ε', 7: 'Λ', 8: '(', 9: 'Í', 10: '8', 11: 'J', 12: 'λ', 13: '—', 14: 'ē', 15: '5', 16: 'V', 17: 'Œ', 18: 'À', 19: 'h', 20: '†', 21: 'q', 22: "'", 23: 'В', 24: 'á', 25: 'ì', 26: '`', 27: 'È', 28: '¾', 29: '¬', 30: 'W', 31: 'y', 32: 'Φ', 33: '»', 34: 'Ñ', 35: 't', 36: 'ꝰ', 37: '°', 38: 'έ', 39: ',', 40: '®', 41: '►', 42: 'n', 43: 'η', 44: '’', 45: 'Ë', 46: '˚', 47: '✓', 48: 'κ', 49: 'X', 50: '⅛', 51: 'Ä', 52: '\uefa1', 53: 'õ', 54: 'Z', 55: '\ueada', 56: 'ò', 57: 'ó', 58: '\\', 59: '\uf52d', 60: ']', 61: '~', 62: 'Ö', 63: 'v', 64: 'ô', 65: 'p', 66: '&', 67: '▼', 68: 'Ê', 69: ':', 70: 'ç', 71: '́', 72: 'а', 73: 'ñ', 74: 'ﬄ', 75: '3', 76: '0', 77: ' ', 78: 'æ', 79: 'L', 80: 'Х', 81: 'ι', 82: 'R', 83: '«', 84: 'γ', 85: '¦', 86: 'P', 87: 'r', 88: '{', 89: 'b', 90: 'Q', 91: 'ﬂ', 92: 'é', 93: 'χ', 94: '■', 95: '9', 96: 'ū', 97: '\uf51c', 98: 'd', 99: 'N', 100: 'ā', 101: 'I', 102: '*', 103: 'Р', 104: '℔', 105: '‘', 106: 'ﬁ', 10