In [7]:
from tensorflow.keras.layers import StringLookup
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf
# Best-effort: ask TensorFlow to grow GPU memory on demand instead of
# reserving it all up front. Failure here must not stop the notebook, but it
# should be visible rather than silently swallowed.
try:
    for gpu in tf.config.list_physical_devices("GPU"):
        tf.config.experimental.set_memory_growth(gpu, True)
except (RuntimeError, ValueError) as err:
    # RuntimeError: the runtime was already initialized (e.g. cell re-run);
    # ValueError: the device could not be configured.
    print(f"Could not enable GPU memory growth: {err}")

# Seed NumPy and TensorFlow so shuffling and any TF randomness are repeatable.
np.random.seed(42)
tf.random.set_seed(42)

## I. Dataset Splitting

In [15]:
base_path = "Datasets/IAM_Words/"

# Read every annotation line; lines starting with '#' are header comments.
with open(os.path.join(base_path, "words.txt"), "r") as f:
    words = f.readlines()

# Keep only data lines whose second space-separated field (the status flag,
# 'ok' in the sample output below) is not 'err'.
words_list = []
for line in words:
    if line.startswith("#"):
        continue
    fields = line.split(" ")
    if len(fields) < 2:
        continue  # blank or malformed line: nothing to classify
    if fields[1] != "err":
        words_list.append(line)

print(f"len(words_list) = {len(words_list)}")
# In-place shuffle; reproducible as long as the NumPy seed was set beforehand.
np.random.shuffle(words_list)
print(f"words_list[0:10]: {words_list[0:10]}")

len(words_list) = 96456
words_list[0:10]: ['r06-076-07-06 ok 177 1807 2010 76 53 CC or\n', 'n01-004-01-01 ok 180 614 906 246 69 JJ unable\n', 'g06-011f-00-03 ok 203 778 721 46 70 INO of\n', 'f04-011-07-01 ok 145 504 1976 118 78 BEDZ was\n', 'e04-103-01-01 ok 174 471 916 205 123 VB plank\n', 'g06-047g-04-05 ok 182 924 1430 193 67 NP Europe\n', 'm06-056-04-11 ok 158 2061 1537 11 21 , ,\n', 'j06-026-03-04 ok 185 1593 1416 341 129 NN sunlight\n', 'm06-019-01-12 ok 189 1837 949 142 50 CD three\n', 'a04-043-02-05 ok 186 1906 1113 59 68 INO of\n']


### Train-validation-test split (90:5:5 ratio)

In [31]:
# First 90% of words_list is the training set; the tail is held out.
split_idx = int(0.9 * len(words_list))
train_samples, remaining = words_list[:split_idx], words_list[split_idx:]

# Cut the held-out tail in half: validation first, test second
# (test receives the extra sample when the tail has an odd length).
half_idx = int(0.5 * len(remaining))
validation_samples, test_samples = remaining[:half_idx], remaining[half_idx:]

# Sanity check: the three subsets together account for every sample.
assert len(words_list) == (
    len(train_samples) + len(validation_samples) + len(test_samples)
)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


## II. Data Input Pipeline

Start by building the image path for every sample, keeping only the samples whose image file is non-empty.

In [32]:
# Root directory of the word images: the "words" folder under base_path.
base_image_path = os.path.join(base_path, "words")
print(base_image_path)

def get_image_paths_and_labels(samples, image_root=None):
    """Resolve the image file of each annotation line and drop unusable samples.

    Args:
        samples: Iterable of raw annotation lines. The first space-separated
            field of each line is the image id (e.g. ``r06-076-07-06``).
        image_root: Directory that contains the image tree. When omitted, the
            notebook-level ``base_image_path`` is used.

    Returns:
        A ``(paths, corrected_samples)`` tuple of equal-length lists: the
        image paths that exist and are non-empty, and the matching annotation
        lines with everything from the first newline onwards removed.
    """
    root = base_image_path if image_root is None else image_root
    paths = []
    corrected_samples = []

    for file_line in samples:
        line_split = file_line.strip().split(' ')

        # Image path format:
        # root/part1/part1-part2/<full image id>.png
        img_name = line_split[0]
        name_split = img_name.split('-')
        if len(name_split) < 2:
            continue  # blank or malformed line: no usable image id
        part1, part2 = name_split[0], name_split[1]
        img_path = os.path.join(
            root, part1, f"{part1}-{part2}", f"{img_name}.png"
        )

        # os.path.getsize() raises if the file is missing, so test existence
        # first; zero-byte files are rejected as well.
        if os.path.isfile(img_path) and os.path.getsize(img_path) > 0:
            paths.append(img_path)
            corrected_samples.append(file_line.split('\n')[0])

    return paths, corrected_samples

Datasets/IAM_Words/words


In [33]:
# Resolve image paths for each split. Each call returns (image paths,
# annotation lines) for the samples whose image file passed the size check.
train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)

In [34]:
# Each entry is still the full annotation line (newline removed), not just the
# word; presumably the word is the last field — confirm where labels are parsed.
train_labels[:10]

['r06-076-07-06 ok 177 1807 2010 76 53 CC or',
 'n01-004-01-01 ok 180 614 906 246 69 JJ unable',
 'g06-011f-00-03 ok 203 778 721 46 70 INO of',
 'f04-011-07-01 ok 145 504 1976 118 78 BEDZ was',
 'e04-103-01-01 ok 174 471 916 205 123 VB plank',
 'g06-047g-04-05 ok 182 924 1430 193 67 NP Europe',
 'm06-056-04-11 ok 158 2061 1537 11 21 , ,',
 'j06-026-03-04 ok 185 1593 1416 341 129 NN sunlight',
 'm06-019-01-12 ok 189 1837 949 142 50 CD three',
 'a04-043-02-05 ok 186 1906 1113 59 68 INO of']

In [35]:
# Preview the resolved image paths; they line up index-for-index with train_labels.
train_img_paths[:10]

['Datasets/IAM_Words/words/r06/r06-076/r06-076-07-06.png',
 'Datasets/IAM_Words/words/n01/n01-004/n01-004-01-01.png',
 'Datasets/IAM_Words/words/g06/g06-011f/g06-011f-00-03.png',
 'Datasets/IAM_Words/words/f04/f04-011/f04-011-07-01.png',
 'Datasets/IAM_Words/words/e04/e04-103/e04-103-01-01.png',
 'Datasets/IAM_Words/words/g06/g06-047g/g06-047g-04-05.png',
 'Datasets/IAM_Words/words/m06/m06-056/m06-056-04-11.png',
 'Datasets/IAM_Words/words/j06/j06-026/j06-026-03-04.png',
 'Datasets/IAM_Words/words/m06/m06-019/m06-019-01-12.png',
 'Datasets/IAM_Words/words/a04/a04-043/a04-043-02-05.png']