# Image-label alignment process
This notebook produces a file that aligns each image of the IAM text-line dataset with its label.

In [1]:
# Dataset locations, given relative to the notebook's working directory.
IMAGE_DIR = "../data/images/"  # tree that is walked to count the example files
LABEL_DIR = "../data/labels/"  # folder whose files are scanned for <line ...> entries

In [2]:
# import
import os
import re

In [3]:
# count examples
# Every file found anywhere below IMAGE_DIR counts as one example.
EXPECTED_NUM_EXAMPLES = 13353  # number of files expected below IMAGE_DIR

# os.walk ignores scan errors by default, so a wrong path would just count 0
# files; check the directory explicitly to get a meaningful failure instead.
assert os.path.isdir(IMAGE_DIR), "image directory not found: {!r}".format(IMAGE_DIR)

num_examples = sum(len(files) for _, _, files in os.walk(IMAGE_DIR))
assert num_examples == EXPECTED_NUM_EXAMPLES, (
    "found {} files under {!r}, expected {}".format(
        num_examples, IMAGE_DIR, EXPECTED_NUM_EXAMPLES))

In [4]:
# parse the xml files
# Each file in LABEL_DIR is scanned line by line for `<line ...>` elements.
# An id of the form "xxx-yyy-zzz" is turned into the relative image path
# "xxx/xxx-yyy/xxx-yyy-zzz.png"; the element's `text` attribute is the label.
# images[i] and labels[i] always describe the same text line.
images = []
labels = []
pattern = re.compile(r'.*<line.*id="(\w+-\w+-\w+)".*text="(.*?)".*')
# os.listdir returns entries in arbitrary order; sort them so that the
# resulting alignment is reproducible across runs and machines.
for filename in sorted(os.listdir(LABEL_DIR)):
    with open(os.path.join(LABEL_DIR, filename), 'r') as f:
        for line in f:  # stream the file instead of loading it whole
            m = pattern.match(line)
            if m is None:
                continue
            line_id = m.group(1)
            # \w never matches '-', so the id always splits into three parts
            top, sub, _ = line_id.split('-')
            images.append("{}/{}-{}/{}.png".format(top, top, sub, line_id))
            labels.append(m.group(2))
# every counted image file must have exactly one parsed label
assert len(images) == len(labels) == num_examples, (
    "parsed {} image paths / {} labels, expected {}".format(
        len(images), len(labels), num_examples))

In [5]:
# clean the labels: turn the escaped entities left in the raw text back
# into plain characters (the list is updated in place).
# Order matters: the double-escaped forms (&amp;quot; / &amp;amp;) have to be
# rewritten before the bare &amp; rule, which would otherwise leave a
# half-decoded entity behind.
entity_replacements = (
    ("&apos;", "'"),
    ("&quot;", '"'),
    ("&amp;quot;", '"'),
    ("&amp;amp;", '&'),
    ("&amp;", '&'),
)
for idx, text in enumerate(labels):
    for entity, char in entity_replacements:
        text = text.replace(entity, char)
    labels[idx] = text

In [6]:
# write the alignment file: one "<image path>||<label>" record per line
with open("alignment.txt", 'w') as out:
    out.writelines(
        "{}||{}\n".format(image_path, text)
        for image_path, text in zip(images, labels)
    )

In [15]:
# build the dictionary of labels
# label_count: character -> number of occurrences over all labels
label_count = {}
for text in labels:
    for ch in text:
        label_count[ch] = label_count.get(ch, 0) + 1

# label_dict: character -> integer index, numbered from 0 in sorted
# character order
label_dict = {ch: idx for idx, ch in enumerate(sorted(label_count))}

In [16]:
# show the per-character occurrence counts gathered from `labels`
label_count

{' ': 87288,
 '!': 226,
 '"': 1629,
 '#': 79,
 '&': 55,
 "'": 1987,
 '(': 203,
 ')': 200,
 '*': 14,
 '+': 9,
 ',': 5629,
 '-': 1545,
 '.': 6224,
 '/': 16,
 '0': 431,
 '1': 494,
 '2': 209,
 '3': 174,
 '4': 118,
 '5': 157,
 '6': 121,
 '7': 64,
 '8': 118,
 '9': 179,
 ':': 165,
 ';': 204,
 '?': 259,
 'A': 1257,
 'B': 854,
 'C': 750,
 'D': 569,
 'E': 675,
 'F': 489,
 'G': 661,
 'H': 988,
 'I': 1600,
 'J': 181,
 'K': 163,
 'L': 638,
 'M': 1144,
 'N': 711,
 'O': 469,
 'P': 722,
 'Q': 16,
 'R': 606,
 'S': 1064,
 'T': 1652,
 'U': 189,
 'V': 171,
 'W': 728,
 'X': 8,
 'Y': 227,
 'Z': 10,
 'a': 35849,
 'b': 6803,
 'c': 11823,
 'd': 17952,
 'e': 56911,
 'f': 10006,
 'g': 8653,
 'h': 25244,
 'i': 30300,
 'j': 423,
 'k': 2819,
 'l': 18036,
 'm': 10642,
 'n': 31027,
 'o': 33343,
 'p': 8253,
 'q': 370,
 'r': 27516,
 's': 27867,
 't': 39972,
 'u': 12203,
 'v': 4528,
 'w': 8808,
 'x': 785,
 'y': 8264,
 'z': 205}

In [17]:
# show the character -> index mapping (indices follow sorted character order)
label_dict

{' ': 0,
 '!': 1,
 '"': 2,
 '#': 3,
 '&': 4,
 "'": 5,
 '(': 6,
 ')': 7,
 '*': 8,
 '+': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '/': 13,
 '0': 14,
 '1': 15,
 '2': 16,
 '3': 17,
 '4': 18,
 '5': 19,
 '6': 20,
 '7': 21,
 '8': 22,
 '9': 23,
 ':': 24,
 ';': 25,
 '?': 26,
 'A': 27,
 'B': 28,
 'C': 29,
 'D': 30,
 'E': 31,
 'F': 32,
 'G': 33,
 'H': 34,
 'I': 35,
 'J': 36,
 'K': 37,
 'L': 38,
 'M': 39,
 'N': 40,
 'O': 41,
 'P': 42,
 'Q': 43,
 'R': 44,
 'S': 45,
 'T': 46,
 'U': 47,
 'V': 48,
 'W': 49,
 'X': 50,
 'Y': 51,
 'Z': 52,
 'a': 53,
 'b': 54,
 'c': 55,
 'd': 56,
 'e': 57,
 'f': 58,
 'g': 59,
 'h': 60,
 'i': 61,
 'j': 62,
 'k': 63,
 'l': 64,
 'm': 65,
 'n': 66,
 'o': 67,
 'p': 68,
 'q': 69,
 'r': 70,
 's': 71,
 't': 72,
 'u': 73,
 'v': 74,
 'w': 75,
 'x': 76,
 'y': 77,
 'z': 78}