# Prototype of handwritten text recognition software (Keras implementation).

Here I import most of the modules that will be used.

In [30]:
from keras.models import Sequential,Model
from keras.layers import Dense,Conv2D,Flatten,BatchNormalization,LSTM,Bidirectional,Lambda
from keras.layers import Activation,MaxPooling2D,TimeDistributed,Input,Reshape,Permute
from keras.initializers import TruncatedNormal
from keras.optimizers import RMSprop,Adam,SGD
from sklearn.model_selection import train_test_split
import xml.etree.ElementTree as ET
import keras.backend as K
import glob
import os
import keras
import numpy as np
import tensorflow as tf
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from keras.preprocessing import sequence
import itertools
from tensorflow.python.ops import ctc_ops as ctc
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
import random
import editdistance
from IPython.display import Image

time: 2.29 ms


In [2]:
%load_ext autotime

Directory structure: all the images are in the `data` directory, while the labels are in the `labels` directory. The file containing the final code is named 'final.ipynb'.

In [40]:
ls

'1*IjxpxWcKX8EJUVFBNFeKdA.gif'                    model.py
 data/                                            project.ipynb
 dataLoader.py                                    project-v2.ipynb
 dataset/                                         __pycache__/
 dataset_example.png                              temp.py
 directory_structure.png                          trained_models/
 final.ipynb                                      train.py
 how_images_is_stored_in_the_data_directory.png  'Untitled Document 1'
 how_labels_are_stored.png                        Untitled.ipynb
 images.jpg                                       words.tgz
 labels/                                          xml.tgz
time: 139 ms


![title](directory_structure.png)

This is an overview of how the images in the data directory are organized.

![title](how_images_is_stored_in_the_data_directory.png)

This image shows how labels are stored in their respective XML files.

![title](how_labels_are_stored.png)

This image, on the other hand, gives us a preview of what the images in our dataset look like.

![title](dataset_example.png)

Initializing some constant values

In [3]:
# Mini-batch size. NOTE(review): train() and error_rate() below carry their own
# batch_size, so this module-level value does not appear to be read by them.
batch_size = 32
# Seed Python's RNG so the random.sample() draw inside get_data() is repeatable.
random.seed(1)
# Target image size as (width, height); arrays built from it are indexed
# (height, width), i.e. (imgSize[1], imgSize[0]).
imgSize = (160,40)
# NOTE(review): text_to_num pads labels to 53 columns and the model is built
# with myModel(53, l2), so this value of 16 looks stale -- confirm.
maxTextLen = 16# for the a01 dataset and 53 for the whole dataset
# Alphabet: a character's numeric label is its index in this string.
keys = ' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# One class more than the alphabet -- presumably the extra CTC blank class.
nb_labels = len(keys)+1


time: 142 ms


The function below converts each character in a label to a number according to its index in the `keys` variable; for example, the label "on" becomes `[67, 66]`. Labels are padded with zeros up to a common width of 53.

In [4]:
def text_to_num(txt, max_len=53, alphabet=None):
    """Encode a batch of labels as a zero-padded matrix of character indices.

    Parameters
    ----------
    txt : sequence of str or sequence of list of str
        One label per entry; each label is iterated character by character.
    max_len : int, optional
        Minimum number of columns of the returned matrix (53 by default, the
        width the rest of the notebook uses). The matrix is widened
        automatically if a label is longer than this.
    alphabet : str, optional
        Characters that may appear in a label; a character is encoded as the
        index of its first occurrence. Defaults to the module-level ``keys``.

    Returns
    -------
    array : numpy.ndarray of float, shape (len(txt), width)
        Row ``b`` holds the indices of label ``b`` followed by zeros.
    y_len : numpy.ndarray
        Length of every label, in the same order as ``txt``.

    Raises
    ------
    ValueError
        If a label contains a character that is not in the alphabet.
    """
    if alphabet is None:
        alphabet = keys

    # First occurrence wins, matching str.index().
    lookup = {}
    for idx, char in enumerate(alphabet):
        lookup.setdefault(char, idx)

    y_len = np.asarray([len(i) for i in txt])

    # Never narrower than max_len, but wide enough for the longest label.
    width = max([max_len] + [len(text) for text in txt])
    array = np.zeros([len(txt), width])

    for batchElement, text in enumerate(txt):
        for position, char in enumerate(text):
            if char not in lookup:
                raise ValueError(
                    "character %r of label %d is not in the alphabet" % (char, batchElement)
                )
            array[batchElement, position] = lookup[char]

    return array,y_len

time: 209 ms


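Images come in various sizes, so the function below scales each one down to fit within 160x40 (keeping its aspect ratio), pads the remaining area with white, and rescales the pixel values by 1/255.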

In [5]:
def img_resize(img):
    """Load one image and fit it onto a white canvas of the target size.

    The image is converted to 8-bit grayscale, shrunk in place with
    ``thumbnail`` so that it fits inside ``imgSize`` (width, height), pasted
    into the top-left corner of a canvas filled with 255, and finally divided
    by 255.

    Parameters
    ----------
    img : str
        Path of the image file.

    Returns
    -------
    numpy.ndarray of float, shape (imgSize[1], imgSize[0])

    Raises
    ------
    IOError
        If the file cannot be opened or decoded. The failure is printed and
        then re-raised so the caller can decide how to handle the sample.
    """
    # Imported locally under an alias: the notebook's import cell binds the
    # name `Image` twice (PIL first, IPython.display second), and the later
    # binding has no `open`.
    from PIL import Image as PILImage

    outfile = os.path.basename(img)
    try:
        # The context manager closes the file handle even if decoding fails.
        with PILImage.open(img) as handle:
            # Force a single channel so the 2-D assignment below always fits.
            picture = handle.convert('L')
            picture.thumbnail(imgSize)
            im = np.array(picture)
    except IOError:
        print ("Failed to create thumbnail for: ",outfile)
        raise

    temp = np.ones([imgSize[1],imgSize[0]])*255
    temp[0:im.shape[0],0:im.shape[1]] = im
    image = temp/255
    return image

time: 216 ms


In [6]:
def image_processing(img_list):
    """Load every listed image from the ``data`` directory into one array.

    Parameters
    ----------
    img_list : sequence of str
        Image identifiers; ``'.png'`` is appended to each to get the file name.

    Returns
    -------
    numpy.ndarray of float, shape (len(img_list), imgSize[1], imgSize[0], 1)
        Row ``i`` holds the processed image for ``img_list[i]``. Rows whose
        file is missing, empty or unreadable are left as zeros; those files
        are reported on stdout.

    Raises
    ------
    FileNotFoundError
        If the ``data`` directory does not exist.
    """
    data_dir = 'data'
    if not os.path.isdir(data_dir):
        raise FileNotFoundError("image directory not found: " + data_dir)

    num_iter = len(img_list)
    data = np.zeros([num_iter,imgSize[1],imgSize[0],1])
    bad_samples = []

    # Build full paths instead of chdir-ing into the directory, so the process
    # working directory is never left changed if something interrupts the loop.
    for i in range(num_iter):
        image_path = os.path.join(data_dir, img_list[i]+'.png')
        try:
            if not os.path.isfile(image_path) or not os.path.getsize(image_path):
                bad_samples.append(image_path)
                continue
            data[i,:,:,0] = img_resize(image_path)
        except Exception as e:
            # Best effort: one broken file must not abort the whole load.
            print(e)
            bad_samples.append(image_path)

    if bad_samples:
        print("Skipped %d missing or unreadable image(s); their rows are all zeros. First few: %s"
              % (len(bad_samples), bad_samples[:5]))
    return data

time: 324 ms


In [7]:
def ctc_decode_greedy(args):
    """Lambda-layer body: greedy (best-path) CTC decoding.

    Parameters
    ----------
    args : list
        ``[y_pred, input_length]`` where ``y_pred`` is the per-time-step
        softmax output and ``input_length`` comes from an ``Input`` of shape
        ``[1]``, i.e. one sequence length per sample.

    Returns
    -------
    Tensor
        The first decoded sequence set returned by ``K.ctc_decode``, cast to
        float32.
    """
    y_pred, input_length = args
    # Drop only the trailing axis: a bare squeeze would also remove the batch
    # axis when the batch holds a single sample, leaving a scalar.
    lengths = tf.squeeze(input_length, axis=-1)
    decoded = K.ctc_decode(y_pred, lengths, greedy=True)[0][0]
    return K.cast(decoded, dtype='float32')
    

time: 85.3 ms


In [8]:
def ctc_decode_beam_width(args):
    """Lambda-layer body: beam-search CTC decoding (beam width 100, best path only).

    Parameters
    ----------
    args : list
        ``[y_pred, input_length]`` where ``y_pred`` is the per-time-step
        softmax output and ``input_length`` comes from an ``Input`` of shape
        ``[1]``, i.e. one sequence length per sample.

    Returns
    -------
    Tensor
        The top decoded path returned by ``K.ctc_decode``, cast to float32.
    """
    y_pred,input_length = args
    # Drop only the trailing axis: a bare squeeze would also remove the batch
    # axis when the batch holds a single sample, leaving a scalar.
    lengths = tf.squeeze(input_length, axis=-1)
    out = K.ctc_decode(y_pred, lengths, greedy=False, beam_width=100, top_paths=1)
    return K.cast(out[0][0],dtype='float32')

time: 80.1 ms


As its name implies, this function gets the data and the labels, preprocesses them, and returns values that our model can work with: it loads each image and converts it to a 160x40 array of numbers, reads the labels from their respective XML files, and pairs each label with its image.

In [9]:
def get_data(dataset_length = 40000):
    """Load a random subset of the word images with their labels and split it.

    Every XML file in ``labels/`` is parsed and each ``<word>`` element
    contributes one ``id -> text`` pair. ``dataset_length`` ids are drawn with
    ``random.sample``; the matching images are loaded with ``image_processing``
    and the labels encoded with ``text_to_num``. The result is split 70 % train
    / 30 % held out, and the held-out part is split again into 70 % validation
    / 30 % test.

    Parameters
    ----------
    dataset_length : int, optional
        Number of samples to draw. If fewer labelled words exist, all of them
        are used and a message is printed.

    Returns
    -------
    tuple
        ``(imgs_to_labels, X_train, X_val, X_test, Y_train_cat, Y_val_cat,
        Y_test_cat, x_train_len, x_val_len, x_test_len, y_train_len,
        y_val_len, y_test_len)``. The ``x_*_len`` arrays hold the network's
        sequence length for every sample, the ``y_*_len`` arrays the label
        lengths.

    Notes
    -----
    Side effect: if no ``data`` entry exists in the current directory, the
    working directory is changed to ``~/internshipme/``.
    """
    path_name = 'data'
    if path_name not in os.listdir():
        # Fall back to the project directory under the user's home.
        os.chdir(str(Path.home())+'/'+'internshipme/')

    imgs_to_labels = {}
    labels_dir = 'labels'
    # NOTE(review): os.listdir order is filesystem dependent, so the sample
    # drawn below is only repeatable on a machine that lists the files in the
    # same order.
    for xml_name in os.listdir(labels_dir):
        tree = ET.parse(os.path.join(labels_dir, xml_name))
        for word in tree.getroot().iter('word'):
            imgs_to_labels[word.attrib['id']]=word.attrib['text']

    temp_keys = list(imgs_to_labels.keys())
    # random.sample raises if asked for more items than exist; clamp instead.
    sample_size = min(dataset_length, len(temp_keys))
    if sample_size < dataset_length:
        print('Only', sample_size, 'labelled words available; using all of them.')
    temp_keys = random.sample(temp_keys, sample_size)

    # Each label becomes a list of single characters.
    temp_values = [list(imgs_to_labels[i]) for i in temp_keys]
    print('This the number of variables: ',len(temp_keys))
    print('This is the number of values: ',len(temp_values))
    X = image_processing(temp_keys)
    Y,y_len = text_to_num(temp_values)

    X_train,X_test,Y_train_cat,Y_test,y_train_len,y_test_len = train_test_split(X,Y,y_len,train_size=0.7,random_state=1,shuffle=True)
    X_val,X_test,Y_val_cat,Y_test_cat,y_val_len,y_test_len = train_test_split(X_test,Y_test,y_test_len,test_size=0.3,random_state=1,shuffle=True)

    # The network halves the image width twice before the recurrent layers,
    # so it emits imgSize[0] // 4 time steps per image (40 for a width of 160).
    time_steps = imgSize[0] // 4
    x_train_len = np.asarray([time_steps] * len(X_train))
    x_test_len = np.asarray([time_steps] * len(X_test))
    x_val_len = np.asarray([time_steps] * len(X_val))

    print(".............Image data processed successfully................ :)")

    print(".............Label data successfully converted............... :) ")
    return imgs_to_labels,X_train,X_val,X_test,Y_train_cat,Y_val_cat,Y_test_cat,x_train_len,x_val_len,x_test_len,y_train_len,y_val_len,y_test_len

time: 81.6 ms


In [10]:
def ctc_lambda_func(args):
    """Lambda-layer body that evaluates the CTC loss for a batch.

    ``args`` is ``[y_pred, labels, input_length, label_length]`` in that
    order; the values are handed to ``K.ctc_batch_cost``, which expects the
    labels first and the predictions second.
    """
    predictions, targets, prediction_lengths, target_lengths = args
    return K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)
 


time: 97.2 ms


In [11]:
def num_to_text(y):
    """Turn rows of character indices back into strings.

    Every entry of a row is converted to ``int`` and looked up in ``keys``;
    entries equal to -1 are skipped. Returns one string per row, in order.
    """
    decoded = []
    for row in y:
        codes = (int(entry) for entry in list(row))
        decoded.append(''.join(keys[code] for code in codes if code != -1))
    return decoded

time: 99.9 ms


In [12]:
def myModel(maxTextLen,l2):
    """Build the CNN + bidirectional-LSTM recogniser and its CTC wrappers.

    Five Conv2D / BatchNormalization / ReLU / MaxPooling2D stages feed two
    bidirectional LSTM layers and a per-time-step softmax over 80 classes.
    Around that core network three further models are assembled: one whose
    output is the CTC loss, and two whose output is the decoded label sequence
    (greedy and beam search).

    Parameters
    ----------
    maxTextLen : int
        Width of the ``the_labels`` input, i.e. the number of columns of the
        padded label matrix.
    l2 : float
        Factor passed to ``keras.regularizers.l2`` for the regularised layers.

    Returns
    -------
    tuple
        ``(model, model_train, model_predict_greedy, model_predict_beam)``.

    Notes
    -----
    Prints ``model.summary()`` as a side effect.
    """
    # Images are fed as (height, width, channels) = (40, 160, 1).
    input_data = Input(shape=(40,160,1))
    # input_length = Input(shape)
    X = Conv2D(32,kernel_size=(5,5),strides=(1,1),padding='SAME',kernel_regularizer=keras.regularizers.l2(l=l2))(input_data)
    X = BatchNormalization()(X)
    X = Activation('relu')(X)
    # 2x2 pooling: halves both height and width.
    X = MaxPooling2D(pool_size=(2,2),strides=(2,2),padding='valid')(X)

    # NOTE(review): this is the only Conv2D without a kernel_regularizer --
    # confirm whether that is intentional.
    X = Conv2D(64,kernel_size=(5,5),strides=(1,1),padding='SAME')(X)
    X = BatchNormalization()(X)
    
    X = Activation('relu')(X)
    X = MaxPooling2D(pool_size=(2,2),strides=(2,2),padding='valid')(X)

    X=Conv2D(128,kernel_size=(3,3),strides=(1,1),padding='SAME',kernel_regularizer=keras.regularizers.l2(l=l2))(X)
    X=BatchNormalization()(X)
    X=Activation('relu')(X)
    # From here on pooling is (2,1): only the height is halved, the width
    # (the axis that becomes the time axis below) is kept.
    X=MaxPooling2D(pool_size=(2,1),strides=(2,1),padding='valid')(X)

    X=Conv2D(128,kernel_size=(3,3),strides=(1,1),padding='SAME',kernel_regularizer=keras.regularizers.l2(l=l2))(X)
    X=BatchNormalization()(X)
    X=Activation('relu')(X)
    X=MaxPooling2D(pool_size=(2,1),strides=(2,1),padding='valid')(X)

    X=Conv2D(256,kernel_size=(3,3),strides=(1,1),padding='SAME',kernel_regularizer=keras.regularizers.l2(l=l2))(X)
    X=BatchNormalization()(X)
    X=Activation('relu')(X)
    X=MaxPooling2D(pool_size=(2,1),strides=(2,1),padding='valid')(X)
    
    # Collapse the feature map into a sequence of 40 time steps with 256
    # features each; the hard-coded sizes only fit a 40x160 input.
    X = Reshape([40,256])(X)
    X=Bidirectional(LSTM(256, return_sequences=True,kernel_regularizer=keras.regularizers.l2(l=l2)),merge_mode='concat')(X)
    X=Bidirectional(LSTM(256,return_sequences=True,kernel_regularizer=keras.regularizers.l2(l=l2)),merge_mode='concat')(X)
#     X = Reshape([32,512,1])(X)
    # NOTE(review): 80 output classes is hard-coded; presumably it is meant to
    # equal nb_labels (len(keys)+1) -- confirm if the alphabet changes.
    outputs = Dense(80,name="dense",activation='softmax',kernel_regularizer=keras.regularizers.l2(l=l2))(X)
#     X = Conv2D(80,kernel_size=(1,512),strides=(1,1),kernel_initializer=TruncatedNormal(stddev=0.1))(X)
#     outrnn = Reshape([32,80])(outrnn)

#     outrnn = Lambda(lambda input:tf.transpose(input,[1,0,2]))(outrnn)
    # Core network: image in, per-time-step class probabilities out.
    model = Model(inputs=input_data,outputs=outputs)
    model.summary()
    
    # Extra inputs needed by the CTC loss and the decoders.
    labels = Input(name='the_labels', shape=[maxTextLen], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')
 
    # The loss and both decoders are computed inside Lambda layers so each can
    # be exposed as the output of its own Model sharing the same weights.
    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([outputs, labels, input_length, label_length])
    out_decoded_greedy = Lambda(ctc_decode_greedy,output_shape=(None,None),name='CTCdecodegreedy')([outputs,input_length])
    out_decoded_beam = Lambda(ctc_decode_beam_width,output_shape=(None,None),name='CTCdecodebeam')([outputs,input_length])
    model_train = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
    model_predict_greedy = Model(inputs=[input_data,input_length],outputs=out_decoded_greedy)
    model_predict_beam = Model(inputs=[input_data,input_length],outputs = out_decoded_beam)
    return model,model_train,model_predict_greedy,model_predict_beam

time: 123 ms


In [13]:
 imgs_to_labels,X_train,X_val,X_test,Y_train_cat,Y_val_cat,Y_test_cat,x_train_len,x_val_len,x_test_len,y_train_len,y_val_len,y_test_len = get_data()

This the number of variables:  40000
This is the number of values:  40000
.............Image data processed successfully................ :)
.............Label data successfully converted............... :) 
time: 11min 37s


In [14]:
X_val.shape

(8400, 40, 160, 1)

time: 81 ms


In [26]:
X_test.shape

(3600, 40, 160, 1)

time: 2.74 ms


In [27]:
X_train.shape

(28000, 40, 160, 1)

time: 2.38 ms


In [17]:
# L2 regularisation factor handed to myModel.
l2 = 0.01
# maxTextLen = Y_train_cat.shape[1]
# 53 is the width of the label matrix produced by text_to_num, which the
# 'the_labels' input of the training model has to match.
model,model_train,model_predict_greedy,model_predict_beam = myModel(53,l2)

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 40, 160, 1)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 40, 160, 32)       832       
_________________________________________________________________
batch_normalization_6 (Batch (None, 40, 160, 32)       128       
_________________________________________________________________
activation_6 (Activation)    (None, 40, 160, 32)       0         
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 20, 80, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 20, 80, 64)        51264     
_________________________________________________________________
batch_normalization_7 (Batch (None, 20, 80, 64)        256 

In [19]:
def train(model_train,file_path,X_train,Y_train_cat,x_train_len,y_train_len,X_val,Y_val_cat,x_val_len,y_val_len,batch_size=32,epochs=30):
    """Compile and fit the CTC training model, checkpointing the best weights.

    Parameters
    ----------
    model_train : keras Model
        Model whose output named ``'ctc'`` already is the CTC loss.
    file_path : str
        Where ``ModelCheckpoint`` stores the model with the lowest ``val_loss``.
    X_train, Y_train_cat, x_train_len, y_train_len : array-like
        Training images, encoded labels, sequence lengths and label lengths.
    X_val, Y_val_cat, x_val_len, y_val_len : array-like
        The same four arrays for the validation set.
    batch_size : int, optional
        Samples per gradient update.
    epochs : int, optional
        Number of passes over the training set.

    Returns
    -------
    None
    """
    # The network output already is the loss value, so the Keras "loss" simply
    # passes y_pred through and the targets given to fit() are dummy zeros.
    model_train.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')

    checkpoint = ModelCheckpoint(filepath=file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
    callbacks_list = [checkpoint]
    model_train.fit(
        x=[X_train,Y_train_cat,x_train_len,y_train_len],
        y=np.zeros(X_train.shape[0]),
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([X_val,Y_val_cat,x_val_len,y_val_len],[np.zeros(X_val.shape[0])]),
        verbose=1,
        callbacks=callbacks_list,
    )
    print("------Training completed--------")

time: 1.59 ms


In [24]:
def error_rate(X_test,Y_test,x_test_len,model_path='trained_models/model_40000.hdf5',batch_size=32):
    """Evaluate the greedy decoder: character error rate and word accuracy.

    Loads the weights at ``model_path`` into the module-level
    ``model_predict_greedy``, decodes ``X_test`` batch by batch and compares
    every prediction with its ground truth (trailing padding stripped).

    Parameters
    ----------
    X_test : array-like
        Test images.
    Y_test : array-like
        Encoded, zero-padded ground-truth labels.
    x_test_len : array-like
        Sequence length of every test sample.
    model_path : str, optional
        Weight file to load.
    batch_size : int, optional
        Number of samples decoded per ``predict`` call.

    Returns
    -------
    (float, float)
        ``(char_error_rate, word_accuracy)``: summed edit distance divided by
        the total number of ground-truth characters, and the fraction of words
        predicted exactly. Both are 0.0 when there is nothing to evaluate.
    """
    char_error = 0
    num_char = 0
    num_word = 0
    num_correct_word = 0

    # Load the weights once instead of once per batch.
    model_predict_greedy.load_weights(model_path)

    num_samples = len(X_test)
    # range() stops before num_samples, so no empty trailing batch is decoded
    # when the test-set size is a multiple of batch_size.
    for start in range(0, num_samples, batch_size):
        stop = start + batch_size
        prediction = model_predict_greedy.predict([X_test[start:stop],x_test_len[start:stop]])
        list_of_prediction = num_to_text(prediction)
        # Padding zeros decode to spaces; strip them off the ground truth.
        ground_truths = [text.strip() for text in num_to_text(Y_test[start:stop])]

        if start == 0:
            print("These are the ground truths and predicted values for the first ten examples in the test set")
            print("\nGround truth  ->   Predicted value")
            print("\n----------------------------------\n")
            # Slicing keeps this safe when the first batch has fewer than ten items.
            for truth, predicted in list(zip(ground_truths, list_of_prediction))[:10]:
                print(truth,'----->',predicted)

        for truth, predicted in zip(ground_truths, list_of_prediction):
            num_correct_word += 1 if truth == predicted else 0
            num_word += 1
            char_error += editdistance.eval(predicted, truth)
            num_char += len(truth)

    char_error_rate = char_error/num_char if num_char else 0.0
    word_accuracy = num_correct_word/num_word if num_word else 0.0
    print('\nCharacter Error Rate: %.2f%%'%(char_error_rate*100.0),'|  Word Accuracy: %.2f%%'%(word_accuracy*100.0))    
    return char_error_rate,word_accuracy

time: 3.87 ms


In [25]:
char_error_rate,word_accuracy = error_rate(X_test,Y_test_cat,x_test_len,model_path='trained_models/model_40000.hdf5')

These are the ground truths and predicted values for the first ten examples in the test set

Ground truth  ->   Predicted value

----------------------------------

on -----> on
unguarded -----> onsvardes
the -----> the
help -----> hely
the -----> the
couples -----> comples
Caxtons -----> Castons
firmly -----> firely
with -----> with
the -----> the

Character Error Rate: 20.68% |  Word Accuracy: 55.67%
time: 43.8 s
