# Importing all the packages required 

In [1]:
import numpy as np
import pandas as pd
from PIL import Image, ImageOps 
from skimage.util.shape import view_as_windows
from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization, Flatten
from keras.layers import Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import RMSprop
from keras.models import load_model

Using TensorFlow backend.


In [None]:
# These two packages are used to evaluate the quality of the predictions.
# From the `distance` package we use the Levenshtein (edit) distance, i.e. the minimum number of single-character
# edits required to turn one string into the other. SequenceMatcher (from difflib) is a similarity measure for
# two strings that returns a number between 0 and 1, where 1 means the two strings are identical.
! pip install distance 
from difflib import SequenceMatcher
import distance

## Defining the CTC loss function and the Model (Copied from Training file)

In [0]:
def ctc_lambda_func(args):
    """Return the per-sample CTC loss; used as the body of the `Lambda` loss layer.

    `args` is a 4-element sequence ordered as
    (y_pred, labels, input_length, label_length). Note that
    `K.ctc_batch_cost` expects the labels first, so the first two entries
    are swapped when forwarding them.
    """
    predictions, targets, prediction_lengths, target_lengths = args
    batch_cost = K.ctc_batch_cost(targets, predictions, prediction_lengths, target_lengths)
    return batch_cost

In [0]:
# Architecture of the training model. It must be rebuilt exactly as it was during training (same layers, same
# order) so that the saved weights can be loaded into it afterwards.

# Number of sliding windows per text line; with the values used in get_features_predict
# (def_w=2240, window_w=32, window_step=8) this is (2240 - 32) / 8 + 1 = 277.
timeSteps = 277
# Height and width (in pixels) of a single sliding window.
window_h  = 64
window_w  = 32
# Number of output classes: the 79 characters of the alphabet plus one extra class used as the CTC blank.
Char_Num  = 80
# Length of the label vector fed to the CTC loss (same value as max_word_length in get_features_predict).
seq_len   = 90
    
# One sample = a sequence of `timeSteps` single-channel windows of size window_h x window_w.
input_lines = Input(shape=(timeSteps, window_h, window_w, 1), name='the_input')

# --- Convolutional feature extractor, applied independently to every window via TimeDistributed ---
# Block 1: 16 filters. The (2, 1) pooling halves only the first spatial axis (64x32 -> 32x32, assuming Keras'
# default pooling stride equal to pool_size).
Conv_1 = TimeDistributed(Conv2D(16, (3,3), padding= 'same'),name="Conv_1")(input_lines)
Acti_1 = TimeDistributed(Activation("relu"),name="Acti_1")(Conv_1)
Pool_1 = TimeDistributed(MaxPooling2D(pool_size=(2, 1)),name="Pool_1")(Acti_1)
Norm_1 = TimeDistributed(BatchNormalization(),name="Norm_1")(Pool_1)

# Block 2: 32 filters, (2, 2) pooling halves both spatial axes.
Conv_2 = TimeDistributed(Conv2D(32, (3,3), padding= 'same'),name="Conv_2")(Norm_1)
Acti_2 = TimeDistributed(Activation("relu"),name="Acti_2")(Conv_2)
Pool_2 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_2")(Acti_2)
Norm_2 = TimeDistributed(BatchNormalization(),name="Norm_2")(Pool_2)

# Block 3: 64 filters; from here on each block also ends with dropout.
Conv_3 = TimeDistributed(Conv2D(64, (3,3), padding= 'same'),name="Conv_3")(Norm_2)
Acti_3 = TimeDistributed(Activation("relu"),name="Acti_3")(Conv_3)
Pool_3 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_3")(Acti_3)
Norm_3 = TimeDistributed(BatchNormalization(),name="Norm_3")(Pool_3)
Drop_3 = TimeDistributed(Dropout(0.2),name="Drop_3")(Norm_3)

# Block 4: 128 filters.
Conv_4 = TimeDistributed(Conv2D(128, (3,3), padding= 'same'),name="Conv_4")(Drop_3)
Acti_4 = TimeDistributed(Activation("relu"),name="Acti_4")(Conv_4)
Pool_4 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_4")(Acti_4)
Norm_4 = TimeDistributed(BatchNormalization(),name="Norm_4")(Pool_4)
Drop_4 = TimeDistributed(Dropout(0.25),name="Drop_4")(Norm_4)

# Block 5: 256 filters.
Conv_5 = TimeDistributed(Conv2D(256, (3,3), padding= 'same'),name="Conv_5")(Drop_4)
Acti_5 = TimeDistributed(Activation("relu"),name="Acti_5")(Conv_5)
Pool_5 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_5")(Acti_5)
Norm_5 = TimeDistributed(BatchNormalization(),name="Norm_5")(Pool_5)
Drop_5 = TimeDistributed(Dropout(0.3),name="Drop_5")(Norm_5)

# Collapse the feature maps of every window into one 256-dimensional feature vector per time step.
# Note: the name 'Dens_1' is given to the inner Dense layer, not to the TimeDistributed wrapper.
Flat_1 = TimeDistributed(Flatten(),name="Flat_1")(Drop_5)
Dens_1 = TimeDistributed(Dense(256,activation='relu',name='Dens_1'))(Flat_1)

# --- Recurrent part: two hand-built "bidirectional" LSTM stages (forward + go_backwards) ---
# First stage: outputs of both directions are summed.
# NOTE(review): per the Keras docs, go_backwards=True returns the output sequence in reversed time order, and it
# is not re-reversed here before being merged with the forward output - confirm this is intended. It cannot be
# changed without retraining, because the saved weights were trained with this exact graph.
lstm_1 = LSTM(256, return_sequences=True, name='lstm_1')(Dens_1)
lstm_1b = LSTM(256, return_sequences=True, go_backwards=True, name='lstm_1b')(Dens_1)
lstm1_merged = add([lstm_1, lstm_1b])
# Second stage: outputs of both directions are concatenated (below) instead of summed.
lstm_2 = LSTM(256, return_sequences=True, name='lstm_2')(lstm1_merged)
lstm_2b = LSTM(256, return_sequences=True, go_backwards=True, name='lstm_2b')(lstm1_merged)

# Per-time-step classification over the Char_Num classes; the softmax output is the y_pred fed to the CTC loss
# and is also the output of the prediction model.
Dens_f = Dense(Char_Num, name='Dens_f')(concatenate([lstm_2, lstm_2b]))
Acti_f = Activation('softmax',name="Acti_f")(Dens_f)    # y_pred

# Extra inputs that are only needed to compute the CTC loss during training.
labels = Input(name='the_labels', shape=[seq_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')
# The CTC loss is computed inside the graph by a Lambda layer, so the model's output is the loss itself.
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([Acti_f, labels, input_length, label_length])

line_model = Model(inputs=[input_lines, labels, input_length, label_length], outputs=loss_out)


## Compile and load the weights of the trained model and define the prediction model

In [0]:
# Although the whole model was saved during training, it cannot be restored directly with load_model, because Keras
# has a problem with models that use a Lambda layer as the loss to minimize. To work around this we rebuild the
# whole model (previous cell) and only load the trained weights from the saved file into it.
# Finally we define the prediction model: it takes the input lines and outputs the matrix produced by the
# classification (dense + softmax activation) layer. Remember that this output has one row per time step, so it
# still has to be decoded (compressed) into a text of the corresponding length.

# The 'ctc' layer already outputs the loss value, so the compiled "loss function" simply passes y_pred through.
line_model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='rmsprop')
line_model.load_weights('Moving_Window_Models/Model.17.hdf5')
# Shares its layers (and therefore the loaded weights) with line_model, but stops at the softmax output.
predict_model = Model(inputs=input_lines, outputs=Acti_f)

## Helper method to transform the output classes into text using a reverse dictionary

In [0]:
# Characters known to the model, ordered so that a character's position in the string is its class id:
# 0-13 punctuation, 14-23 digits, 24-26 ':;?', 27-52 upper-case letters, 53-78 lower-case letters.
_ALPHABET = (
    " !\"#&'()*+,-./"
    "0123456789"
    ":;?"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
)
# class id -> character; built once instead of on every call.
_REVERSE_ALPHABET = dict(enumerate(_ALPHABET))
# The class right after the last character is the CTC blank.
_CTC_BLANK = len(_ALPHABET)


def labels_to_text(labels):
    """Convert a sequence of class ids into the text they encode.

    Parameters
    ----------
    labels : iterable of int
        Class ids as produced by the CTC decoder.

    Returns
    -------
    str
        The decoded text. The CTC blank (id 79) and negative ids (the
        decoder pads its dense output with -1) contribute nothing.

    Raises
    ------
    KeyError
        If an id is neither a known character, the blank, nor negative.
    """
    chars = []
    for class_id in labels:
        if class_id == _CTC_BLANK or class_id < 0:
            # Blank or padding: nothing to emit.
            continue
        chars.append(_REVERSE_ALPHABET[class_id])
    return "".join(chars)

## Data preparation of the image to be passed to the model

In [0]:
# It is important to highlight that our model was created and trained to handle gray scale images, so it can only
# be used for this type of image. Thus, before passing the image to this function, the user has to make sure that
# the image is a gray scale image with dimensions width*height and that its values range between 0 and 255.
# Please note that some gray scale images are saved as boolean images, with True and False for white and black,
# while others are saved as binary images with only 0 and 1. It is therefore the responsibility of the user of
# this model to convert such images to the 0-255 range first.

def get_features_predict(image_file, def_h=64, def_w=2240, window_w=32, window_step=8):
    """Turn a gray scale text-line image into the sequence of sliding windows fed to the model.

    The preprocessing mirrors the one used for training (without the label
    preparation): binarize, rescale to a fixed height, pad or squeeze to a
    fixed width, scale to [0, 1] and cut into overlapping windows.

    Parameters
    ----------
    image_file : str or file object
        Anything accepted by `PIL.Image.open`. Must be a single-channel
        gray scale image with values in 0..255.
    def_h, def_w : int
        Height and width of the normalized line image.
    window_w, window_step : int
        Width of a sliding window and horizontal step between two windows.
        With the defaults this yields (2240 - 32) / 8 + 1 = 277 windows.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_windows, def_h, window_w) with values in [0, 1].

    Raises
    ------
    ValueError
        If the image is not a 2-D (single-channel) image.
    """
    side_margin = 5  # white margin added to the left and to the right of the text

    # Read the pixels and release the file handle right away.
    with Image.open(image_file) as source:
        src_w, src_h = source.size
        im_arr = np.array(source)
    if im_arr.ndim != 2:
        raise ValueError("expected a single-channel gray scale image, got an array of shape %s"
                         % (im_arr.shape,))

    # Binarize: everything up to 200 becomes black, the rest white (dtype is kept).
    im_arr = np.where(im_arr > 200, 255, 0).astype(im_arr.dtype)
    im = Image.fromarray(im_arr)

    # Rescale to the target height keeping the aspect ratio. Integer division is required:
    # Image.resize needs integer sizes and `/` yields a float under Python 3.
    scaled_w = max(1, (src_w * def_h) // src_h)
    im = im.resize((scaled_w, def_h), Image.LANCZOS)

    if scaled_w <= def_w - 2 * side_margin:
        # Short line: add the white side margins, then fill up to def_w with black on the right.
        im = ImageOps.expand(im, border=(side_margin, 0, side_margin, 0), fill='white')
        im = ImageOps.expand(im, border=(0, 0, def_w - scaled_w - 2 * side_margin, 0), fill='black')
    else:
        # Long line: squeeze it horizontally to the target width.
        im = im.resize((def_w, def_h), Image.LANCZOS)
    assert im.size == (def_w, def_h)

    im_arr = np.array(im) / 255.0
    # The window spans the full height, so the first axis of the result has length 1 and is dropped.
    im_windows_array = view_as_windows(im_arr, (def_h, window_w), step=window_step)[0]

    return im_windows_array

# Decoding the output of the model using CTC layer

In [0]:
# The CTC decoder is a very powerful tool. It takes the output of the activation layer, which has the shape
# (batchsize_predict, timesteps, num_classes). In our case we have only 1 image for each prediction, timesteps is
# 277 and num_classes is 80, which corresponds to the 79 characters of the alphabet dictionary plus one additional
# class, the CTC 'blank' label (the last class, id 79; the decoder pads its output with -1). We decode with
# greedy = True, which returns only the most probable path. There is another variation where greedy is set to
# False; in that case it is advisable to pass an external dictionary that constrains the decoding to certain
# words, and the decoder returns a set of top_paths probable output sequences. We think that this might be a good
# way to improve the decoding capability of the model. However, it is not an easy extension, because at the moment
# neither the Keras nor the tensorflow implementation of the ctc_decode function has this option, and we would
# have to implement it ourselves. There seems to be an implementation on github,
# https://github.com/githubharald/CTCWordBeamSearch, but it is only for tensorflow.
# Anyway, this might be a good direction for enhancing our results.

def decode_predict_ctc(out, top_paths = 1):
    """Decode the softmax output of the prediction model into text.

    Parameters
    ----------
    out : numpy.ndarray
        Network output of shape (batch, timesteps, num_classes). Only the
        first sample of the batch is decoded.
    top_paths : int
        Number of candidate transcriptions to return. With 1 (the default)
        the greedy decoder is used; with more than 1 a beam search is used,
        because the greedy decoder only ever returns a single path.

    Returns
    -------
    list of str
        `top_paths` decoded texts for the first sample, most probable first.
    """
    if top_paths < 1:
        return []

    beam_width = max(5, top_paths)
    # Every sample uses all of its time steps.
    input_length = np.ones(out.shape[0]) * out.shape[1]

    # Build the decoding op once; it does not depend on the path index.
    decoded_paths = K.ctc_decode(out, input_length=input_length, greedy=(top_paths == 1),
                                 beam_width=beam_width, top_paths=top_paths)[0]

    results = []
    for path in decoded_paths[:top_paths]:
        first_sample = K.get_value(path)[0]
        # The dense decoder output is padded with -1; drop the padding before mapping to characters.
        labels = [class_id for class_id in first_sample if class_id >= 0]
        results.append(labels_to_text(labels))
    return results


### Here is a prediction example

In [None]:
test_image = 'lines/g04/g04-026/g04-026-00.png'
# Ground-truth transcription of `test_image`. It is not defined anywhere in this notebook, so it has to be filled
# in here (e.g. from the data set's transcription file) before the two metrics below can be computed.
text = None

# Must match the input shape the model was built with.
timeSteps = 277
window_h  = 64
window_w  = 32
Channels = 1

# Build a batch containing the single test line: (1, timeSteps, window_h, window_w, Channels).
X_curr = get_features_predict (test_image)
X_predict = np.ones([1, timeSteps, window_h, window_w, Channels])
X_predict[0,:,:,:,0] = X_curr
net_out_value = predict_model.predict(X_predict)
pred_texts = decode_predict_ctc(net_out_value)

if text is None:
    # No ground truth available: only the prediction can be shown.
    Sequ_Siml = None
    Edit_dist = None
else:
    # Similarity ratio in [0, 1] (1 = identical) and Levenshtein edit distance to the ground truth.
    Sequ_Siml = SequenceMatcher(None,pred_texts[0], text).ratio()
    Edit_dist = distance.levenshtein(text, pred_texts[0])

## Similarity and Edit distance measure on Training lines

In [0]:
0.508923186643
22.2311178248
Model_01          662 samples

0.7942032981
9.92145015106
Model_02          662 samples

0.856698319518
7.01208459215
Model_03          662 samples

0.897841427648
5.10574018127
Model_04          662 samples

0.91086792963
4.43051359517
Model_05          662 samples

0.913104716847
4.38066465257
Model_06          662 samples

0.93418817595
3.31117824773
Model_07          662 samples

0.945191130963
2.83232628399
Model_08          662 samples

0.938934974773
3.00755287009
Model_09          662 samples

0.943225273913
2.96525679758
Model_10          662 samples

0.95160941778
2.48489425982
Model_11          662 samples

0.957841669587
2.16012084592
Model_12          662 samples

0.968514753688
1.6586102719
Model_13          662 samples

0.984877178248
0.820241691843
Model_14          662 samples

0.987583568244
0.688821752266
Model_15          662 samples

0.989319410784
0.616314199396
Model_16          662 samples

0.989877622172
0.569486404834
Model_17          662 samples  

## Similarity and Edit distance measure on Test lines

In [None]:
0.501957501476
22.6963746224
Model_01       Full Data

0.784521773509
10.5453172205
Model_02       Full Data

0.843822981689
7.80966767372
Model_3        Full Data

0.882099479777
5.95770392749
Model_4        Full Data

0.891063369234
5.45317220544
Model_5       Full Data

0.892110061667
5.58912386707
Model_6       Full Data

0.908403082921
4.64803625378
Model_7       Full Data

0.917260130158
4.24773413897
Model_08      Full Data

0.906333122088
4.57401812689
Model_09       Full Data

0.908946897993
4.59516616314
Model_10       Full Data

0.912874817945
4.43655589124
Model_11       Full Data

0.916891944741
4.13444108761
Model_12       Full Data

0.927065501785
3.68126888218
Model_13       Full Data

0.940975214411
3.02870090634
Model_14       Full Data

0.941144969955
2.99848942598
Model_15       Full Data

0.941704667352
2.95770392749
Model_16       Full Data

0.942238338125
2.93957703927
Model_17       Full Data