# Importing all the packages required for our Model

In [1]:
import os
import itertools
import codecs
import re
import datetime
import numpy as np
import pandas as pd
from scipy import ndimage
from PIL import Image, ImageOps 
import distance
from difflib import SequenceMatcher

import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization
from keras.layers import Reshape, Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.recurrent import GRU, LSTM
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import keras.callbacks

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Helper method to transform the output classes into text using a reverse dictionary

In [3]:
# Character classes: maps every supported character to its integer class id.
# NOTE(review): presumably these ids must match the ones used when the loaded weights were trained - confirm
# before editing the table.
alphabet = {' ': 0 , '!': 1, '"': 2, '#': 3, "&": 4, "'": 5, '(': 6, ')': 7, '*': 8, '+': 9, ',': 10, '-': 11,
             '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22,
             '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33,
             'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44,
             'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, 'a': 53, 'b': 54, 'c': 55,
             'd': 56, 'e': 57, 'f': 58, 'g': 59, 'h': 60, 'i': 61, 'j': 62, 'k': 63, 'l': 64, 'm': 65, 'n': 66,
             'o': 67, 'p': 68, 'q': 69, 'r': 70, 's': 71, 't': 72, 'u': 73, 'v': 74, 'w': 75, 'x': 76, 'y': 77,
             'z': 78}

# Inverse lookup: class id -> character
reverse_alphabet = {i: char for char, i in alphabet.items()}

# Reverse translation of numerical classes back to characters
def labels_to_text(labels):
    """Convert a sequence of class ids into the text it encodes.

    Args:
        labels: iterable of integer class ids (plain ints or numpy integers).

    Returns:
        str: the decoded text. The CTC blank class (id ``len(alphabet)``) and
        negative ids (the Keras CTC decoder pads/marks blanks with -1)
        contribute nothing to the output.

    Raises:
        KeyError: if an id is larger than the blank class id, i.e. it is not
        part of the alphabet at all.
    """
    blank = len(alphabet)  # CTC Blank
    chars = []
    for c in labels:
        # Skip the blank class and the -1 padding produced by ctc_decode
        if c == blank or c < 0:
            continue
        chars.append(reverse_alphabet[c])
    return "".join(chars)

## Defining the CTC loss function from Keras

In [6]:
def ctc_lambda_func(args):
    """Compute the CTC batch cost inside a Lambda layer.

    Args:
        args: sequence of exactly four tensors, in this order:
            the softmax predictions, the ground-truth labels, the length of
            each prediction sequence and the length of each label sequence.

    Returns:
        The tensor produced by ``K.ctc_batch_cost`` for the given batch.
    """
    predictions, truth, prediction_lengths, truth_lengths = args
    # Note the argument order: ctc_batch_cost takes the labels first
    return K.ctc_batch_cost(truth, predictions, prediction_lengths, truth_lengths)

# Model

In [7]:
# Input Parameters
img_h = 64  # image height fed to the network
img_w = 2120  # calculated from the actual data after rescaling to height 64
len_alphabets = len(alphabet)+1  # number of output classes: every alphabet character plus the CTC blank
num_char_per_seq = 90  # length of the label vector fed to the 'the_labels' input

# Network parameters
kernel_size = (3, 3)
pool_size = 2
time_dense_size = 256
rnn_size = 256
minibatch_size = 64  # not referenced anywhere in the code visible in this notebook
act = 'relu'

# The network is fed images laid out as (width, height, channels): width is the first (time) axis
input_shape = (img_w, img_h, 1)

# The input layer of our model:
input_data = Input(name='the_input', shape=input_shape, dtype='float32')

# First Conv_Layer: conv -> 2x2 max-pool -> batch norm
X = Conv2D(16, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv1')(input_data)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(X)
X = BatchNormalization(name="BN1")(X)

# Second Conv_Layer: conv -> 2x2 max-pool -> batch norm -> dropout
X = Conv2D(32, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv2')(X)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(X)
X = BatchNormalization(name="BN2")(X)
X = Dropout(0.1)(X)

# Third Conv_Layer: conv -> dropout -> 2x2 max-pool -> batch norm (last pooling layer of the network)
X = Conv2D(48, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv3')(X)
X = Dropout(0.2)(X)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max3')(X)
X = BatchNormalization(name="BN3")(X)

# Fourth Conv_Layer: conv -> dropout -> batch norm (no pooling)
X = Conv2D(64, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv4')(X)
X = Dropout(0.2)(X)
X = BatchNormalization(name="BN4")(X)

# Fifth Conv_Layer: conv -> batch norm (no pooling)
X = Conv2D(80, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv5')(X)
X = BatchNormalization(name="BN5")(X)

# One of the techniques used to connect a CNN with an LSTM is to reshape the CNN output to the
# dimensions expected by the LSTM, i.e. (sample, time_steps, features). Here, "sample" is the size of
# your minibatch, "time_steps" is the length of a sequence, since recurrent neural networks are
# designed to process time-series, and "features" is the dimension of each element of the time-series.
# Both spatial axes were halved by each of the three max-pooling layers (hence pool_size ** 3), and the
# literal 80 is the number of filters of conv5. With the values above this gives (265, 640).
conv_to_rnn_dims = (img_w // (pool_size ** 3), (img_h // (pool_size ** 3)) * 80)
X = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(X)

# cuts down input size going into RNN:
X = Dense(time_dense_size, activation=act, name='dense1')(X)

# We have a bidirectional LSTM network that consists of two LSTMs, where each of them takes the input and consumes it
# from a different direction (forward and backward). We then take the outputs of the two LSTMs and add them together.
# We then pass the merged output to another bidirectional LSTM network. We then take the outputs of these two LSTMs,
# concatenate them and pass the result to our final classifying dense layer.
# NOTE(review): per the Keras docs, go_backwards=True returns the output sequence in reversed time order, so the
# add/concatenate below appear to combine forward step t with backward step T-1-t - confirm this is intended.
# Changing it would presumably make the trained weights loaded later in the notebook unusable.

# First layer of bidirectional LSTMs
lstm_1 = LSTM(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='lstm1')(X)
lstm_1b = LSTM(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm1_b')(X)

# adding the output of the two LSTMs of the previous layer
lstm1_merged = add([lstm_1, lstm_1b])

# Second layer of bidirectional LSTMs
lstm_2 = LSTM(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='lstm2')(lstm1_merged)
lstm_2b = LSTM(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm2_b')(lstm1_merged)

# transforms RNN output to character activations:
X = Dense(len_alphabets, kernel_initializer='he_normal', name='dense2')(concatenate([lstm_2, lstm_2b]))
y_pred = Activation('softmax', name='softmax')(X)

# With the layers above we get a class distribution for each timestep. Now the idea is to define a function
# that will compress these per-timestep classifications into the length of the text sequence. For this we use the
# predefined CTC loss function, which takes the output of the classification layer, the true labels, the length of the
# classification layer output (timesteps) and the length of the label sequence (original text).

labels = Input(name='the_labels', shape=[num_char_per_seq], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Keras doesn't currently support loss funcs with extra parameters so CTC loss is implemented in a lambda layer.
# Its output is the loss value that our optimizer should minimize.
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])

# Define an optimizer
rms = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)

# We finally define our (training) model, whose single output is the CTC loss, and show its summary
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          (None, 2120, 64, 1)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 2120, 64, 16) 160         the_input[0][0]                  
__________________________________________________________________________________________________
max1 (MaxPooling2D)             (None, 1060, 32, 16) 0           conv1[0][0]                      
__________________________________________________________________________________________________
BN1 (BatchNormalization)        (None, 1060, 32, 16) 64          max1[0][0]                       
__________________________________________________________________________________________________
conv2 (Con

## Compile and load the weights of the trained model and define the prediction model

In [None]:
# Checkpoint holding the weights of the trained model
weight_file = 'Reshape_Model_Weights/2018_06_29_15_33_15/weights27.h5'


def _ctc_passthrough_loss(y_true, y_pred):
    """Return the model output unchanged: the 'ctc' Lambda layer already computes the loss."""
    return y_pred


# Compile the training graph, then restore the trained weights into it
model.compile(optimizer=rms, loss={'ctc': _ctc_passthrough_loss})
model.load_weights(weight_file)

# Prediction model: shares the layers of `model` but stops at the softmax output instead of the CTC loss
model_p = Model(inputs=input_data, outputs=y_pred)

## Data preparation of the image to be passed to the model

In [10]:
# It is important to highlight that our model was created and trained to handle gray scale images only.
# prepare_test_image converts images stored in any other mode (e.g. RGB or 1-bit black/white) to 8-bit gray scale
# before thresholding, but it still expects the pixel values to span the range 0 to 255.
# Please note that some gray scale images are saved as binary images with only the values 0 and 1. With the default
# threshold such images end up entirely black, so it is the responsibility of the user of this model to rescale
# them to 0..255 first.

def prepare_test_image (image_file, h, w, threshold=200, white_margin=6):
    """Load an image and turn it into a fixed-size array for the model.

    Steps: binarize with ``threshold``, rescale to height ``h`` (keeping the
    aspect ratio), then pad on the right - or squeeze horizontally - so the
    final width is exactly ``w``.

    Args:
        image_file: path or file object accepted by ``PIL.Image.open``.
        h: target height in pixels (64 is used in this notebook).
        w: target width in pixels (2120 is used in this notebook).
        threshold: gray values <= threshold become black, the others white.
            200 was chosen based on some investigation; it is a
            hyperparameter that can be tuned.
        white_margin: number of white columns appended to the right of the
            rescaled image before the remaining width is filled with black.

    Returns:
        numpy.ndarray of shape (h, w) with float values in [0, 1]. Values
        are not strictly 0 or 1 because LANCZOS resampling introduces
        intermediate gray levels.
    """
    def_h, def_w = h, w

    # Read the pixels while the file is open so that the handle is always released
    with Image.open(image_file) as src:
        gray = src if src.mode == 'L' else src.convert('L')
        im_arr = np.array(gray)

    # Binary thresholding: 0 (black) for dark pixels, 255 (white) for bright ones
    binary = np.where(im_arr > threshold, 255, 0).astype(np.uint8)
    im = Image.fromarray(binary)

    # Resize the image to the desired height, scaling the width by the same factor so that the
    # aspect ratio is preserved. A degenerate (extremely narrow) image keeps a width of at least 1 pixel.
    im_w, im_h = im.size
    im_w = max(1, im_w * def_h // im_h)
    im = im.resize((im_w, def_h), Image.LANCZOS)

    # We pad the smaller images on the right so that all images have equal dimensions:
    # first a short white margin, then black up to the target width.
    if im_w < def_w:
        gap = def_w - im_w
        # Never add more white columns than there is room for
        white = max(0, min(white_margin, gap))
        im = ImageOps.expand(im, border=(0, 0, white, 0), fill='white')
        im = ImageOps.expand(im, border=(0, 0, gap - white, 0), fill='black')
    # Just in case, if we get an image wider than our target width, we shrink it horizontally.
    elif im_w > def_w:
        im = im.resize((def_w, def_h), Image.LANCZOS)
    assert im.size == (def_w, def_h)

    # Scale the pixel values from 0..255 down to 0..1
    return np.array(im) / 255

# Decoding the output of the model using CTC layer

The CTC decoder is a very powerful tool. It takes the output of the activation layer, which has the shape
(batchsize_predict, timesteps, num_classes). In our case we have only 1 image for each prediction, timesteps is 265
(the image width of 2120 divided by 2^3 because of the three max-pooling layers) and num_classes is 80, which
corresponds to the 79 characters that we have in the alphabet dictionary + an additional class which is the
blank label used by the CTC (the decoder reports blank/padding positions with the id -1). We use the decoding with
greedy = True, which returns the most probable path only. There is another variation where greedy is set to False;
in that case the decoder will return a set of top_paths probable output sequences, and it is advisable to pass an
external dictionary that constrains the decoding to certain words. We think that this might be a good way to improve
the decoding capability of the model. However, it is not an easy extension because at the moment neither the Keras nor
the tensorflow implementation of the CTC decode function has this option, so we would have to implement it ourselves.
There seems to be an implementation on github https://github.com/githubharald/CTCWordBeamSearch but it is only for
tensorflow. Anyway, this might be a good direction for enhancing our results.


In [None]:
def decode_predict_ctc(out, top_paths = 1):
    """Decode the softmax output of the prediction model into text.

    Args:
        out: numpy array of shape (batch, timesteps, num_classes) as returned
            by ``model_p.predict``. Only the first batch element is decoded.
        top_paths: number of candidate transcriptions to return. With 1 the
            fast greedy (best path) decoder is used; with more than 1 a beam
            search is used, because the greedy decoder only ever returns a
            single path.

    Returns:
        list of str: ``top_paths`` decoded texts (empty list if
        ``top_paths`` is smaller than 1).
    """
    if top_paths < 1:
        return []

    # The beam has to be at least as wide as the number of requested paths
    beam_width = max(5, top_paths)
    # Every sample uses the full number of timesteps
    input_length = np.ones(out.shape[0]) * out.shape[1]

    # Decode once; the result holds one dense label tensor per returned path
    decoded = K.ctc_decode(out, input_length=input_length, greedy=(top_paths == 1),
                           beam_width=beam_width, top_paths=top_paths)[0]

    results = []
    for path in decoded[:top_paths]:
        lables = K.get_value(path)[0]
        # Drop the -1 values the decoder uses for padding
        results.append(labels_to_text(lables[lables >= 0]))
    return results
  
def predit_a_image(a, top_paths = 1):
    """Run the prediction model on one prepared image and decode the result.

    Args:
        a: 2-D array of shape (height, width), e.g. the output of
            ``prepare_test_image``.
        top_paths: number of candidate transcriptions to return.

    Returns:
        list of str: the decoded texts returned by ``decode_predict_ctc``.

    Raises:
        ValueError: if ``a`` is not a 2-D array.
    """
    a = np.asarray(a)
    if a.ndim != 2:
        raise ValueError("expected a 2-D (height, width) image array, got shape %s" % (a.shape,))
    # The model input is (batch, width, height, channels): transpose, then add the batch and channel axes
    c = a.T[np.newaxis, :, :, np.newaxis]
    net_out_value = model_p.predict(c)
    return decode_predict_ctc(net_out_value, top_paths)

### Prediction Example

In [100]:
# Image geometry expected by the network input
h = 64
w = 2120

test_image = 'lines/a05/a05-113/a05-113-01.png'

# Prepared image: (h, w) array of pixel intensities
a = prepare_test_image(test_image, h, w)

# Transpose to (w, h) and add the batch and channel axes -> (1, w, h, 1)
X1 = a.T.reshape((1, w, h, 1))

net_out_value = model_p.predict(X1)
pred_text = decode_predict_ctc(net_out_value)
print(pred_text)


a (64, 2120)
X1 (1, 2120, 64, 1)


['Government hac arhed he Medical Revearch']