# Importing all the packages required for our Model

In [1]:
import numpy as np
import pandas as pd
from PIL import Image, ImageOps 
from skimage.util.shape import view_as_windows
from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization, Flatten
from keras.layers import Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import LSTM
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import RMSprop
from keras.models import load_model

Using TensorFlow backend.


# Transforming the text label into a category label

In [2]:
def text_to_labels(text):
    """Translate a text line into the list of integer class ids of its characters.

    The CTC loss works on class indices rather than characters, so every
    character that occurs in the dataset is given a fixed id (0..78).

    Args:
        text: the transcription of one line (any iterable of characters).

    Returns:
        A list with one integer id per character, in the same order.

    Raises:
        KeyError: if ``text`` contains a character outside the alphabet.
    """
    # Every symbol seen in the dataset, written in class-id order:
    # the position of a symbol inside this string IS its class id.
    symbols = (' !"#&\'()*+,-./'                # ids  0-13
               '0123456789'                     # ids 14-23
               ':;?'                            # ids 24-26
               'ABCDEFGHIJKLMNOPQRSTUVWXYZ'     # ids 27-52
               'abcdefghijklmnopqrstuvwxyz')    # ids 53-78
    char_to_id = {symbol: idx for idx, symbol in enumerate(symbols)}
    return [char_to_id[char] for char in text]

# Transform the data into features and labels

In [3]:
def get_features_label (image_file,repres):
    """Turn one line image and its transcription into model features and a label.

    The image is binarised, resized to a fixed height, padded (or squeezed) to
    a fixed width and finally cut into overlapping vertical windows.

    Args:
        image_file: path (or file object) of the line image, as accepted by
            ``PIL.Image.open``. Assumed to be a single-channel (gray scale)
            image -- TODO confirm; a multi-channel image would not match the
            2-D window shape used below.
        repres: the text written on the line.

    Returns:
        A tuple ``(im_windows_array, label)`` where ``im_windows_array`` holds
        the sliding windows of the normalised image (for a 2-D image:
        ``(def_w - window_w) / window_step + 1 = 277`` windows of shape
        ``(64, 32)``) and ``label`` is the list of class ids produced by
        ``text_to_labels``.

    Raises:
        KeyError: if ``repres`` contains a character unknown to
            ``text_to_labels``.
    """
    # Hyperparameters
    def_h = 64                      # target height; 128 or 32 are alternatives (32 compresses a lot)
    def_w = 2240                    # target width, derived from the widest line of the dataset
    window_w = 32                   # width of one window (64 is another option)
    window_step = 8                 # horizontal stride between two consecutive windows
    window_size = (def_h,window_w)

    # The window step controls the overlap between succeeding windows. A step of
    # 8 with 32-pixel windows gives 75% overlap; 25% and 50% were tried as well
    # and the performance increased with the overlap. Even larger overlaps are
    # possible but cost more memory during training.

    # Read the image and apply a binary threshold. The threshold of 200 was
    # picked after some investigation and can be tuned.
    # The context manager releases the file handle as soon as the pixels are copied.
    with Image.open(image_file) as im:
        im_w,im_h = im.size
        im_arr = np.array(im)
    im_arr[im_arr<=200] = 0
    im_arr[im_arr>200] = 255
    im = Image.fromarray(im_arr)

    # Resize to the desired height while keeping the aspect ratio.
    # Image.resize needs integer pixel sizes: floor division keeps the width an
    # int (plain "/" yields a float on Python 3), and the max() guards against a
    # zero-width result for extremely narrow images.
    new_w = max(1, im_w*def_h//im_h)
    im = im.resize((new_w,def_h),Image.LANCZOS)
    im_w,im_h = im.size

    #return im_w    use this line with the get_max_width function to get the maximum width which is then used
    #to identify the def_w of our model

    # Pad 5 white pixels at the start and the end, then fill the remaining width
    # on the right with black. Lines that are too wide for the padding are
    # squeezed to def_w instead (this distorts their aspect ratio).
    if im_w <= def_w-10:
        im = ImageOps.expand(im,border=(5,0,5,0),fill='white')
        im = ImageOps.expand(im,border=(0,0,def_w-im_w-10,0),fill='black')
    else:
        im = im.resize((def_w,def_h),Image.LANCZOS)
    assert(im.size==(def_w,def_h))

    # Normalise the pixels to [0, 1], then slide the window over the image.
    # view_as_windows returns one row of windows per vertical position; the
    # window is as tall as the image, so only row 0 exists and is kept.
    im_arr = np.array(im)/255.0
    im_windows_array = view_as_windows(im_arr, window_size, step = window_step)[0]

    # Create the label
    label = text_to_labels(repres)

    return im_windows_array, label

## Defining the CTC loss function from Keras

In [4]:
def ctc_lambda_func(args):
    """Compute the CTC batch cost; meant to be wrapped in a Keras ``Lambda`` layer.

    Args:
        args: a sequence of exactly four tensors, in this order:
            predictions, labels, input lengths, label lengths.

    Returns:
        The result of ``K.ctc_batch_cost`` for the batch.
    """
    predictions, true_labels, prediction_lengths, true_label_lengths = args
    # Note the argument order expected by the backend: labels come first.
    ctc_cost = K.ctc_batch_cost(true_labels, predictions,
                                prediction_lengths, true_label_lengths)
    return ctc_cost

# Generating each batch used to fit our Model

In [5]:
def get_train_data(csv_file, train):
    """Endless batch generator feeding ``fit_generator``.

    Reads a CSV with the columns ``Path`` and ``Text``, converts every row with
    ``get_features_label`` and yields ``(inputs, outputs)`` dictionaries keyed
    by the names of the model's input layers and of its ``ctc`` output.

    Args:
        csv_file: path of the CSV file listing the line images and their text.
        train: if true, use all rows in shuffled order (training); otherwise
            use a random sample of 200 rows (validation).

    Yields:
        ``(inputs, outputs)``. Batches hold ``batch_size`` samples, except the
        last batch of each pass over the data, which holds the remainder.
        ``outputs['ctc']`` is a dummy array of zeros with one entry per sample.

    Raises:
        ValueError: on the first ``next()`` if the CSV has no rows (otherwise
            the generator would loop forever without yielding), or, in
            validation mode, if it has fewer than 200 rows (raised by pandas).
    """
    # Some parameters of the model
    batch_size = 40              # largest batch size that did not exhaust the memory
    timeSteps  = 277             # (def_w - window_w) / window_step + 1
    window_h   = 64              # window height, see get_features_label
    window_w   = 32              # window width, see get_features_label
    seq_len    = 90              # longest line in the data is 87 characters, plus a margin of 3
    channels   = 1               # gray scale images

    # Depending on `train`, generate the training or the validation batches.
    if train:
        data_df = pd.read_csv(csv_file).sample(frac=1)
    else:
        data_df = pd.read_csv(csv_file).sample(n=200)

    if data_df.empty:
        raise ValueError('No samples found in %r' % (csv_file,))

    image_dir = 'lines/'
    while True:
        count = 0
        for path,text in zip(data_df.Path,data_df.Text):
            X_curr, Y_curr = get_features_label(image_dir + path, text)

            # Fresh arrays for every batch: the arrays already yielded may still
            # be referenced by the consumer, so they must not be overwritten.
            if count == 0:
                X_train = np.ones([batch_size, timeSteps, window_h, window_w, channels])
                Y_train = np.ones([batch_size, seq_len]) * -1    # -1 marks unused label slots
                input_length = np.zeros([batch_size, 1])
                label_length = np.zeros([batch_size, 1])

            X_train[count,:,:,:,0] = X_curr
            Y_train[count,0:len(Y_curr)] = Y_curr
            # Every image yields the same number of windows and the model keeps
            # all of them, so every sample (including the first one of a batch)
            # reports the full number of timesteps to the CTC loss.
            input_length[count] = timeSteps
            label_length[count] = len(Y_curr)
            count += 1

            if count >= batch_size :
                count = 0
                inputs = {'the_input': X_train,
                  'the_labels': Y_train,
                  'input_length': input_length,
                  'label_length': label_length }
                outputs = {'ctc': np.zeros([batch_size])}
                yield (inputs, outputs)

        # The final batch is not necessarily full, so yield only the rows that were filled.
        if 0 < count < batch_size:
            inputs = {'the_input': X_train[:count],
                  'the_labels': Y_train[:count],
                  'input_length': input_length[:count],
                  'label_length': label_length[:count] }
            outputs = {'ctc': np.zeros([count])}
            yield (inputs, outputs)

In [6]:
# Input parameters (they must agree with the values used by the batch generator)
timeSteps = 277    # number of windows per line image
window_h  = 64     # window height
window_w  = 32     # window width
Char_Num  = 80     # 79 characters of the alphabet + 1 class left for the CTC blank
seq_len   = 90     # maximum label length

# The input layer of our model: one gray scale window per timestep
input_lines = Input(shape=(timeSteps, window_h, window_w, 1), name='the_input')

# One technique to connect a CNN with an LSTM is the TimeDistributed wrapper: a set of timesteps is defined,
# the CNN processes one timestep at a time and produces an output, which is then flattened and passed through a
# dense layer to produce a vector. For the defined timesteps we therefore obtain a matrix with one row per
# timestep, and this matrix is the input of the LSTM network.

# First conv layer; the non-symmetric max pooling (2, 1) turns the 64x32 window into a square 32x32 map
Conv_1 = TimeDistributed(Conv2D(16, (3,3), padding= 'same'),name="Conv_1")(input_lines)
Acti_1 = TimeDistributed(Activation("relu"),name="Acti_1")(Conv_1)
Pool_1 = TimeDistributed(MaxPooling2D(pool_size=(2, 1)),name="Pool_1")(Acti_1)
Norm_1 = TimeDistributed(BatchNormalization(),name="Norm_1")(Pool_1)

# Second conv layer
Conv_2 = TimeDistributed(Conv2D(32, (3,3), padding= 'same'),name="Conv_2")(Norm_1)
Acti_2 = TimeDistributed(Activation("relu"),name="Acti_2")(Conv_2)
Pool_2 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_2")(Acti_2)
Norm_2 = TimeDistributed(BatchNormalization(),name="Norm_2")(Pool_2)
#Drop_2 = TimeDistributed(Dropout(0.25),name="Drop_2")(Norm_2)

# Third conv layer; dropout is used from this layer on
Conv_3 = TimeDistributed(Conv2D(64, (3,3), padding= 'same'),name="Conv_3")(Norm_2)
Acti_3 = TimeDistributed(Activation("relu"),name="Acti_3")(Conv_3)
Pool_3 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_3")(Acti_3)
Norm_3 = TimeDistributed(BatchNormalization(),name="Norm_3")(Pool_3)
Drop_3 = TimeDistributed(Dropout(0.2),name="Drop_3")(Norm_3)

# Fourth conv layer
Conv_4 = TimeDistributed(Conv2D(128, (3,3), padding= 'same'),name="Conv_4")(Drop_3)
Acti_4 = TimeDistributed(Activation("relu"),name="Acti_4")(Conv_4)
Pool_4 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_4")(Acti_4)
Norm_4 = TimeDistributed(BatchNormalization(),name="Norm_4")(Pool_4)
Drop_4 = TimeDistributed(Dropout(0.25),name="Drop_4")(Norm_4)

# Fifth conv layer
Conv_5 = TimeDistributed(Conv2D(256, (3,3), padding= 'same'),name="Conv_5")(Drop_4)
Acti_5 = TimeDistributed(Activation("relu"),name="Acti_5")(Conv_5)
Pool_5 = TimeDistributed(MaxPooling2D(pool_size=(2, 2)),name="Pool_5")(Acti_5)
Norm_5 = TimeDistributed(BatchNormalization(),name="Norm_5")(Pool_5)
Drop_5 = TimeDistributed(Dropout(0.3),name="Drop_5")(Norm_5)

# Flattening and dense layer: one 256-dimensional feature vector per timestep
Flat_1 = TimeDistributed(Flatten(),name="Flat_1")(Drop_5)
Dens_1 = TimeDistributed(Dense(256,activation='relu',name='Dens_1'))(Flat_1)

# The recurrent part is made of two bidirectional stages. Each stage consists of two LSTMs that consume the
# sequence from opposite directions (forward and backward). The outputs of the first stage are added and fed
# to the second stage, whose outputs are concatenated and passed to the final classifying dense layer.


def reverse_time_axis(sequence):
    """Flip a (batch, time, features) tensor along its time axis."""
    return K.reverse(sequence, axes=1)


# First layer of bidirectional LSTMs
lstm_1 = LSTM(256, return_sequences=True, name='lstm_1')(Dens_1)
lstm_1b = LSTM(256, return_sequences=True, go_backwards=True, name='lstm_1b')(Dens_1)

# An LSTM with go_backwards=True returns its sequence in reversed time order, so it is flipped back before
# merging; otherwise timestep t of the forward pass would be combined with timestep T-1-t of the backward pass.
lstm_1b_aligned = Lambda(reverse_time_axis, name='lstm_1b_aligned')(lstm_1b)

# Adding the (time-aligned) outputs of the two LSTMs of the first stage
lstm1_merged = add([lstm_1, lstm_1b_aligned])

# Second layer of bidirectional LSTMs
lstm_2 = LSTM(256, return_sequences=True, name='lstm_2')(lstm1_merged)
lstm_2b = LSTM(256, return_sequences=True, go_backwards=True, name='lstm_2b')(lstm1_merged)
lstm_2b_aligned = Lambda(reverse_time_axis, name='lstm_2b_aligned')(lstm_2b)


# Final classification dense layer: a softmax over the character classes for every timestep
Dens_f = Dense(Char_Num, name='Dens_f')(concatenate([lstm_2, lstm_2b_aligned]))
Acti_f = Activation('softmax',name="Acti_f")(Dens_f)    #y_pred

# The model above predicts a class for each timestep. These per-window predictions still have to be compressed
# into the (shorter) text sequence. For this we use the predefined CTC loss function, which takes the output of
# the classification layer, the true label, the length of the classification output (timesteps) and the length
# of the label sequence (original text).

labels = Input(name='the_labels', shape=[seq_len], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Keras doesn't currently support loss functions with extra parameters, so the CTC loss is implemented in a
# Lambda layer. Its output is the quantity the optimizer minimizes.
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([Acti_f, labels, input_length, label_length])


# We finally define our model and show its summary
line_model = Model(inputs=[input_lines, labels, input_length, label_length], outputs=loss_out)
line_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          (None, 277, 64, 32,  0                                            
__________________________________________________________________________________________________
Conv_1 (TimeDistributed)        (None, 277, 64, 32,  160         the_input[0][0]                  
__________________________________________________________________________________________________
Acti_1 (TimeDistributed)        (None, 277, 64, 32,  0           Conv_1[0][0]                     
__________________________________________________________________________________________________
Pool_1 (TimeDistributed)        (None, 277, 32, 32,  0           Acti_1[0][0]                     
__________________________________________________________________________________________________
Norm_1 (Ti

In [7]:
# Compile with the rmsprop optimizer. The CTC cost is already computed inside the
# model by the 'ctc' layer, so the Keras-side "loss" only hands that output through.
line_model.compile(
    loss={'ctc': lambda y_true, y_pred: y_pred},
    optimizer='rmsprop',
)

# Checkpoint callback: stores the complete model after every epoch so that each
# epoch can be evaluated afterwards.
checkpointer = ModelCheckpoint(
    filepath='Saved_Models/Model.{epoch:02d}.hdf5',
    period=1,
    mode='auto',
    save_weights_only=False,
    save_best_only=False,
    verbose=1,
)

# Learning-rate watcher: halves the learning rate when val_loss has not improved
# by at least 0.001 for 4 epochs, to limit overfitting.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    mode='auto',
    factor=0.5,
    patience=4,
    min_delta=0.001,
    cooldown=0,
    min_lr=0,
    verbose=1,
)

In [None]:
# Build the training and the validation generators, then train the model.
# steps_per_epoch is meant to be the number of batches in one pass over the
# training file (its length divided by the batch size of 40); validation_steps
# is the 200 validation samples divided by the batch size, i.e. 5.

train_file_id = 'Train_Lines.csv'
valid_file_id = 'Test_Lines.csv'

train_generator = get_train_data(train_file_id, True)
valid_generator = get_train_data(valid_file_id, False)

line_model.fit_generator(
    train_generator,
    epochs=50,
    steps_per_epoch=311,
    validation_data=valid_generator,
    validation_steps=5,
    callbacks=[checkpointer, reduce_lr],
)

Epoch 1/10

Epoch 00001: saving model to Saved_Models_5/Model.01.hdf5
Epoch 2/10

Epoch 00002: saving model to Saved_Models_5/Model.02.hdf5
Epoch 3/10

Epoch 00003: saving model to Saved_Models_5/Model.03.hdf5
Epoch 4/10

Epoch 00004: saving model to Saved_Models_5/Model.04.hdf5
Epoch 5/10

Epoch 00005: saving model to Saved_Models_5/Model.05.hdf5
Epoch 6/10

Epoch 00006: saving model to Saved_Models_5/Model.06.hdf5
Epoch 7/10

Epoch 00007: saving model to Saved_Models_5/Model.07.hdf5
Epoch 8/10