# Importing all the packages required for our Model

In [1]:
import os
import itertools
import codecs
import re
import datetime
import numpy as np
import pandas as pd
from scipy import ndimage
from PIL import Image, ImageOps 
import distance
from difflib import SequenceMatcher

import matplotlib.pyplot as plt
from keras import backend as K
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers import Input, Dense, Activation, Dropout, BatchNormalization
from keras.layers import Reshape, Lambda
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.recurrent import GRU, LSTM
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file
from keras.preprocessing import image
import keras.callbacks

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Root folder for training artefacts; weight checkpoints are written to a per-run sub-folder of it
OUTPUT_DIR = "Reshape_Model_Weights"

# Transforming the text label into a category label

In [3]:
# Supported characters, listed in the order of their integer class ids (0..78):
# punctuation, digits, a few more punctuation marks, upper-case, then lower-case letters.
alphabet = {
    char: index
    for index, char in enumerate(
        ' !"#&\'()*+,-./'
        '0123456789'
        ':;?'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
    )
}

# Inverse lookup: integer class id -> character
reverse_alphabet = {index: char for char, index in alphabet.items()}

# Translation of characters to unique integer values
def text_to_labels(text):
    """Map every character of ``text`` to its integer id in ``alphabet``.

    Returns a list with one id per character, in order. A character that is
    not a key of ``alphabet`` raises ``KeyError``.
    """
    return [alphabet[char] for char in text]

# Generating each batch used to fit our Model

In [4]:
# Uses generator functions to supply train/test with data. 
class TextImageGenerator(keras.callbacks.Callback):
    """Serves (image, label) minibatches for CTC training and checkpoints weights.

    The class is registered as a Keras callback so that the CSV data is loaded
    in ``on_train_begin`` and weights are saved in ``on_epoch_end``; its
    ``next_train`` / ``next_val`` generators are what gets passed to
    ``fit_generator``.
    """

    def __init__(self, run_name, minibatch_size, img_w, img_h, downsample_factor, absolute_max_string_len, train_samples, val_samples):
        """Store the batch geometry and create the output directory for this run.

        Args:
            run_name: sub-directory of ``OUTPUT_DIR`` that receives the weight files.
            minibatch_size: number of samples per generated batch.
            img_w, img_h: width and height every image is brought to.
            downsample_factor: divisor applied to ``img_w`` to obtain the
                ``input_length`` reported for each sample.
            absolute_max_string_len: width of the padded label matrix.
            train_samples, val_samples: number of rows sampled from the
                training / validation CSV files.
        """
        super(TextImageGenerator, self).__init__()
        self.minibatch_size = minibatch_size
        self.img_w = img_w
        self.img_h = img_h
        self.downsample_factor = downsample_factor
        self.blank_label = self.get_output_size() - 1
        self.absolute_max_string_len = absolute_max_string_len
        self.train_samples = train_samples
        self.val_samples = val_samples

        self.output_dir = os.path.join(OUTPUT_DIR, run_name)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

    def get_output_size(self):
        """Return the number of output classes: one per alphabet entry plus one extra."""
        return len(alphabet) + 1

    def build_Data(self, csv_train, csv_val):
        """Load both CSV files, draw the configured number of random rows and reset the cursors.

        The CSV files must provide the columns ``Path`` and ``Text`` (read in
        ``get_batch``).
        """
        # ``sample`` shuffles the rows but keeps their original index labels.
        # Resetting the index makes row position and label coincide, so that
        # batches can be cut out by position.
        self.data_train = pd.read_csv(csv_train).sample(self.train_samples).reset_index(drop=True)
        self.data_val = pd.read_csv(csv_val).sample(self.val_samples).reset_index(drop=True)

        self.image_dir = 'lines/'

        self.cur_train_index = 0
        self.cur_val_index = 0

    def get_batch(self, index, size, train):
        """Assemble one batch of ``size`` samples starting at row position ``index``.

        Args:
            index: position of the first row of the batch in the sampled frame.
            size: number of rows in the batch.
            train: take rows from the training frame if true, else from the
                validation frame.

        Returns:
            ``(inputs, outputs)`` where ``inputs`` is a dict with the keys
            ``the_input``, ``the_labels``, ``input_length``, ``label_length``
            and ``source_str``, and ``outputs`` is ``{'ctc': zeros(size)}``.
        """
        X_data = np.ones([size, self.img_w, self.img_h, 1])
        # Unused label slots stay at -1.
        labels = np.ones([size, self.absolute_max_string_len]) * -1
        input_length = np.zeros([size, 1])
        label_length = np.zeros([size, 1])
        source_str = []

        frame = self.data_train if train else self.data_val
        # Positional slice: always yields at most ``size`` rows, independent of
        # the index labels of the shuffled frame.
        data = frame.iloc[index:index + size]

        for i, (path, text) in enumerate(zip(data.Path, data.Text)):
            X_curr, Y_curr = get_features_label(self.image_dir + path, text, self.img_h, self.img_w)
            # get_features_label returns the image row-major; the model input
            # is laid out as (width, height), hence the transpose.
            X_data[i, 0:self.img_w, :, 0] = X_curr.T
            labels[i, 0:len(Y_curr)] = Y_curr
            input_length[i] = self.img_w // self.downsample_factor
            label_length[i] = len(Y_curr)
            source_str.append(text)

        inputs = {'the_input': X_data,
                  'the_labels': labels,
                  'input_length': input_length,
                  'label_length': label_length,
                  'source_str': source_str  # used for visualization only
                  }
        outputs = {'ctc': np.zeros([size])}  # dummy data for dummy loss function
        return (inputs, outputs)

    def next_train(self):
        """Yield training batches forever, restarting once all sampled rows were served."""
        while True:
            ret = self.get_batch(self.cur_train_index, self.minibatch_size, train=True)
            self.cur_train_index += self.minibatch_size
            if self.cur_train_index >= self.train_samples:
                # The cursor only ever advances in steps of minibatch_size, so this wraps to 0.
                self.cur_train_index = self.cur_train_index % self.minibatch_size
            yield ret

    def next_val(self):
        """Yield validation batches forever, restarting once all sampled rows were served."""
        while True:
            ret = self.get_batch(self.cur_val_index, self.minibatch_size, train=False)
            self.cur_val_index += self.minibatch_size
            if self.cur_val_index >= self.val_samples:
                # The cursor only ever advances in steps of minibatch_size, so this wraps to 0.
                self.cur_val_index = self.cur_val_index % self.minibatch_size
            yield ret

    def on_train_begin(self, logs=None):
        """Load and sample the training / validation CSV files when training starts."""
        csv_train = 'Train_Lines.csv'
        csv_val = 'Test_Lines.csv'
        self.build_Data(csv_train, csv_val)

    def on_epoch_begin(self, epoch, logs=None):
        """No per-epoch setup is required."""
        pass

    def on_epoch_end(self, epoch, logs=None):
        """Save the model weights to the run directory after every third epoch (epoch % 3 == 0)."""
        if epoch % 3 == 0:
            self.model.save_weights(os.path.join(self.output_dir, 'weights%02d.h5' % (epoch)))
        

# Transform the data into features and labels

In [5]:
def get_features_label(image_file, repres, h, w):
    """Turn one line image and its transcription into model features and labels.

    The image is binarised, scaled to height ``h`` with its aspect ratio
    preserved, and then padded on the right (or squeezed) to width ``w``.

    Args:
        image_file: path of the image to load.
        repres: transcription of the image; every character must be a key of
            ``alphabet``.
        h: target image height in pixels.
        w: target image width in pixels.

    Returns:
        ``(im_arr, label)``: ``im_arr`` is the processed image as an array
        scaled by 1/255 (NumPy lays a PIL image out as rows x columns, i.e.
        height x width), and ``label`` is the list of integer class ids
        produced by ``text_to_labels``.

    Raises:
        AssertionError: if the processed image does not end up at size (w, h).
    """
    def_h = h
    def_w = w
    threshold = 200     # pixels above this value become white (255), the rest black (0); tunable
    white_margin = 6    # width of the white strip appended before the black padding

    # Read the pixels inside a ``with`` block so the file handle is released
    # as soon as the data is in memory.
    with Image.open(image_file) as source_im:
        im_arr = np.array(source_im)

    # Binary thresholding, keeping the array's original dtype.
    im_arr = np.where(im_arr > threshold, 255, 0).astype(im_arr.dtype)
    im = Image.fromarray(im_arr)

    # Resize to the desired height while keeping the aspect ratio.
    im_w, im_h = im.size
    scaled_w = max(1, im_w * def_h // im_h)   # never request a zero-width image
    im = im.resize((scaled_w, def_h), Image.LANCZOS)
    im_w, im_h = im.size

    if im_w < def_w:
        # Pad narrower images on the right: first a short white strip, then
        # black up to the full width. The white strip is capped at the
        # available gap so that the black border is never negative.
        gap = def_w - im_w
        white_pad = min(white_margin, gap)
        im = ImageOps.expand(im, border=(0, 0, white_pad, 0), fill='white')
        if gap > white_pad:
            im = ImageOps.expand(im, border=(0, 0, gap - white_pad, 0), fill='black')
    elif im_w > def_w:
        # Wider than the target: squeeze horizontally to fit.
        im = im.resize((def_w, def_h), Image.LANCZOS)
    assert im.size == (def_w, def_h)

    # Scale the pixel values from {0, 255} to {0, 1}.
    im_arr = np.array(im) / 255

    label = text_to_labels(repres)

    return im_arr, label

## Defining the CTC loss function from Keras

In [6]:
def ctc_lambda_func(args):
    """Adapter that lets ``K.ctc_batch_cost`` be used inside a ``Lambda`` layer.

    ``args`` is the sequence ``(y_pred, labels, input_length, label_length)``;
    the backend function expects the labels first, so the first two entries
    are swapped before the call. Returns whatever ``K.ctc_batch_cost`` returns.
    """
    predictions, true_labels, prediction_lengths, true_label_lengths = args
    ctc_cost = K.ctc_batch_cost(true_labels, predictions, prediction_lengths, true_label_lengths)
    return ctc_cost

# Model

In [7]:
# Input parameters
img_h = 64 
img_w = 2120 # calculated from the actual data after rescaling to height 64
# One output class per alphabet character plus one extra class
# (TextImageGenerator.blank_label refers to that last index).
len_alphabets = len(alphabet)+1
# Width of the padded label vector fed to the 'the_labels' input
num_char_per_seq = 90

# Network parameters
kernel_size = (3, 3)
pool_size = 2
time_dense_size = 256
rnn_size = 256
minibatch_size = 64
act = 'relu'

# Images are fed width-first: (width, height, channels)
input_shape = (img_w, img_h, 1)

# The input layer of our model: 
input_data = Input(name='the_input', shape=input_shape, dtype='float32')

# First conv layer
X = Conv2D(16, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv1')(input_data)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max1')(X)
X = BatchNormalization(name="BN1")(X)

# Second conv layer
X = Conv2D(32, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv2')(X)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max2')(X)
X = BatchNormalization(name="BN2")(X)
X = Dropout(0.1)(X)

# Third conv layer
X = Conv2D(48, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv3')(X)
X = Dropout(0.2)(X)
X = MaxPooling2D(pool_size=(pool_size, pool_size), name='max3')(X)
X = BatchNormalization(name="BN3")(X)

# Fourth conv layer (no pooling)
X = Conv2D(64, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv4')(X)
X = Dropout(0.2)(X)
X = BatchNormalization(name="BN4")(X)

# Fifth conv layer (no pooling)
X = Conv2D(80, kernel_size, padding='same', activation=act, kernel_initializer='he_normal', name='conv5')(X)
X = BatchNormalization(name="BN5")(X)

# To connect the CNN to the LSTMs, the feature maps are reshaped to the layout a recurrent layer
# expects, i.e. (samples, time_steps, features): "samples" is the minibatch size, "time_steps" is the
# sequence length and "features" is the size of each sequence element.
# The three max-pooling layers above each divide width and height by pool_size, hence pool_size ** 3.
# The factor 80 is the number of filters of conv5 and must be kept in sync with it.
conv_to_rnn_dims = (img_w // (pool_size ** 3), (img_h // (pool_size ** 3)) * 80)
X = Reshape(target_shape=conv_to_rnn_dims, name='reshape')(X)

# Cuts down the input size going into the RNN:
X = Dense(time_dense_size, activation=act, name='dense1')(X)

# Two stacked bidirectional stages follow. Each stage consists of two LSTMs that consume the same input
# from different directions (forward and backward). The outputs of the first pair are added together and
# passed to the second pair; the outputs of the second pair are concatenated and passed to the final
# classifying dense layer.
# NOTE(review): per the Keras docs, go_backwards=True returns the output sequence in reversed time
# order, so add/concatenate below appear to combine step t of the forward LSTM with step T-1-t of the
# backward one (keras.layers.Bidirectional re-reverses the backward output). Confirm this is intended
# before changing it: existing weight files were trained with this wiring.

# First layer of bidirectional LSTMs
lstm_1 = LSTM(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='lstm1')(X)
lstm_1b = LSTM(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm1_b')(X)

# Adding the outputs of the two LSTMs of the previous layer
lstm1_merged = add([lstm_1, lstm_1b])

# Second layer of bidirectional LSTMs
lstm_2 = LSTM(rnn_size, return_sequences=True, kernel_initializer='he_normal', name='lstm2')(lstm1_merged)
lstm_2b = LSTM(rnn_size, return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm2_b')(lstm1_merged)

# Transforms the RNN output to character activations:
X = Dense(len_alphabets, kernel_initializer='he_normal', name='dense2')(concatenate([lstm_2, lstm_2b]))
y_pred = Activation('softmax', name='softmax')(X)

# The network above produces a class distribution for every time step. These per-step predictions still
# have to be collapsed to the length of the text sequence. For this we use the predefined CTC loss
# function, which takes the output of the classification layer, the true labels, the length of the
# classification output (time steps) and the length of the label sequence (original text).

labels = Input(name='the_labels', shape=[num_char_per_seq], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

# Keras doesn't currently support loss functions with extra parameters, so the CTC loss is implemented
# in a Lambda layer. Its output is the quantity the optimizer minimizes.
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])

# Define an optimizer
rms = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)

# We finally define our model (its single output is the CTC loss) and show its summary
model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_input (InputLayer)          (None, 2120, 64, 1)  0                                            
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 2120, 64, 16) 160         the_input[0][0]                  
__________________________________________________________________________________________________
max1 (MaxPooling2D)             (None, 1060, 32, 16) 0           conv1[0][0]                      
__________________________________________________________________________________________________
BN1 (BatchNormalization)        (None, 1060, 32, 16) 64          max1[0][0]                       
__________________________________________________________________________________________________
conv2 (Con

# Train the Model

In [5]:
def train(run_name, start_epoch, stop_epoch):
    """Compile the global ``model`` and fit it on batches from a ``TextImageGenerator``.

    Args:
        run_name: name of the run; forwarded to the generator, which stores
            weight checkpoints under it.
        start_epoch: epoch index at which training (re)starts
            (``initial_epoch`` of ``fit_generator``).
        stop_epoch: total number of epochs (``epochs`` of ``fit_generator``).

    This function does not load weights itself. To continue from a stored
    checkpoint, load it before fitting, e.g.
    ``model.load_weights(os.path.join(OUTPUT_DIR, '2018_06_29_15_33_15', 'weights33.h5'))``.
    """
    # Both sample counts must be whole multiples of the batch size in the
    # current generator implementation.
    n_train, n_val = 12352, 640
    for n_samples in (n_train, n_val):
        assert n_samples % minibatch_size == 0

    generator_config = dict(
        run_name=run_name,
        minibatch_size=minibatch_size,
        img_w=img_w,
        img_h=img_h,
        downsample_factor=pool_size ** 3,
        absolute_max_string_len=num_char_per_seq,
        train_samples=n_train,
        val_samples=n_val,
    )
    img_gen = TextImageGenerator(**generator_config)

    # The CTC loss is already computed by the layer named 'ctc', so the loss
    # handed to Keras merely passes that layer's output through.
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=rms)

    train_steps = n_train // minibatch_size
    val_steps = n_val // minibatch_size
    model.fit_generator(
        generator=img_gen.next_train(),
        steps_per_epoch=train_steps,
        epochs=stop_epoch,
        validation_data=img_gen.next_val(),
        validation_steps=val_steps,
        callbacks=[img_gen],
        initial_epoch=start_epoch,
    )


In [12]:
# Continue the 'FullData' run: start at epoch index 34 (shown by Keras as "Epoch 35/155")
# and train up to epoch 155.
# NOTE(review): train() does not load any weights itself, so this resumes from whatever
# weights the in-memory `model` currently holds — on a fresh kernel, load the matching
# checkpoint first.
run_name = '2018_07_07_134248_FullData'
train(run_name, 34, 155)

Epoch 35/155
Epoch 36/155
Epoch 37/155
Epoch 38/155
Epoch 39/155
Epoch 40/155
Epoch 41/155
Epoch 42/155
Epoch 43/155
Epoch 44/155
Epoch 45/155
Epoch 46/155
Epoch 47/155
Epoch 48/155
Epoch 49/155
Epoch 50/155
Epoch 51/155
Epoch 52/155
Epoch 53/155
Epoch 54/155
Epoch 55/155
Epoch 56/155
Epoch 57/155
Epoch 58/155
Epoch 59/155
Epoch 60/155
Epoch 61/155
Epoch 62/155
Epoch 63/155
Epoch 64/155
Epoch 65/155
Epoch 66/155
Epoch 67/155
Epoch 68/155
Epoch 69/155
Epoch 70/155
Epoch 71/155
Epoch 72/155
Epoch 73/155
Epoch 74/155
Epoch 75/155
Epoch 76/155
Epoch 77/155
Epoch 78/155
Epoch 79/155
Epoch 80/155
Epoch 81/155
Epoch 82/155
Epoch 83/155
 42/193 [=====>........................] - ETA: 5:40 - loss: 1.7439

KeyboardInterrupt: 