## Import Library and Download Data

In [1]:
!pip install -U --no-cache-dir gdown --pre
!gdown --id 1j_0RvUWpFchxW4BaZERrJx7oZOz7tFc4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0
Downloading...
From: https://drive.google.com/uc?id=1j_0RvUWpFchxW4BaZERrJx7oZOz7tFc4
To: /content/data_ocr.zip
100% 370M/370M [00:09<00:00, 39.9MB/s]


In [None]:
!unzip data_ocr.zip

In [3]:
!rm -rf data_ocr.zip

In [4]:
import json
import cv2
import os, random
import numpy as np
import itertools
import editdistance
import json


import tensorflow as tf 
import keras

from keras.preprocessing import image
from tensorflow.keras.utils import load_img, img_to_array 
from keras import applications
from keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import KFold

from keras.layers import Input, Dense, Activation, Bidirectional, Dropout
from keras.layers import Reshape, Lambda, BatchNormalization
from keras.layers import LSTM
from keras.layers import add, concatenate

from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

## Processing data

In [5]:
letters = " #'()+,-./:0123456789ABCDEFGHIJKLMNOPQRSTUVWXYabcdeghiklmnopqrstuvxyzÂÊÔàáâãèéêìíòóôõùúýăĐđĩũƠơưạảấầẩậắằẵặẻẽếềểễệỉịọỏốồổỗộớờởỡợụủỨứừửữựỳỵỷỹ"
MAX_LEN = 70
SIZE = 2560, 160
CHAR_DICT = len(letters) + 1

def text_to_labels(text):
    return list(map(lambda x: letters.index(x), text))

def labels_to_text(labels):
    return ''.join(list(map(lambda x: letters[x] if x < len(letters) else "", labels)))

def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    y_pred = y_pred[:, 2:, :]
    return keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

def decode_batch(out):
    ret = []
    for j in range(out.shape[0]):
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        outstr = labels_to_text(out_best)
        ret.append(outstr)
    return ret

class TextImageGenerator:
    def __init__(self, img_dirpath, labels_path, img_w, img_h,
                 batch_size, downsample_factor, idxs, training=True, max_text_len=9, n_eraser=5):
        self.img_h = img_h
        self.img_w = img_w
        self.batch_size = batch_size
        self.max_text_len = max_text_len
        self.idxs = idxs
        self.downsample_factor = downsample_factor
        self.img_dirpath = img_dirpath                  
        self.labels= json.load(open(labels_path)) if labels_path != None else None
        self.img_dir = os.listdir(self.img_dirpath)   
        if self.idxs is not None:
            self.img_dir = [self.img_dir[idx] for idx in self.idxs]

        self.n = len(self.img_dir) # có một file labels.json                      
        if "labels.json" in self.img_dir:
          self.n -= 1
        self.indexes = list(range(self.n))
        self.cur_index = 0
        self.imgs = np.zeros((self.n, self.img_h, self.img_w, 3), dtype=np.float16)
        self.training = training
        self.texts = []

    def build_data(self):
        print(self.n, " Image Loading start... ", self.img_dirpath)
        count = 0
        for i, img_file in enumerate(self.img_dir):
            prefit = img_file.split(".")[-1]
            if prefit == "json":
              # print(prefit)
              continue
              
            img = load_img(self.img_dirpath + img_file, target_size=SIZE[::-1])
            img = img_to_array(img)
            img = preprocess_input(img).astype(np.float16)
            self.imgs[count] = img
            count+=1
            if self.labels != None: 
                self.texts.append(self.labels[img_file])
            else:
                #valid mode
                self.texts.append('')
        print("Image Loading finish...")

    def next_sample(self):
        self.cur_index += 1
        if self.cur_index >= self.n:
            self.cur_index = 0
            random.shuffle(self.indexes)
        return self.imgs[self.indexes[self.cur_index]].astype(np.float32), self.texts[self.indexes[self.cur_index]]

    def next_batch(self):
        while True:
            X_data = np.zeros([self.batch_size, self.img_w, self.img_h, 3], dtype=np.float32)     
            Y_data = np.zeros([self.batch_size, self.max_text_len], dtype=np.float32)             
            input_length = np.ones((self.batch_size, 1), dtype=np.float32) * (self.img_w // self.downsample_factor - 2)  
            label_length = np.zeros((self.batch_size, 1), dtype=np.float32)          

            for i in range(self.batch_size):
                img, text = self.next_sample()
                img = img.transpose((1, 0, 2))
                
                X_data[i] = img
                Y_data[i,:len(text)] = text_to_labels(text)
                label_length[i] = len(text)

            inputs = {
                'the_inputs': X_data,  
                'the_labels': Y_data,  
                'input_length': input_length,  
                'label_length': label_length  
            }
            outputs = {'ctc': np.zeros([self.batch_size])}  
            yield (inputs, outputs)

# Building model

In [6]:
# Backbone CNN 

def get_model(input_shape, training):
    inputs = Input(name='the_inputs', shape=input_shape, dtype='float32')
    base_model = applications.VGG16(weights='imagenet', include_top=False)
    
    inner = base_model(inputs)
    inner = Reshape(target_shape=(int(inner.shape[1]), -1), name='reshape')(inner)
    inner = Dense(512, activation='relu', kernel_initializer='he_normal', name='dense1')(inner) 

    lstm = LSTM(512, return_sequences=True, kernel_initializer='he_normal', name='lstm1', dropout=0.25, recurrent_dropout=0.25)(inner) 

    y_pred = Dense(CHAR_DICT, activation='softmax', kernel_initializer='he_normal',name='dense2')(lstm)
    
    labels = Input(name='the_labels', shape=[MAX_LEN], dtype='float32')
    input_length = Input(name='input_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])
    
    if training:
        return Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out) 
    else:
        return Model(inputs=[inputs], outputs=y_pred)

# Training Model

In [9]:
def train_kfold(idx, kfold, datapath, labelpath,  epochs, batch_size, lr):

    model = get_model((*SIZE, 3), training=True)
    opt = Adam(lr=lr)
    model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=opt)

    ## load data
    train_idx, valid_idx = kfold[idx]
    train_generator = TextImageGenerator(datapath, labelpath, *SIZE, batch_size, 32, train_idx, True, MAX_LEN)
    train_generator.build_data()
    valid_generator  = TextImageGenerator(datapath, labelpath, *SIZE, batch_size, 32, valid_idx, False, MAX_LEN)
    valid_generator.build_data()

    ## callbacks

    weight_path = 'best_%d.h5' % idx
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10, mode='min', verbose=1)
    ckp = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True)

    #if finetune:
       #print('load pretrain model')
       #model.load_weights(weight_path)

    model.fit_generator(generator=train_generator.next_batch(),
                    steps_per_epoch=int(len(train_idx) / batch_size),
                    epochs=epochs,
                    callbacks=[ckp, early_stop],
                    validation_data=valid_generator.next_batch(),
                    validation_steps=int(len(valid_idx) / batch_size))
    
def train(datapath, labelpath, epochs, batch_size, lr):
    nsplits = 5

    nfiles = np.arange(len(os.listdir(datapath)))

    kfold = list(KFold(nsplits, random_state=2018, shuffle=True).split(nfiles))
    for idx in range(nsplits):
        train_kfold(idx, kfold, datapath, labelpath, epochs, batch_size, lr)

In [None]:
train_dir = "data_ocr/"
label_path = "data_ocr/labels.json"
epochs = 3
batch_size = 4 
os.environ["CUDA_VISIBLE_DEVICES"]=str(0)
finetune = 0
lr = 0.001 

train(train_dir, label_path, epochs, batch_size, lr)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


  super(Adam, self).__init__(name, **kwargs)


1458  Image Loading start...  data_ocr/
Image Loading finish...
365  Image Loading start...  data_ocr/
Image Loading finish...
Epoch 1/3


  model.fit_generator(generator=train_generator.next_batch(),


Epoch 1: val_loss improved from inf to 211.17636, saving model to best_0.h5
Epoch 2/3
Epoch 2: val_loss improved from 211.17636 to 210.69873, saving model to best_0.h5
Epoch 3/3
Epoch 3: val_loss did not improve from 210.69873




1458  Image Loading start...  data_ocr/
Image Loading finish...
365  Image Loading start...  data_ocr/
Image Loading finish...
Epoch 1/3
Epoch 1: val_loss improved from inf to 210.62976, saving model to best_1.h5
Epoch 2/3
Epoch 2: val_loss did not improve from 210.62976
Epoch 3/3
Epoch 3: val_loss did not improve from 210.62976


