In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import & Prepare Library

In [55]:
!pip install fastwer

import numpy as np # linear algebra
import pandas as pd # data processing
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras

import matplotlib.pyplot as plt
import os
import glob
import cv2
import fastwer #for calculating CER & WER

In [56]:
# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
np.random.seed(63)
tf.random.set_seed(63)

# Data Cleaning

In [57]:
# Load the train / validation / test label tables (one CSV per split).
train=pd.read_csv('../input/handwriting-recognition/written_name_train_v2.csv')
val=pd.read_csv('../input/handwriting-recognition/written_name_validation_v2.csv')
test=pd.read_csv('../input/handwriting-recognition/written_name_test_v2.csv')

**Dealing with Missing Value**

In [58]:
# Summary statistics of the training label table.
train.describe()

In [59]:
# Summary statistics of the validation label table.
val.describe()

In [60]:
# Summary statistics of the test label table.
test.describe()

In [61]:
# Count the rows whose IDENTITY label is missing in each split.
print("Jumlah NAN dalam train set      : ", train['IDENTITY'].isnull().sum())
print("Jumlah NAN dalam validation set : ", val['IDENTITY'].isnull().sum())
# This line reports the test split; it was previously mislabelled "validation set".
print("Jumlah NAN dalam test set       : ", test['IDENTITY'].isnull().sum())

In [62]:
# The number of NaN labels is insignificant compared with the sample size,
# so rows with missing values are dropped from every split.
# Each frame is mutated in place and then re-indexed from 0.

train.dropna(inplace=True)
train.reset_index(drop=True, inplace=True)
test.dropna(inplace=True)
test.reset_index(drop=True, inplace=True)
val.dropna(inplace=True)
val.reset_index(drop=True, inplace=True)

**Dealing with Uppercase & Setting Number of Character**

In [63]:
# Count the labels for which str.isupper() is False (not fully upper case) in each split.
print("Jumlah Not Capital dalam train set      : ", len(train) - train['IDENTITY'].str.isupper().sum())
print("Jumlah Not Capital dalam validation set : ", len(val) - val['IDENTITY'].str.isupper().sum())
print("Jumlah Not Capital dalam test set : ", len(test) - test['IDENTITY'].str.isupper().sum())

In [64]:
# Show up to four training images whose label is not fully upper case.
train_lower = train[~train['IDENTITY'].str.isupper()].reset_index(drop=True)

plt.figure(figsize=(20, 20))

# Bound the loop by the number of matching rows so .loc never misses.
for i in range(min(4, len(train_lower))):
    ax = plt.subplot(2, 2, i+1)
    img_dir = '../input/handwriting-recognition/train_v2/train/'+train_lower.loc[i, 'FILENAME']
    image = cv2.imread(img_dir, cv2.IMREAD_GRAYSCALE)
    plt.imshow(image, cmap = 'gray')
    # Title comes from the same frame as the image (the original read the
    # undefined name `test_lower`, which raised NameError).
    plt.title(train_lower.loc[i, 'IDENTITY'], fontsize=12)
    plt.axis('off')

plt.subplots_adjust(wspace=0.2, hspace=-0.8)

In [65]:

# Keep only training labels whose length lies in [min_name, max_name];
# the over-long rows are set aside in `train21` for inspection.
train['Length']=train['IDENTITY'].apply(lambda x : len(str(x)))
max_name = 16
min_name = 2
train21=train[train['Length']>max_name]
# .copy() detaches the filtered frame, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
train=train[(train['Length']<=max_name) & (train['Length']>=min_name)].copy()
# Normalise every label to upper case. The original note here was truncated;
# presumably the lowercase-labelled images previewed above are actually written
# in capitals — TODO confirm.
train['IDENTITY']=train['IDENTITY'].str.upper()
val['IDENTITY']=val['IDENTITY'].str.upper()
test['IDENTITY']=test['IDENTITY'].str.upper()

In [66]:
# Inspect the training rows set aside for exceeding max_name characters.
train21

**Dealing With Unreadable Character**

In [67]:
# Show the training rows whose label is the literal string 'UNREADABLE'.
train[train['IDENTITY'] == 'UNREADABLE']

In [68]:
# Preview the first six training images labelled 'UNREADABLE' in a 2x3 grid.
unreadable = train[train['IDENTITY'] == 'UNREADABLE']
unreadable.reset_index(inplace = True, drop=True)

plt.figure(figsize=(15, 10))

for i in range(6):
    ax = plt.subplot(2, 3, i+1)
    img_dir = '../input/handwriting-recognition/train_v2/train/'+unreadable.loc[i, 'FILENAME']
    image = cv2.imread(img_dir, cv2.IMREAD_GRAYSCALE)
    plt.imshow(image, cmap = 'gray')
    plt.title(unreadable.loc[i, 'IDENTITY'], fontsize=12)
    plt.axis('off')

plt.subplots_adjust(wspace=0.2, hspace=-0.8)

In [69]:
# The 'UNREADABLE' labels are inconsistent: even a human reader struggles to tell
# which images should carry that label. We tried building a model to predict
# readability, but it performed poorly, so images labelled UNREADABLE are removed
# from the training and validation sets.

train = train.loc[(train['IDENTITY'] != 'UNREADABLE')]
val = val.loc[(val['IDENTITY'] != 'UNREADABLE')]
# The test set is not filtered; the line below is disabled.
# test = train.loc[(train['IDENTITY'] != 'UNREADABLE')]

In [70]:
# We use 100000 samples for training and 10000 for validation; the rationale is
# that this amount already covers the variability present in the dataset.
# Rows are taken from the top of each frame (no shuffling here).

max_data_train = 100000
max_data_val = 10000
train = train.iloc[:max_data_train]
val = val.iloc[:max_data_val]

In [71]:
# Report the split sizes after filtering and truncation.
print(f"Total training samples: {train.shape[0]}")
print(f"Total validation samples: {val.shape[0]}")

In [72]:
base_path = '../input/handwriting-recognition'

def get_image_paths_and_labels(df, status):
    """Collect image paths and labels for one split, skipping zero-byte files.

    Args:
        df: frame with ``FILENAME`` and ``IDENTITY`` columns.
        status: split name; images are looked up under
            ``<base_path>/<status>_v2/<status>``.

    Returns:
        ``(paths, labels)``: two parallel lists in the frame's row order,
        containing only the rows whose image file has a non-zero size.
    """
    split_dir = os.path.join(base_path, status + '_v2', status)
    file_names = df['FILENAME'].to_numpy()
    identities = df['IDENTITY'].to_numpy()

    paths, corrected_samples = [], []
    for file_name, identity in zip(file_names, identities):
        candidate = os.path.join(split_dir, file_name)
        # getsize() == 0 marks an empty file, which is left out.
        if not os.path.getsize(candidate):
            continue
        paths.append(candidate)
        corrected_samples.append(identity)

    return paths, corrected_samples
    

# Data Preparation

In [73]:
# Resolve image paths and labels for every split (zero-byte files are skipped).
train_img_paths, train_labels = get_image_paths_and_labels(train, 'train')
val_img_paths, val_labels = get_image_paths_and_labels(val, 'validation')
test_img_paths, test_labels = get_image_paths_and_labels(test, 'test')

In [74]:
# Character vocabulary: every distinct character seen in the training labels, sorted.
characters = sorted({char for label in train_labels for char in label})
print("Vocab size: ", len(characters))
print(characters)

In [75]:
# Build character vocabulary
# AUTOTUNE is passed to map() and prefetch() in the data pipeline below.
AUTOTUNE = tf.data.AUTOTUNE

# Convert characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Convert integers back to original characters.
# Built from char_to_num's own vocabulary so both lookups share the same index mapping.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

In [76]:
def distortion_free_resize(image, img_size):
    """Resize with aspect-preserving padding, then swap the two leading axes and mirror.

    Args:
        image: rank-3 image tensor accepted by ``tf.image.resize_with_pad``.
        img_size: ``(width, height)`` target size.

    Returns:
        The padded image after ``tf.transpose(perm=[1, 0, 2])`` followed by
        ``tf.image.flip_left_right``.
    """
    target_w, target_h = img_size
    padded = tf.image.resize_with_pad(image, target_h, target_w)
    swapped = tf.transpose(padded, perm=[1, 0, 2])
    return tf.image.flip_left_right(swapped)


# Data Pipeline

In [77]:
batch_size = 64  # samples per tf.data batch (see prepare_dataset)
padding_token = 99  # value used to right-pad label vectors (see vectorize_label)
image_width = 256  # target image width passed to distortion_free_resize
image_height = 64  # target image height passed to distortion_free_resize
max_len = 32  # padded label length


def preprocess_image(image_path, img_size=(image_width, image_height)):
    """Read an image file and return it as a resized float32 tensor divided by 255.

    Args:
        image_path: path of the image file.
        img_size: ``(width, height)`` handed to ``distortion_free_resize``.

    Returns:
        The single-channel image after ``distortion_free_resize``, cast to
        float32 and divided by 255.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_png(raw_bytes, 1)  # 1 -> decode to a single channel
    resized = distortion_free_resize(decoded, img_size)
    return tf.cast(resized, tf.float32) / 255.0


def vectorize_label(label):
    """Encode a label string as character ids, right-padded with ``padding_token``.

    Args:
        label: scalar string tensor (assumes at most ``max_len`` characters,
            otherwise the pad amount is negative — TODO confirm for val/test).

    Returns:
        1-D tensor of ``max_len`` ids.
    """
    chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    encoded = char_to_num(chars)
    missing = max_len - tf.shape(encoded)[0]
    return tf.pad(encoded, paddings=[[0, missing]], constant_values=padding_token)


def process_images_labels(image_path, label):
    """Turn one (path, label) pair into the ``{"image", "label"}`` dict the datasets yield."""
    return {
        "image": preprocess_image(image_path),
        "label": vectorize_label(label),
    }


def prepare_dataset(image_paths, labels):
    """Build the batched, cached and prefetched ``tf.data`` pipeline for one split.

    Args:
        image_paths: sequence of image file paths.
        labels: sequence of label strings, parallel to ``image_paths``.

    Returns:
        A dataset of ``{"image", "label"}`` batches of ``batch_size`` samples,
        in input order (no shuffling is applied).
    """
    pairs = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    samples = pairs.map(process_images_labels, num_parallel_calls=AUTOTUNE)
    batches = samples.batch(batch_size)
    return batches.cache().prefetch(AUTOTUNE)


In [78]:
# One batched tf.data pipeline per split.
train_ds = prepare_dataset(train_img_paths, train_labels)
val_ds = prepare_dataset(val_img_paths, val_labels)
test_ds = prepare_dataset(test_img_paths, test_labels)

In [79]:
# Display the dataset object's repr.
train_ds

In [80]:
# Sanity check: draw the first 16 samples of the first training batch with their decoded labels.
for data in train_ds.take(1):
    images, labels = data["image"], data["label"]

    _, ax = plt.subplots(4, 4, figsize=(15, 8))

    for i in range(16):
        img = images[i]
        # Undo the transpose + flip applied in distortion_free_resize so the image displays upright.
        img = tf.image.flip_left_right(img)
        img = tf.transpose(img, perm=[1, 0, 2])
        img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8)
        img = img[:, :, 0]

        # Gather indices where label!= padding_token.
        label = labels[i]
        indices = tf.gather(label, tf.where(tf.math.not_equal(label, padding_token)))
        # Convert to string.
        label = tf.strings.reduce_join(num_to_char(indices))
        label = label.numpy().decode("utf-8")

        ax[i // 4, i % 4].imshow(img, cmap="gray")
        ax[i // 4, i % 4].set_title(label)
        ax[i // 4, i % 4].axis("off")


plt.show()

# Build Model

In [81]:
# Materialise every validation batch for EditDistanceCallback.
# NOTE: this rebinds `val_labels` from the list of label strings returned by
# get_image_paths_and_labels to a list of batched label tensors.
val_images = []
val_labels = []

for batch in val_ds:
    val_images.append(batch["image"])
    val_labels.append(batch["label"])

In [82]:
#val_labels[:10]

In [83]:

def calculate_edit_distance(labels, predictions):
    """Mean unnormalised edit distance between greedy-decoded predictions and labels.

    Padding is excluded from both sides before scoring. The previous version
    used ``tf.sparse.from_dense``, which only drops zeros, so the label padding
    (``padding_token``) and the decoder padding (-1) were compared as if they
    were characters and inflated the reported distance.

    Args:
        labels: dense integer batch of label ids, right-padded with ``padding_token``.
        predictions: per-timestep class probabilities with batch on axis 0 and
            time on axis 1, as consumed by ``keras.backend.ctc_decode``.

    Returns:
        Scalar tensor: the batch mean of the per-sample edit distances.
    """

    def _to_sparse(dense, pad_value):
        # Keep only entries different from pad_value; tf.where yields the
        # indices in row-major order, as SparseTensor requires.
        dense = tf.cast(dense, tf.int64)
        keep = tf.where(tf.not_equal(dense, pad_value))
        return tf.SparseTensor(
            indices=keep,
            values=tf.gather_nd(dense, keep),
            dense_shape=tf.shape(dense, out_type=tf.int64),
        )

    sparse_labels = _to_sparse(labels, padding_token)

    # Every sample is decoded over the full number of time steps.
    input_len = np.ones(predictions.shape[0]) * predictions.shape[1]
    predictions_decoded = keras.backend.ctc_decode(
        predictions, input_length=input_len, greedy=True
    )[0][0][:, :max_len]
    sparse_predictions = _to_sparse(predictions_decoded, -1)

    # Compute individual edit distances and average them out.
    edit_distances = tf.edit_distance(
        sparse_predictions, sparse_labels, normalize=False
    )
    return tf.reduce_mean(edit_distances)


class EditDistanceCallback(keras.callbacks.Callback):
    """Keras callback that prints the mean validation edit distance after each epoch.

    Reads the module-level ``val_images`` / ``val_labels`` batch lists.
    """

    def __init__(self, pred_model):
        super().__init__()
        # Model whose predict() output is handed to calculate_edit_distance.
        self.prediction_model = pred_model

    def on_epoch_end(self, epoch, logs=None):
        """Score every cached validation batch and print the mean of the batch means."""
        per_batch = []
        for idx, batch_images in enumerate(val_images):
            batch_labels = val_labels[idx]
            batch_preds = self.prediction_model.predict(batch_images)
            score = calculate_edit_distance(batch_labels, batch_preds)
            per_batch.append(score.numpy())

        print(
            f"Mean edit distance for epoch {epoch + 1}: {np.mean(per_batch):.4f}"
        )


# **Model I**:
Dalam percobaan model pertama kami menggunakan 2 layer CNN, 1 layer dense, 2 layer LSTM, dan layer CTC.
Model ini terinspirasi dari dokumentasi keras.

In [87]:
class CTCLayer(keras.layers.Layer):
    """Layer that registers the CTC batch cost as a model loss and passes predictions through."""

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch via ``add_loss`` and return ``y_pred`` unchanged."""
        n_samples = tf.cast(tf.shape(y_true)[0], dtype="int64")
        n_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        n_labels = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Every sample is given the full time-step count and the full label width.
        column_of_ones = tf.ones(shape=(n_samples, 1), dtype="int64")
        input_length = n_steps * column_of_ones
        label_length = n_labels * column_of_ones

        self.add_loss(self.loss_fn(y_true, y_pred, input_length, label_length))
        return y_pred


def build_model1():
    """Build and compile Model I.

    Architecture: two conv + max-pool stages, a dense projection, two
    bidirectional LSTMs, a softmax layer and the CTC loss layer.

    Returns:
        A compiled ``keras.Model`` named ``handwriting_recognizer`` with the
        inputs ``image`` and ``label``; the loss is added by ``CTCLayer``.
    """

    def conv3x3(filters, layer_name):
        # All convolutions share kernel size, activation, initializer and padding.
        return keras.layers.Conv2D(
            filters,
            (3, 3),
            activation="relu",
            kernel_initializer="he_normal",
            padding="same",
            name=layer_name,
        )

    image_in = keras.Input(shape=(image_width, image_height, 1), name="image")
    label_in = keras.layers.Input(name="label", shape=(None,))

    features = conv3x3(32, "Conv1")(image_in)
    features = keras.layers.MaxPooling2D((2, 2), name="pool1")(features)
    features = conv3x3(64, "Conv2")(features)
    features = keras.layers.MaxPooling2D((2, 2), name="pool2")(features)

    # The last conv emits 64 channels and the two 2x2 poolings shrink each
    # spatial axis by 4, so the feature map is flattened into
    # (image_width // 4) steps of (image_height // 4) * 64 features.
    sequence_shape = ((image_width // 4), (image_height // 4) * 64)
    seq = keras.layers.Reshape(target_shape=sequence_shape, name="reshape")(features)
    seq = keras.layers.Dense(64, activation="relu", name="dense1")(seq)

    # Two stacked bidirectional LSTMs (128 then 64 units).
    for units in (128, 64):
        seq = keras.layers.Bidirectional(
            keras.layers.LSTM(units, return_sequences=True, dropout=0.25)
        )(seq)

    n_classes = len(char_to_num.get_vocabulary()) + 2
    probs = keras.layers.Dense(n_classes, activation="softmax", name="dense2")(seq)

    # The CTC layer adds the loss and passes the softmax output through.
    output = CTCLayer(name="ctc_loss")(label_in, probs)

    model = keras.models.Model(
        inputs=[image_in, label_in], outputs=output, name="handwriting_recognizer"
    )
    model.compile(optimizer=keras.optimizers.Adam())
    return model


# Instantiate Model I and print its layer summary.
model1 = build_model1()
model1.summary()

In [90]:
# Rebuild Model I (replacing the instance created for the summary) and train it.
epochs = 10
model1 = build_model1()
# Inference model: maps the "image" input to the "dense2" softmax output of model1.
prediction_model1 = keras.models.Model(
    model1.get_layer(name="image").input, model1.get_layer(name="dense2").output
)
edit_distance_callback = EditDistanceCallback(prediction_model1)

#Train the model.
history = model1.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[edit_distance_callback],
)

In [91]:
# Re-index every frame from 0; the evaluation cells below write predictions with
# label-based .loc slices that start at 0.
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

**Showing The Results of Model I**

In [94]:
def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of softmax outputs into a list of strings.

    Args:
        pred: prediction array with batch on axis 0 and time steps on axis 1.

    Returns:
        One decoded string per sample, in batch order.
    """
    # Every sample is decoded over the full number of time steps.
    seq_lengths = np.ones(pred.shape[0]) * pred.shape[1]
    decoded = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)[0][0]
    decoded = decoded[:, :max_len]

    def to_text(row):
        # Entries equal to -1 are dropped before mapping ids back to characters.
        kept = tf.gather(row, tf.where(tf.math.not_equal(row, -1)))
        return tf.strings.reduce_join(num_to_char(kept)).numpy().decode("utf-8")

    return [to_text(row) for row in decoded]


# Show Model I predictions for the first 16 images of the first test batch.
for batch in test_ds.take(1):
    batch_images = batch["image"]
    _, ax = plt.subplots(4, 4, figsize=(15, 8))

    preds = prediction_model1.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    for i in range(16):
        img = batch_images[i]
        # Undo the transpose + flip applied in distortion_free_resize so the image displays upright.
        img = tf.image.flip_left_right(img)
        img = tf.transpose(img, perm=[1, 0, 2])
        img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8)
        img = img[:, :, 0]

        title = f"Prediction: {pred_texts[i]}"
        ax[i // 4, i % 4].imshow(img, cmap="gray")
        ax[i // 4, i % 4].set_title(title)
        ax[i // 4, i % 4].axis("off")

plt.show()

**Model I Performance on Train Dataset**

In [95]:
# Score Model I on the first `number_of_batches` training batches with CER / WER.
# Assumes `train` has a fresh 0..n-1 index and the same row order as train_ds
# (holds only if no image was skipped as empty when the paths were collected).
train["PREDICTION"] = ""
number_of_batches = 100
start = 0
for batch in train_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model1.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    train.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `train`.
train_output = train.iloc[:start].copy()
train_output["CER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
train_output["WER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(train_output)
print("CER_train_average: ", train_output["CER"].mean())
print("WER_train_average: ", train_output["WER"].mean())

**Model I Performance on Validation Dataset**

In [96]:
# Score Model I on the first `number_of_batches` validation batches with CER / WER.
# Assumes `val` has a fresh 0..n-1 index and the same row order as val_ds
# (holds only if no image was skipped as empty when the paths were collected).
val["PREDICTION"] = ""
number_of_batches = 50
start = 0
for batch in val_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model1.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    val.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `val`.
val_output = val.iloc[:start].copy()
val_output["CER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
val_output["WER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(val_output)
print("CER_val_average: ", val_output["CER"].mean())
print("WER_val_average: ", val_output["WER"].mean())

**Penentuan Single Evaluation Metrics:**
Sebelum melakukan model tuning, kami menentukan single evaluation metrics yang akan digunakan untuk model kami.
Berdasarkan berbagai literatur, CER & WER paling banyak digunakan untuk text recognition, dimana CER digunakan untuk menilai model dengan jumlah karakter sedikit, sedangkan WER digunakan untuk tulisan yang mengandung banyak kata2 (Panjang).
Karena model ini memprediksi jumlah karakter yang rata-rata jumlahnya kurang dari 20 karakter, maka kami akan menggunakan CER sebagai single evaluation Metrics.

**Penentuan Unreducable Bias:**
Sebelum melakukan analisis terhadap model, kami menentukan irreducible bias (error yang tidak dapat dikurangi) terlebih dahulu. Kami melakukan pengecekan terhadap 300 gambar, dan didapat bahwa rata-rata terdapat 2-3 gambar yang unreadable. Oleh karena itu, kami menetapkan irreducible bias sebesar 1%.

**Performance Model I Analysis:**
Kami akan menggunakan teknik Orthogonalization untuk melakukan tuning pada model. Model I memiliki CER pada training sebesar 6.1% dan CER pada validation sebesar 6.8%. Selisih yang besar antara error training (6.1%) dan irreducible bias (1%) menandakan bahwa model masih mengalami underfitting. Oleh karena itu, kami akan fokus membenahi masalah underfitting terlebih dahulu.



# **Model II**
karena model I masih memiliki masalah underfitting, kami akan mencoba mengatasi masalah underfitting tersebut dengan menambah CNN layer menjadi 4 layer, dan dense layer menjadi 2 layer, dan menambah epoch menjadi 25

In [97]:
# NOTE: this re-declares CTCLayer with the same body as the definition in the
# Model I section; the later definition shadows the earlier one in the kernel.
class CTCLayer(keras.layers.Layer):
    """Layer that registers the CTC batch cost as a model loss and passes predictions through."""

    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch via ``add_loss`` and return ``y_pred`` unchanged."""
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Broadcast the scalar lengths to one (batch, 1) column each.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred


def build_model2():
    """Build and compile Model II.

    Architecture: four convolutions (max-pooling after the second and third),
    two dense layers, two bidirectional LSTMs, a softmax layer and the CTC
    loss layer.

    Returns:
        A compiled ``keras.Model`` named ``handwriting_recognizer`` with the
        inputs ``image`` and ``label``; the loss is added by ``CTCLayer``.
    """

    def conv3x3(filters, layer_name):
        # All convolutions share kernel size, activation, initializer and padding.
        return keras.layers.Conv2D(
            filters,
            (3, 3),
            activation="relu",
            kernel_initializer="he_normal",
            padding="same",
            name=layer_name,
        )

    image_in = keras.Input(shape=(image_width, image_height, 1), name="image")
    label_in = keras.layers.Input(name="label", shape=(None,))

    features = conv3x3(32, "Conv1")(image_in)
    features = conv3x3(32, "Conv2")(features)
    features = keras.layers.MaxPooling2D((2, 2), name="pool1")(features)
    features = conv3x3(64, "Conv3")(features)
    features = keras.layers.MaxPooling2D((2, 2), name="pool2")(features)
    features = conv3x3(64, "Conv4")(features)

    # The last conv emits 64 channels and the two 2x2 poolings shrink each
    # spatial axis by 4, so the feature map is flattened into
    # (image_width // 4) steps of (image_height // 4) * 64 features.
    sequence_shape = ((image_width // 4), (image_height // 4) * 64)
    seq = keras.layers.Reshape(target_shape=sequence_shape, name="reshape")(features)
    seq = keras.layers.Dense(256, activation="relu", name="dense3")(seq)
    seq = keras.layers.Dense(64, activation="relu", name="dense1")(seq)

    # Two stacked bidirectional LSTMs (128 then 64 units).
    for units in (128, 64):
        seq = keras.layers.Bidirectional(
            keras.layers.LSTM(units, return_sequences=True, dropout=0.25)
        )(seq)

    n_classes = len(char_to_num.get_vocabulary()) + 2
    probs = keras.layers.Dense(n_classes, activation="softmax", name="dense2")(seq)

    # The CTC layer adds the loss and passes the softmax output through.
    output = CTCLayer(name="ctc_loss")(label_in, probs)

    model = keras.models.Model(
        inputs=[image_in, label_in], outputs=output, name="handwriting_recognizer"
    )
    model.compile(optimizer=keras.optimizers.Adam())
    return model

In [98]:
# Instantiate Model II and print its layer summary.
model2 = build_model2()
model2.summary()

In [100]:
# Rebuild Model II (replacing the instance created for the summary) and train it.
epochs = 25
model2 = build_model2()
# Inference model: maps the "image" input to the "dense2" softmax output of model2.
prediction_model2 = keras.models.Model(
    model2.get_layer(name="image").input, model2.get_layer(name="dense2").output
)
edit_distance_callback = EditDistanceCallback(prediction_model2)

#Train the model.
history = model2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[edit_distance_callback],
)

In [106]:
# NOTE: this re-declares decode_batch_predictions with the same body as the
# definition in the Model I section.
def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of softmax outputs into a list of strings."""
    # Every sample is decoded over the full number of time steps.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
        :, :max_len
    ]
    # Iterate over the results
    output_text = []
    for res in results:
        # Entries equal to -1 are dropped before mapping ids back to characters.
        res = tf.gather(res, tf.where(tf.math.not_equal(res, -1)))
        res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
        output_text.append(res)
    return output_text


# Show Model II predictions for the first 16 images of the first test batch.
for batch in test_ds.take(1):
    batch_images = batch["image"]
    _, ax = plt.subplots(4, 4, figsize=(15, 8))

    preds = prediction_model2.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    for i in range(16):
        img = batch_images[i]
        # Undo the transpose + flip applied in distortion_free_resize so the image displays upright.
        img = tf.image.flip_left_right(img)
        img = tf.transpose(img, perm=[1, 0, 2])
        img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8)
        img = img[:, :, 0]

        title = f"Prediction: {pred_texts[i]}"
        ax[i // 4, i % 4].imshow(img, cmap="gray")
        ax[i // 4, i % 4].set_title(title)
        ax[i // 4, i % 4].axis("off")

plt.show()

**Model II Performance on Train Dataset**

In [104]:
# Score Model II on the first `number_of_batches` training batches with CER / WER.
# Assumes `train` has a fresh 0..n-1 index and the same row order as train_ds
# (holds only if no image was skipped as empty when the paths were collected).
train["PREDICTION"] = ""
number_of_batches = 100
start = 0
for batch in train_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model2.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    train.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `train`.
train_output = train.iloc[:start].copy()
train_output["CER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
train_output["WER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(train_output)
print("CER_train_average: ", train_output["CER"].mean())
print("WER_train_average: ", train_output["WER"].mean())

**Model II Performance on Validation Dataset**

In [103]:
# Score Model II on the first `number_of_batches` validation batches with CER / WER.
# Assumes `val` has a fresh 0..n-1 index and the same row order as val_ds
# (holds only if no image was skipped as empty when the paths were collected).
val["PREDICTION"] = ""
number_of_batches = 50
start = 0
for batch in val_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model2.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    val.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `val`.
val_output = val.iloc[:start].copy()
val_output["CER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
val_output["WER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(val_output)
print("CER_val_average: ", val_output["CER"].mean())
print("WER_val_average: ", val_output["WER"].mean())

**Analisis:**
Berdasarkan performa Model II, kita dapat melihat bahwa Model II mampu menurunkan CER pada training dataset menjadi 2.93%. Namun, CER pada validation dataset masih berada di angka 5.67%. Selisih antara keduanya mengindikasikan bahwa model mengalami overfitting. Oleh karena itu, pada model selanjutnya kita akan berfokus menangani masalah overfitting tersebut.

# **Model III**
Pada model ini, kita akan mengurangi masalah overfitting dengan menambahkan layer drop out.

In [126]:

class CTCLayer(keras.layers.Layer):
    """Layer that registers the CTC batch cost as a model loss and passes predictions through.

    Args:
        name: optional layer name.
        **kwargs: forwarded to ``keras.layers.Layer`` (e.g. ``trainable``,
            ``dtype``). Keras passes these when it rebuilds a layer from its
            saved config, so accepting them lets a saved model that contains
            this layer be reloaded with ``custom_objects={"CTCLayer": CTCLayer}``.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch via ``add_loss`` and return ``y_pred`` unchanged."""
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Broadcast the scalar lengths to one (batch, 1) column each.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        return y_pred


def build_model():
    """Build and compile Model III (Model II's layout plus dropout layers).

    Architecture: four convolutions (max-pooling after the second and third),
    two dense layers, two bidirectional LSTMs, a softmax layer and the CTC
    loss layer, with ``Dropout(0.2)`` after pool1, Conv4, dense3 and dense1.

    Returns:
        A compiled ``keras.Model`` named ``handwriting_recognizer`` with the
        inputs ``image`` and ``label``; the loss is added by ``CTCLayer``.
    """

    def conv3x3(filters, layer_name):
        # All convolutions share kernel size, activation, initializer and padding.
        return keras.layers.Conv2D(
            filters,
            (3, 3),
            activation="relu",
            kernel_initializer="he_normal",
            padding="same",
            name=layer_name,
        )

    image_in = keras.Input(shape=(image_width, image_height, 1), name="image")
    label_in = keras.layers.Input(name="label", shape=(None,))

    features = conv3x3(32, "Conv1")(image_in)
    features = conv3x3(32, "Conv2")(features)
    features = keras.layers.MaxPooling2D((2, 2), name="pool1")(features)
    features = keras.layers.Dropout(0.2)(features)
    features = conv3x3(64, "Conv3")(features)
    features = keras.layers.MaxPooling2D((2, 2), name="pool2")(features)
    features = conv3x3(64, "Conv4")(features)
    features = keras.layers.Dropout(0.2)(features)

    # The last conv emits 64 channels and the two 2x2 poolings shrink each
    # spatial axis by 4, so the feature map is flattened into
    # (image_width // 4) steps of (image_height // 4) * 64 features.
    sequence_shape = ((image_width // 4), (image_height // 4) * 64)
    seq = keras.layers.Reshape(target_shape=sequence_shape, name="reshape")(features)
    seq = keras.layers.Dense(256, activation="relu", name="dense3")(seq)
    seq = keras.layers.Dropout(0.2)(seq)
    seq = keras.layers.Dense(64, activation="relu", name="dense1")(seq)
    seq = keras.layers.Dropout(0.2)(seq)

    # Two stacked bidirectional LSTMs (128 then 64 units).
    for units in (128, 64):
        seq = keras.layers.Bidirectional(
            keras.layers.LSTM(units, return_sequences=True, dropout=0.25)
        )(seq)

    n_classes = len(char_to_num.get_vocabulary()) + 2
    probs = keras.layers.Dense(n_classes, activation="softmax", name="dense2")(seq)

    # The CTC layer adds the loss and passes the softmax output through.
    output = CTCLayer(name="ctc_loss")(label_in, probs)

    model = keras.models.Model(
        inputs=[image_in, label_in], outputs=output, name="handwriting_recognizer"
    )
    model.compile(optimizer=keras.optimizers.Adam())
    return model


# Instantiate Model III and print its layer summary.
model = build_model()
model.summary()

In [127]:
# Rebuild Model III (replacing the instance created for the summary) and train it.
epochs = 25
model = build_model()
# Inference model: maps the "image" input to the "dense2" softmax output of model.
prediction_model = keras.models.Model(
    model.get_layer(name="image").input, model.get_layer(name="dense2").output
)
edit_distance_callback = EditDistanceCallback(prediction_model)

#Train the model.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[edit_distance_callback],
)

In [132]:
# Continue training the same model instance for 5 more epochs.
# NOTE: `history` is overwritten, so the first 25 epochs' history is no longer referenced.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
    callbacks=[edit_distance_callback],
)

In [None]:
# Save the training model (which contains the custom CTCLayer) to an HDF5 file.
model.save("my_model_v3.h5")

**Showing The Results of Model III**

In [160]:
# NOTE: this re-declares decode_batch_predictions with the same body as the
# definitions in the Model I and Model II sections.
def decode_batch_predictions(pred):
    """Greedy-CTC-decode a batch of softmax outputs into a list of strings."""
    # Every sample is decoded over the full number of time steps.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search
    results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
        :, :max_len
    ]
    # Iterate over the results
    output_text = []
    for res in results:
        # Entries equal to -1 are dropped before mapping ids back to characters.
        res = tf.gather(res, tf.where(tf.math.not_equal(res, -1)))
        res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
        output_text.append(res)
    return output_text


# Show Model III predictions for the first 16 images of each of the first two
# test batches (one 4x4 figure per batch).
for batch in test_ds.take(2):
    batch_images = batch["image"]
    _, ax = plt.subplots(4, 4, figsize=(15, 8))

    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)

    for i in range(16):
        img = batch_images[i]
        # Undo the transpose + flip applied in distortion_free_resize so the image displays upright.
        img = tf.image.flip_left_right(img)
        img = tf.transpose(img, perm=[1, 0, 2])
        img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8)
        img = img[:, :, 0]

        title = f"Prediction: {pred_texts[i]}"
        ax[i // 4, i % 4].imshow(img, cmap="gray")
        ax[i // 4, i % 4].set_title(title)
        ax[i // 4, i % 4].axis("off")

plt.show()

# Model Performance

**Model Performance on Train Dataset**

In [134]:
# Re-index every frame from 0; the evaluation cells below write predictions with
# label-based .loc slices that start at 0.
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

In [139]:
# Score Model III on the first `number_of_batches` training batches with CER / WER.
# Assumes `train` has a fresh 0..n-1 index and the same row order as train_ds
# (holds only if no image was skipped as empty when the paths were collected).
train["PREDICTION"] = ""
number_of_batches = 100
start = 0
for batch in train_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    train.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `train`.
train_output = train.iloc[:start].copy()
train_output["CER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
train_output["WER"] = train_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(train_output)
print("CER_train_average: ", train_output["CER"].mean())
print("WER_train_average: ", train_output["WER"].mean())

**Model Performance on Validation Dataset**

In [136]:
# Score Model III on the first `number_of_batches` validation batches with CER / WER.
# Assumes `val` has a fresh 0..n-1 index and the same row order as val_ds
# (holds only if no image was skipped as empty when the paths were collected).
val["PREDICTION"] = ""
number_of_batches = 50
start = 0
for batch in val_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    val.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `val`.
val_output = val.iloc[:start].copy()
val_output["CER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
val_output["WER"] = val_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(val_output)
# These are validation metrics; the labels previously read "CER_test_average" / "WER_test_average".
print("CER_val_average: ", val_output["CER"].mean())
print("WER_val_average: ", val_output["WER"].mean())

**Model Performance on Test Dataset**

In [140]:
# Score Model III on the first `number_of_batches` test batches with CER / WER.
# Assumes `test` has a fresh 0..n-1 index and the same row order as test_ds
# (holds only if no image was skipped as empty when the paths were collected).
test["PREDICTION"] = ""
number_of_batches = 50
start = 0
for batch in test_ds.take(number_of_batches):
    batch_images = batch["image"]
    preds = prediction_model.predict(batch_images)
    pred_texts = decode_batch_predictions(preds)
    # .loc slices are label-based and end-inclusive; the slice is sized from the batch itself.
    test.loc[start:start + len(pred_texts) - 1, "PREDICTION"] = pred_texts
    start += len(pred_texts)
# Independent copy of the scored rows, so the metric columns are not written into a slice of `test`.
test_output = test.iloc[:start].copy()
test_output["CER"] = test_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=True), axis = 1)
test_output["WER"] = test_output.apply(lambda x: fastwer.score_sent(x["PREDICTION"], x["IDENTITY"], char_level=False), axis = 1)
display(test_output)
print("CER_test_average: ", test_output["CER"].mean())
print("WER_test_average: ", test_output["WER"].mean())

**Analisis:**
Berdasarkan hasil di atas, Model III memiliki CER sebesar 3% untuk training set dan sekitar 4% untuk validation dan testing set. Standar CER untuk OCR adalah: bagus apabila sekitar 1-2%, rata-rata apabila sekitar 2-10%, dan kurang apabila lebih dari 10%. Mengingat OCR ini melibatkan karakter tulisan tangan yang kadang susah dikenali bahkan oleh manusia, maka kami merasa error rate tersebut dapat diterima dan model siap di-deploy.

# Error Analysis

In [143]:
# Rank the scored test rows from highest to lowest CER and show the 20 worst.
test_highest_error = test_output.sort_values(by="CER", ascending = False)
test_highest_error.reset_index(drop=True, inplace=True)
test_highest_error.head(20)

In [159]:
# Plot the 18 test images with the highest CER in a 6x3 grid, titled with their ground-truth label.
plt.figure(figsize=(15, 20))

for i in range(18):
    ax = plt.subplot(6, 3, i+1)
    img_dir = '../input/handwriting-recognition/test_v2/test/'+test_highest_error.loc[i, 'FILENAME']
    image = cv2.imread(img_dir, cv2.IMREAD_GRAYSCALE)
    plt.imshow(image, cmap = 'gray')
    plt.title(test_highest_error.loc[i, 'IDENTITY'], fontsize=12)
    plt.axis('off')

plt.subplots_adjust(wspace=0.2, hspace=-0.8)

**Analisis:**
Terlihat bahwa kebanyakan gambar yang salah terprediksi dengan nilai CER besar karena memang tulisan tersebut sulit dibaca oleh mata manusia atau karena gambar tersebut terjadi kesalahan dalam pelabelan