# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import cv2
import os
import sys
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import CTC

In [None]:
# # Tokenize the words
# tokenizer = Tokenizer()  # Character-level tokenizer
# tokenizer.fit_on_texts(df['word'])
# word_sequences = tokenizer.texts_to_sequences(df['word'])
# # word_sequences = df['word'].values
# print(word_sequences)
# print(type(word_sequences))
# word_padded = pad_sequences(word_sequences, maxlen=max_word_length, padding='post')

# Helpers

In [None]:
def prepare_dataset(df, max_word_length, batch_size):
    """Build a shuffled, batched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        df: DataFrame with a ``filename`` column (paths to PNG files) and a
            ``word`` column (the transcription of each image).
        max_word_length: Length every label sequence is padded/truncated to.
        batch_size: Number of examples per batch.

    Returns:
        A ``(dataset, tokenizer)`` tuple. ``dataset`` yields batches of
        ``(image, label)``: ``image`` has shape ``(batch, 128, 32, 1)`` with
        values scaled to [0, 1] and axis 1 running along the image width;
        ``label`` is a dense integer tensor of shape
        ``(batch, max_word_length)`` holding character code points,
        zero-padded on the right. ``tokenizer`` is an *unfitted*
        character-level ``Tokenizer`` that is returned only to keep the
        original return signature; it plays no part in the encoding.
    """
    # Never fitted or used for encoding below; kept for interface stability.
    tokenizer = Tokenizer(char_level=True)

    # Encode every word as the list of its character code points.
    word_sequences = tf.strings.unicode_decode(df['word'].values, 'ASCII').to_list()

    # Labels stay dense: Keras' CTC loss takes a dense (batch, max_length)
    # integer tensor in which 0 is the blank/mask value, which is exactly what
    # zero post-padding produces. (The labels were previously converted to a
    # SparseTensor, which does not match that contract.)
    word_padded = pad_sequences(word_sequences, maxlen=max_word_length, padding='post')

    def process_row(filepath, word):
        """Load one PNG as a normalized (128, 32, 1) tensor, width-major."""
        image = tf.io.read_file(filepath)
        image = tf.image.decode_png(image, channels=1)  # grayscale
        # tf.image.resize takes [height, width]: resize to 32 px high by
        # 128 px wide, then swap the two spatial axes so the leading axis
        # follows the writing direction. The result keeps the (128, 32, 1)
        # shape callers already rely on, without squashing wide word images
        # into a tall, narrow frame.
        image = tf.image.resize(image, [32, 128])
        image = tf.transpose(image, perm=[1, 0, 2])
        image = image / 255.0  # scale pixel values to [0, 1]
        return image, word

    filepaths = df['filename'].values
    dataset = tf.data.Dataset.from_tensor_slices((filepaths, word_padded))
    dataset = dataset.map(process_row, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=128).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    # NOTE(review): unreadable/corrupt PNGs still raise when the dataset is
    # iterated; nothing here filters them out.

    return dataset, tokenizer

In [None]:
def build_handwriting_recognition_model(input_shape, num_classes, output_activation="softmax"):
    """Build a CNN + bidirectional-LSTM sequence model for word images.

    Args:
        input_shape: Shape of a single input image, ``(dim1, dim2, channels)``,
            with fully defined spatial dimensions. ``dim1`` becomes the
            sequence (time) axis of the recurrent layers.
        num_classes: Number of character classes. The output layer has
            ``num_classes + 1`` units to leave room for the CTC blank.
        output_activation: Activation of the final ``Dense`` layer. Defaults
            to ``"softmax"`` (the original behaviour). Pass ``None`` to emit
            raw logits, which is what ``keras.losses.CTC`` / ``loss='ctc'``
            is documented to expect.

    Returns:
        A ``Model`` named ``"handwriting_recognition_model"`` mapping an image
        batch to per-step class scores of shape
        ``(batch, dim1 // 8, num_classes + 1)``.
    """
    # Input layer
    inputs = layers.Input(shape=input_shape, name="image")

    # Convolutional feature extractor: three conv + 2x2 max-pool stages, each
    # halving both spatial dimensions (overall down-sampling by 8).
    x = inputs
    for filters in (32, 64, 128):
        x = layers.Conv2D(filters, (3, 3), activation="relu", padding="same")(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # Collapse the last two axes (second spatial dim x channels) into a single
    # feature vector per step, leaving the first spatial axis as the sequence.
    x = layers.Reshape((-1, x.shape[-1] * x.shape[-2]))(x)

    # Recurrent layers (sequence modeling)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

    # Per-step class scores; +1 for the CTC blank token.
    x = layers.Dense(num_classes + 1, activation=output_activation)(x)

    return Model(inputs, x, name="handwriting_recognition_model")

# Read data

In [None]:
def _word_image_path(word_id):
    """Return the PNG path for a word id shaped like ``a01-000u-00-00``."""
    parts = word_id.split('-')
    # Images are stored as <form>/<form>-<page>/<word_id>.png. The pieces are
    # computed outside the f-string because reusing the enclosing quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    form, page = parts[0], parts[1]
    return f'iam_words/words/{form}/{form}-{page}/{word_id}.png'


# Parse the word index: one record per line, space-separated, with the word id
# in the first field and the transcription in the last.
with open('iam_words/words.txt', 'r') as f:
    words = f.readlines()
words = [word.strip() for word in words]
# Skip the first 18 lines and the final line.
# NOTE(review): presumably an 18-line header; dropping the last line also
# discards a real record unless the file ends with a stray line - confirm.
words = words[18:-1]
# Drop records flagged as segmentation errors.
words = [w for w in words if ' err ' not in w]
words = [[w.split(' ')[0], w.split(' ')[-1]] for w in words]
words = [[_word_image_path(w), y] for w, y in words]
df = pd.DataFrame(words, columns=['filename', 'word'])
# Keep only records whose image file actually exists on disk.
df = df[df['filename'].apply(os.path.exists)]
# df['word'] = df['word'].apply(lambda x: [np.int32(ord(a)) for a in x])
df

In [None]:
# Work on the first 1000 records only, to keep experiments fast.
df = df.head(1000)

In [None]:
# Build the input pipeline: labels padded to 32 entries, 32 examples per batch.
dataset, tokenizer = prepare_dataset(
    df,
    batch_size=32,
    max_word_length=32,
)

In [None]:
# Per-example image shape (batch axis stripped); element_spec is the same for
# the full dataset as for any `take` of it.
dataset.element_spec[0].shape[1:]

In [None]:
# Display the repr of a one-batch view of the dataset (nothing is read yet).
dataset.take(1)

In [None]:
# Derive the per-example image shape from the dataset, then build and describe
# the recognition model with 256 character classes.
image_shape = dataset.element_spec[0].shape[1:]
model = build_handwriting_recognition_model(
    num_classes=256,
    input_shape=image_shape,
)
model.summary()

In [None]:
# Inline copy of the recognition model, with the intermediate tensors printed
# so the effect of the reshape can be inspected before training.
inputs = layers.Input(shape=dataset.take(1).element_spec[0].shape[1:], name="image")

# Convolutional layers (feature extraction): each conv + pool stage halves
# both spatial dimensions.
x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(inputs)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation="relu", padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(128, (3, 3), activation="relu", padding="same")(x)
x = layers.MaxPooling2D((2, 2))(x)

print(x)
# Reshape for RNN: merge the last two axes into one feature vector per step.
x = layers.Reshape((-1, x.shape[-1] * x.shape[-2]))(x)
print(x)

# Recurrent layers (sequence modeling)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)

# Dense layer (per-step class scores), +1 for the CTC blank token.
# No softmax here: the model is trained with loss='ctc' below, and Keras' CTC
# loss expects raw logits that have NOT been normalized via softmax.
x = layers.Dense(256 + 1)(x)

# Define the model
model = Model(inputs, x, name="handwriting_recognition_model")

model.compile(optimizer='adam', loss='ctc')
history = model.fit(dataset, epochs=1)

In [None]:
# (Re)compile the current model with Adam + CTC loss and train for one epoch,
# keeping the training history for plotting.
model.compile(loss='ctc', optimizer='adam')
history = model.fit(
    dataset,
    epochs=1,
)

In [None]:
# Plot the training loss recorded for each epoch.
loss_per_epoch = history.history['loss']
plt.plot(loss_per_epoch)

In [None]:
# Pull a single batch and print the shapes of its image and label tensors.
batch = dataset.take(1)
print(batch)
# `take(1)` yields at most one element, so the loop body runs at most once.
for x, y in batch:
    for tensor in (x, y):
        print(tensor.shape)

In [None]:
# Smoke test: read up to 150 batches and report the first error encountered.
first_batches = dataset.take(150)
try:
    for x, y in first_batches:
        continue
except Exception as e:
    print(f"Error: {e}")

In [None]:
# Probe dataset elements 0..119 one at a time and report those that fail to
# load. A `for` loop guarantees progress: previously a failure left `i`
# unchanged, so the first bad element was retried (and reported) forever, and
# the bare `except:` also swallowed KeyboardInterrupt.
# NOTE(review): `dataset` is shuffled and batched by prepare_dataset, so
# element `i` here is a batch drawn in shuffled order, not row `i` of `df`;
# the filename printed below probably does not identify the offending file.
for i in range(120):
    try:
        for x, y in dataset.skip(i).take(1):
            pass
    except Exception:
        print(f"Wrong {i} {df.iloc[i]['filename']}")

In [None]:
# Rebuild the pipeline from every row of df, then print a running batch index
# while reading up to 150 batches (the last index printed shows how far
# iteration got).
tmpdf = df.iloc[:]
print(tmpdf)
dataset, tokenizer = prepare_dataset(tmpdf, batch_size=32, max_word_length=32)
i = 0
for x, y in dataset.take(150):
    print(i, end=' ')
    i = i + 1

In [None]:
# Same check as above, but skipping the first 100 rows of df; the printed
# counter starts at 100 and advances once per batch read.
tmpdf = df.iloc[100:]
print(tmpdf)
dataset, tokenizer = prepare_dataset(tmpdf, batch_size=32, max_word_length=32)
i = 100
for x, y in dataset.take(150):
    print(i, end=' ')
    i = i + 1