<a href="https://colab.research.google.com/github/hejazn86/OCR_Persian_textDetecter/blob/master/OCR_main_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/drive')


Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [6]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from pathlib import Path
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import csv

In [8]:
path = '/drive/My Drive/datasets'
words_path = '/drive/My Drive/datasets/words/words.csv'

# Get lists of all the images, the labels, and the characters
def loadImage(path):
  image_files = sorted([os.path.join(path, 'images_with_persian_text', file) for file in os.listdir(path + '/images_with_persian_text') if file.endswith(".png")])
  return image_files

images = loadImage(path)


labels = []
with open(words_path) as csvfile:
  reader = csv.reader(csvfile)
  next(reader)
  for i, row in enumerate(reader):
    labels.append(row[0])

characters = set(char for label in labels for char in label)

#print(labels[0])
#print(images[10])
#img = cv2.imread(images[0], cv2.IMREAD_UNCHANGED)
#plt.imshow(img)

print("Number of images: ", len(images))
print("Number of labels: ", len(labels))
print("Number of unique: ", len(characters))
print("Characters present: ", characters)

# Batch size for training and validation
batch_size = 20

# Desired image dimensions
img_width = 200
img_height = 50

'''Factor by which the image is going to be downsampled
 by the convolutional blocks. We will be using two
 convolution blocks and each block will have
 a pooling layer which downsample the features by a factor of 2.
 Hence total downsampling factor would be 4.'''

downsample_factor = 4

# Maximum length of any captcha in the dataset
max_length = max([len(label) for label in labels])


Number of images:  30000
Number of labels:  30000
Number of unique:  36
Characters present:  {'ژ', 'أ', 'ق', 'ه', 'گ', 'ص', 'ء', 'غ', 'ل', 'م', 'ج', 'ف', 'ب', 'ا', 'ر', 'س', 'ع', '\u200c', 'و', 'ن', 'ز', 'ذ', 'ض', 'آ', 'ط', 'خ', 'ش', 'چ', 'ت', 'د', 'ظ', 'ی', 'ث', 'ح', 'پ', 'ک'}


In [9]:
# Mapping characters to integers
char_to_num = layers.experimental.preprocessing.StringLookup(
    vocabulary=list(characters), num_oov_indices=0, mask_token=None
)

# Mapping integers back to original characters
num_to_char = layers.experimental.preprocessing.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)


def split_data(images, labels, train_size=0.7, shuffle=True):
    # 1. Get the total size of the dataset
    size = len(images)
    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        np.random.shuffle(indices)
    # 3. Get the size of training samples
    train_samples = int(size * train_size)
    # 4. Split data into training and validation sets
    x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
    x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
    return x_train, x_valid, y_train, y_valid


# Splitting data into training and validation sets
x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels))


def encode_single_sample(img_path, label):
    # 1. Read image
    img = tf.io.read_file(img_path)
    # 2. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1)
    # 3. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Resize to the desired size
    img = tf.image.resize(img, [img_height, img_width])
    # 5. Transpose the image because we want the time
    # dimension correspond to the width of the image.
    img = tf.transpose(img, perm=[1, 0, 2])
    # 6. Map the characters in label to numbers
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    # 7. Return a dict as our model is expecting two inputs
    return {"image": img, "label": label}

print(len(x_train))
print(len(x_valid))


21000
9000


In [10]:
# creating the datasets
def create_dataset(image_data, label_data):
  dataset = tf.data.Dataset.from_tensor_slices((image_data, label_data))
  dataset = (dataset.map(encode_single_sample, num_parallel_calls=1000))
  dataset = dataset.batch(batch_size)
  dataset = dataset.prefetch(buffer_size=1000)
  return dataset

#validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
#validation_dataset = (validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE))
#validation_dataset = validation_dataset.batch(batch_size)
#validation_dataset = validation_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

train_dataset = create_dataset(x_train, y_train)
validation_dataset = create_dataset(x_valid, y_valid)

print(len(train_dataset))
print(len(validation_dataset))

1050
450


In [None]:
figure = figsize=(10, 5)
for batch in train_dataset.take(1):
    images = batch["image"]
    labels = batch["label"]
    img = (images[0] * 255).numpy().astype("uint8")
    label = tf.strings.reduce_join(num_to_char(labels[0])).numpy().decode("utf-8")
    ax[0].imshow(img[:, :, 0].T, cmap="gray")
    ax[0].set_title(label)
    ax[0].axis("off")
plt.show()

In [13]:
# The model 


class CTCLayer(layers.Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred


def build_model():
    # Inputs to the model
    input_img = layers.Input(
        shape=(img_width, img_height, 1), name="image", dtype="float32"
    )
    labels = layers.Input(name="label", shape=(None,), dtype="float32")

    # First conv block
    x = layers.Conv2D(
        32,
        (3, 3),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)
    x = layers.MaxPooling2D((2, 2), name="pool1")(x)

    # Second conv block
    x = layers.Conv2D(
        64,
        (3, 3),
        activation="relu",
        kernel_initializer="he_normal",
        padding="same",
        name="Conv2",
    )(x)
    x = layers.MaxPooling2D((2, 2), name="pool2")(x)

    # We have used two max pool with pool size and strides 2.
    # Hence, downsampled feature maps are 4x smaller. The number of
    # filters in the last layer is 64. Reshape accordingly before
    # passing the output to the RNN part of the model
    new_shape = ((img_width // 4), (img_height // 4) * 64)
    x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
    x = layers.Dense(64, activation="relu", name="dense1")(x)
    x = layers.Dropout(0.2)(x)

    # RNNs
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)

    # Output layer
    x = layers.Dense(len(characters) + 1, activation="softmax", name="dense2")(x)

    # Add CTC layer for calculating CTC loss at each step
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
    )
    # Optimizer
    opt = keras.optimizers.Adam()
    # Compile the model and return
    model.compile(optimizer=opt)
    return model


# Get the model
model = build_model()
model.summary()

Model: "ocr_model_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              [(None, 200, 50, 1)] 0                                            
__________________________________________________________________________________________________
Conv1 (Conv2D)                  (None, 200, 50, 32)  320         image[0][0]                      
__________________________________________________________________________________________________
pool1 (MaxPooling2D)            (None, 100, 25, 32)  0           Conv1[0][0]                      
__________________________________________________________________________________________________
Conv2 (Conv2D)                  (None, 100, 25, 64)  18496       pool1[0][0]                      
_______________________________________________________________________________________

In [14]:
# Training the model
epochs = 100
early_stopping_patience = 10
# Add early stopping
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
)

# Train the model
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=epochs,
    callbacks=[early_stopping],
)

Epoch 1/100


InvalidArgumentError: ignored