diff --git a/examples/vision/captcha_ocr.py b/examples/vision/captcha_ocr.py
new file mode 100644
index 0000000000..70aa65a80a
--- /dev/null
+++ b/examples/vision/captcha_ocr.py
@@ -0,0 +1,340 @@
+"""
+Title: OCR model for reading captcha
+Author: [A_K_Nain](https://twitter.com/A_K_Nain)
+Date created: 2020/06/14
+Last modified: 2020/06/14
+Description: How to implement an OCR model using CNNs, RNNs and CTC loss.
+"""
+
+"""
+## Introduction
+
+This example demonstrates a simple OCR model built with the Functional API. Apart from
+combining CNNs and RNNs, it also illustrates how you can instantiate a new layer
+and use it as an `Endpoint` layer for implementing CTC loss. For a detailed
+description of layer subclassing, please check out this
+[example](https://keras.io/guides/making_new_layers_and_models_via_subclassing/#the-addmetric-method)
+in the developer guides.
+"""
+
+"""
+## Setup
+"""
+
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pathlib import Path
+from collections import Counter
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+
+
+"""
+## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
+Let's download the data.
+"""
+
+
+"""shell
+curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
+unzip -qq captcha_images_v2.zip
+"""
+
+
+"""
+The dataset contains 1040 captcha files as PNG images. The label for each sample is a
+string: the name of the file (excluding the `.png` extension). We will map each
+character in the string to an integer for training the model. Similarly, we will
+need to map the predictions of the model back to strings. For this purpose, we will
+maintain two lookup tables, mapping characters to integers and integers back to
+characters, respectively.
+"""
+
+
+# Path to the data directory
+data_dir = Path("./captcha_images_v2/")
+
+# Get list of all the images
+images = sorted(list(map(str, list(data_dir.glob("*.png")))))
+labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in images]
+characters = set(char for label in labels for char in label)
+
+print("Number of images found: ", len(images))
+print("Number of labels found: ", len(labels))
+print("Number of unique characters: ", len(characters))
+print("Characters present: ", characters)
+
+# Batch size for training and validation
+batch_size = 16
+
+# Desired image dimensions
+img_width = 200
+img_height = 50
+
+# Factor by which the image is going to be downsampled
+# by the convolutional blocks. We will be using two
+# convolution blocks, and each block will have
+# a pooling layer which downsamples the features by a factor of 2.
+# Hence, the total downsampling factor is 4.
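+# For example, our 200x50 inputs yield 50x12 feature maps
+# (with 64 channels) after the two convolution blocks.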
+downsample_factor = 4
+
+# Maximum length of any captcha in the dataset
+max_length = max([len(label) for label in labels])
+
+
+"""
+## Preprocessing
+"""
+
+
+# Mapping characters to numbers
+char_to_num = layers.experimental.preprocessing.StringLookup(
+ vocabulary=list(characters), num_oov_indices=0, mask_token=None
+)
+
+# Mapping numbers back to original characters
+num_to_char = layers.experimental.preprocessing.StringLookup(
+ vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
+)
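+
+
+"""
+As a quick sanity check (purely illustrative, not required for training), we can
+round-trip the first label through the two lookup layers; the decoded string
+should match the original label.
+"""
+
+sample_label = labels[0]
+encoded_label = char_to_num(tf.strings.unicode_split(sample_label, input_encoding="UTF-8"))
+decoded_label = tf.strings.reduce_join(num_to_char(encoded_label)).numpy().decode("utf-8")
+print(sample_label, "->", encoded_label.numpy(), "->", decoded_label)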
+
+
+def split_data(images, labels, train_size=0.9, shuffle=True):
+ # 1. Get the total size of the dataset
+ size = len(images)
+ # 2. Make an indices array and shuffle it, if required
+ indices = np.arange(size)
+ if shuffle:
+ np.random.shuffle(indices)
+ # 3. Get the size of training samples
+ train_samples = int(size * train_size)
+ # 4. Split data into training and validation sets
+ x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
+ x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
+ return x_train, x_valid, y_train, y_valid
+
+
+# Splitting data into training and validation sets
+x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels))
+
+
+def encode_single_sample(img_path, label):
+ # 1. Read image
+ img = tf.io.read_file(img_path)
+ # 2. Decode and convert to grayscale
+ img = tf.io.decode_png(img, channels=1)
+ # 3. Convert to float32 in [0, 1] range
+ img = tf.image.convert_image_dtype(img, tf.float32)
+ # 4. Resize to the desired size
+ img = tf.image.resize(img, [img_height, img_width])
+ # 5. Transpose the image because we want the time
+ # dimension to correspond to the width of the image.
+ img = tf.transpose(img, perm=[1, 0, 2])
+ # 6. Map the characters in label to numbers
+ label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
+ # 7. Return a dict as our model is expecting two inputs
+ return {"image": img, "label": label}
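+
+
+"""
+Since every step in `encode_single_sample` is a TensorFlow op, we can also call the
+function eagerly on a single sample to verify the output (again, purely illustrative):
+"""
+
+example = encode_single_sample(x_train[0], y_train[0])
+print(example["image"].shape, example["label"].numpy())  # (200, 50, 1) and the encoded label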
+
+
+"""
+## Data Generators
+"""
+
+
+train_data_generator = tf.data.Dataset.from_tensor_slices((x_train, y_train))
+train_data_generator = (
+ train_data_generator.map(
+ encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
+ )
+ .batch(batch_size)
+ .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+)
+
+valid_data_generator = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
+valid_data_generator = (
+ valid_data_generator.map(
+ encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
+ )
+ .batch(batch_size)
+ .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+)
+
+"""
+## Visualize the data
+"""
+
+
+_, ax = plt.subplots(4, 4, figsize=(10, 5))
+for batch in train_data_generator.take(1):
+ images = batch["image"]
+ labels = batch["label"]
+ for i in range(16):
+ img = (images[i] * 255).numpy().astype("uint8")
+ label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode("utf-8")
+ ax[i // 4, i % 4].imshow(img[:, :, 0].T, cmap="gray")
+ ax[i // 4, i % 4].set_title(label)
+ ax[i // 4, i % 4].axis("off")
+plt.show()
+
+"""
+## Model
+"""
+
+
+class CTCLayer(layers.Layer):
+ def __init__(self, name=None):
+ super().__init__(name=name)
+ self.loss_fn = keras.backend.ctc_batch_cost
+
+ def call(self, y_true, y_pred):
+ # Compute the training-time loss value and add it
+ # to the layer using `self.add_loss()`.
+ batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
+ input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
+ label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
+
+ input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+ label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
+
+ loss = self.loss_fn(y_true, y_pred, input_length, label_length)
+ self.add_loss(loss)
+
+        # At test time, just return the computed predictions
+ return y_pred
+
+
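+"""
+To make the tensor shapes concrete, here is a standalone call to
+`keras.backend.ctc_batch_cost` on dummy data, mirroring what `CTCLayer` computes
+internally. This is a purely illustrative sketch: the batch size of 2, the 50
+timesteps (`img_width // downsample_factor`), and the random values are
+assumptions for demonstration only.
+"""
+
+dummy_pred = tf.nn.softmax(tf.random.normal((2, 50, len(characters) + 1)))
+dummy_true = tf.random.uniform((2, max_length), minval=0, maxval=len(characters), dtype=tf.int32)
+dummy_input_length = tf.fill((2, 1), 50)  # each sample has 50 prediction timesteps
+dummy_label_length = tf.fill((2, 1), max_length)  # each dummy label uses all positions
+dummy_loss = keras.backend.ctc_batch_cost(
+    dummy_true, dummy_pred, dummy_input_length, dummy_label_length
+)
+print(dummy_loss.shape)  # one CTC loss value per sample: (2, 1)
+
+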
+def build_model():
+ # Inputs to the model
+ input_img = layers.Input(
+ shape=(img_width, img_height, 1), name="image", dtype="float32"
+ )
+ labels = layers.Input(name="label", shape=(None,), dtype="float32")
+
+ # First conv block
+ x = layers.Conv2D(
+ 32,
+ (3, 3),
+ activation="relu",
+ kernel_initializer="he_normal",
+ padding="same",
+ name="Conv1",
+ )(input_img)
+ x = layers.MaxPooling2D((2, 2), name="pool1")(x)
+
+ # Second conv block
+ x = layers.Conv2D(
+ 64,
+ (3, 3),
+ activation="relu",
+ kernel_initializer="he_normal",
+ padding="same",
+ name="Conv2",
+ )(x)
+ x = layers.MaxPooling2D((2, 2), name="pool2")(x)
+
+    # We have used two max-pooling layers with pool size and strides 2.
+    # Hence, downsampled feature maps are 4x smaller. The number of
+    # filters in the last layer is 64. Reshape accordingly before
+    # passing the output to the RNN part of the model.
+ new_shape = ((img_width // 4), (img_height // 4) * 64)
+ x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
+ x = layers.Dense(64, activation="relu", name="dense1")(x)
+ x = layers.Dropout(0.2)(x)
+
+ # RNNs
+ x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2))(x)
+ x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
+
+ # Output layer
+ x = layers.Dense(len(characters) + 1, activation="softmax", name="dense2")(x)
+
+ # Add CTC layer for calculating CTC loss at each step
+ output = CTCLayer(name="ctc_loss")(labels, x)
+
+ # Define the model
+ model = keras.models.Model(
+ inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
+ )
+ # Optimizer
+ opt = keras.optimizers.Adam()
+ # Compile the model and return
+ model.compile(optimizer=opt)
+ return model
+
+
+# Get the model
+model = build_model()
+model.summary()
+
+"""
+## Training
+"""
+
+
+epochs = 100
+es_patience = 10
+# Add early stopping
+es = keras.callbacks.EarlyStopping(
+ monitor="val_loss", patience=es_patience, restore_best_weights=True
+)
+
+# Train the model
+history = model.fit(
+ train_data_generator,
+ validation_data=valid_data_generator,
+ epochs=epochs,
+ callbacks=[es],
+)
+
+
+"""
+## Let's test-drive it
+"""
+
+
+# Get the prediction model by extracting layers up to the output layer
+prediction_model = keras.models.Model(
+ model.get_layer(name="image").input, model.get_layer(name="dense2").output
+)
+prediction_model.summary()
+
+# A utility function to decode the output of the network
+def decode_batch_predictions(pred):
+ input_len = np.ones(pred.shape[0]) * pred.shape[1]
+ # Use greedy search. For complex tasks, you can use beam search
+ results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
+ :, :max_length
+ ]
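+    # Alternatively, CTC beam search could be used here (a sketch; the
+    # `beam_width` value is an illustrative choice, not part of this example):
+    # results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=False, beam_width=100)[0][0][:, :max_length]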
+ # Iterate over the results and get back the text
+ output_text = []
+ for res in results:
+ res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8")
+ output_text.append(res)
+ return output_text
+
+
+# Let's check results on some validation samples
+for batch in valid_data_generator.take(1):
+ batch_images = batch["image"]
+ batch_labels = batch["label"]
+
+ preds = prediction_model.predict(batch_images)
+ pred_texts = decode_batch_predictions(preds)
+
+ orig_texts = []
+ for label in batch_labels:
+ label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8")
+ orig_texts.append(label)
+
+ _, ax = plt.subplots(4, 4, figsize=(15, 5))
+ for i in range(len(pred_texts)):
+ img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
+ img = img.T
+ title = f"Prediction: {pred_texts[i]}"
+ ax[i // 4, i % 4].imshow(img, cmap="gray")
+ ax[i // 4, i % 4].set_title(title)
+ ax[i // 4, i % 4].axis("off")
+plt.show()
diff --git a/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png b/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png
new file mode 100644
index 0000000000..2479339d62
Binary files /dev/null and b/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png differ
diff --git a/examples/vision/img/captcha_ocr/captcha_ocr_19_1.png b/examples/vision/img/captcha_ocr/captcha_ocr_19_1.png
new file mode 100644
index 0000000000..251b48e2ad
Binary files /dev/null and b/examples/vision/img/captcha_ocr/captcha_ocr_19_1.png differ
diff --git a/examples/vision/ipynb/captcha_ocr.ipynb b/examples/vision/ipynb/captcha_ocr.ipynb
new file mode 100644
index 0000000000..d116567d0b
--- /dev/null
+++ b/examples/vision/ipynb/captcha_ocr.ipynb
@@ -0,0 +1,506 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "# OCR model for reading captcha\n",
+ "\n",
+    "**Author:** [A_K_Nain](https://twitter.com/A_K_Nain)<br>\n",
+    "**Date created:** 2020/06/14<br>\n",
+    "**Last modified:** 2020/06/14<br>\n",
+ "**Description:** How to implement an OCR model using CNNs, RNNs and CTC loss."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Introduction\n",
+ "\n",
+    "This example demonstrates a simple OCR model built with the Functional API. Apart from\n",
+    "combining CNNs and RNNs, it also illustrates how you can instantiate a new layer\n",
+    "and use it as an `Endpoint` layer for implementing CTC loss. For a detailed\n",
+    "description of layer subclassing, please check out this\n",
+    "[example](https://keras.io/guides/making_new_layers_and_models_via_subclassing/#the-addmetric-method)\n",
+    "in the developer guides."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "from pathlib import Path\n",
+ "from collections import Counter\n",
+ "\n",
+ "import tensorflow as tf\n",
+ "from tensorflow import keras\n",
+ "from tensorflow.keras import layers\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)\n",
+ "Let's download the data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "!curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip\n",
+ "!unzip -qq captcha_images_v2.zip"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+    "The dataset contains 1040 captcha files as PNG images. The label for each sample is a\n",
+    "string: the name of the file (excluding the `.png` extension). We will map each\n",
+    "character in the string to an integer for training the model. Similarly, we will\n",
+    "need to map the predictions of the model back to strings. For this purpose, we will\n",
+    "maintain two lookup tables, mapping characters to integers and integers back to\n",
+    "characters, respectively."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Path to the data directory\n",
+ "data_dir = Path(\"./captcha_images_v2/\")\n",
+ "\n",
+ "# Get list of all the images\n",
+ "images = sorted(list(map(str, list(data_dir.glob(\"*.png\")))))\n",
+ "labels = [img.split(os.path.sep)[-1].split(\".png\")[0] for img in images]\n",
+ "characters = set(char for label in labels for char in label)\n",
+ "\n",
+ "print(\"Number of images found: \", len(images))\n",
+ "print(\"Number of labels found: \", len(labels))\n",
+ "print(\"Number of unique characters: \", len(characters))\n",
+ "print(\"Characters present: \", characters)\n",
+ "\n",
+ "# Batch size for training and validation\n",
+ "batch_size = 16\n",
+ "\n",
+ "# Desired image dimensions\n",
+ "img_width = 200\n",
+ "img_height = 50\n",
+ "\n",
+    "# Factor by which the image is going to be downsampled\n",
+    "# by the convolutional blocks. We will be using two\n",
+    "# convolution blocks, and each block will have\n",
+    "# a pooling layer which downsamples the features by a factor of 2.\n",
+    "# Hence, the total downsampling factor is 4.\n",
+ "downsample_factor = 4\n",
+ "\n",
+ "# Maximum length of any captcha in the dataset\n",
+ "max_length = max([len(label) for label in labels])\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Preprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Mapping characters to numbers\n",
+ "char_to_num = layers.experimental.preprocessing.StringLookup(\n",
+ " vocabulary=list(characters), num_oov_indices=0, mask_token=None\n",
+ ")\n",
+ "\n",
+ "# Mapping numbers back to original characters\n",
+ "num_to_char = layers.experimental.preprocessing.StringLookup(\n",
+ " vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True\n",
+ ")\n",
+ "\n",
+ "\n",
+ "def split_data(images, labels, train_size=0.9, shuffle=True):\n",
+ " # 1. Get the total size of the dataset\n",
+ " size = len(images)\n",
+ " # 2. Make an indices array and shuffle it, if required\n",
+ " indices = np.arange(size)\n",
+ " if shuffle:\n",
+ " np.random.shuffle(indices)\n",
+ " # 3. Get the size of training samples\n",
+ " train_samples = int(size * train_size)\n",
+ " # 4. Split data into training and validation sets\n",
+ " x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]\n",
+ " x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]\n",
+ " return x_train, x_valid, y_train, y_valid\n",
+ "\n",
+ "\n",
+ "# Splitting data into training and validation sets\n",
+ "x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels))\n",
+ "\n",
+ "\n",
+ "def encode_single_sample(img_path, label):\n",
+ " # 1. Read image\n",
+ " img = tf.io.read_file(img_path)\n",
+ " # 2. Decode and convert to grayscale\n",
+ " img = tf.io.decode_png(img, channels=1)\n",
+ " # 3. Convert to float32 in [0, 1] range\n",
+ " img = tf.image.convert_image_dtype(img, tf.float32)\n",
+ " # 4. Resize to the desired size\n",
+ " img = tf.image.resize(img, [img_height, img_width])\n",
+ " # 5. Transpose the image because we want the time\n",
+ " # dimension to correspond to the width of the image.\n",
+ " img = tf.transpose(img, perm=[1, 0, 2])\n",
+ " # 6. Map the characters in label to numbers\n",
+ " label = char_to_num(tf.strings.unicode_split(label, input_encoding=\"UTF-8\"))\n",
+ " # 7. Return a dict as our model is expecting two inputs\n",
+ " return {\"image\": img, \"label\": label}\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Data Generators"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "train_data_generator = tf.data.Dataset.from_tensor_slices((x_train, y_train))\n",
+ "train_data_generator = (\n",
+ " train_data_generator.map(\n",
+ " encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE\n",
+ " )\n",
+ " .batch(batch_size)\n",
+ " .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)\n",
+ ")\n",
+ "\n",
+ "valid_data_generator = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))\n",
+ "valid_data_generator = (\n",
+ " valid_data_generator.map(\n",
+ " encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE\n",
+ " )\n",
+ " .batch(batch_size)\n",
+ " .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Visualize the data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "_, ax = plt.subplots(4, 4, figsize=(10, 5))\n",
+ "for batch in train_data_generator.take(1):\n",
+ " images = batch[\"image\"]\n",
+ " labels = batch[\"label\"]\n",
+ " for i in range(16):\n",
+ " img = (images[i] * 255).numpy().astype(\"uint8\")\n",
+ " label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode(\"utf-8\")\n",
+ " ax[i // 4, i % 4].imshow(img[:, :, 0].T, cmap=\"gray\")\n",
+ " ax[i // 4, i % 4].set_title(label)\n",
+ " ax[i // 4, i % 4].axis(\"off\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "class CTCLayer(layers.Layer):\n",
+ " def __init__(self, name=None):\n",
+ " super().__init__(name=name)\n",
+ " self.loss_fn = keras.backend.ctc_batch_cost\n",
+ "\n",
+ " def call(self, y_true, y_pred):\n",
+ " # Compute the training-time loss value and add it\n",
+ " # to the layer using `self.add_loss()`.\n",
+ " batch_len = tf.cast(tf.shape(y_true)[0], dtype=\"int64\")\n",
+ " input_length = tf.cast(tf.shape(y_pred)[1], dtype=\"int64\")\n",
+ " label_length = tf.cast(tf.shape(y_true)[1], dtype=\"int64\")\n",
+ "\n",
+ " input_length = input_length * tf.ones(shape=(batch_len, 1), dtype=\"int64\")\n",
+ " label_length = label_length * tf.ones(shape=(batch_len, 1), dtype=\"int64\")\n",
+ "\n",
+ " loss = self.loss_fn(y_true, y_pred, input_length, label_length)\n",
+ " self.add_loss(loss)\n",
+ "\n",
+    "        # At test time, just return the computed predictions\n",
+ " return y_pred\n",
+ "\n",
+ "\n",
+ "def build_model():\n",
+ " # Inputs to the model\n",
+ " input_img = layers.Input(\n",
+ " shape=(img_width, img_height, 1), name=\"image\", dtype=\"float32\"\n",
+ " )\n",
+ " labels = layers.Input(name=\"label\", shape=(None,), dtype=\"float32\")\n",
+ "\n",
+ " # First conv block\n",
+ " x = layers.Conv2D(\n",
+ " 32,\n",
+ " (3, 3),\n",
+ " activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\",\n",
+ " padding=\"same\",\n",
+ " name=\"Conv1\",\n",
+ " )(input_img)\n",
+ " x = layers.MaxPooling2D((2, 2), name=\"pool1\")(x)\n",
+ "\n",
+ " # Second conv block\n",
+ " x = layers.Conv2D(\n",
+ " 64,\n",
+ " (3, 3),\n",
+ " activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\",\n",
+ " padding=\"same\",\n",
+ " name=\"Conv2\",\n",
+ " )(x)\n",
+ " x = layers.MaxPooling2D((2, 2), name=\"pool2\")(x)\n",
+ "\n",
+    "    # We have used two max-pooling layers with pool size and strides 2.\n",
+    "    # Hence, downsampled feature maps are 4x smaller. The number of\n",
+    "    # filters in the last layer is 64. Reshape accordingly before\n",
+    "    # passing the output to the RNN part of the model.\n",
+ " new_shape = ((img_width // 4), (img_height // 4) * 64)\n",
+ " x = layers.Reshape(target_shape=new_shape, name=\"reshape\")(x)\n",
+ " x = layers.Dense(64, activation=\"relu\", name=\"dense1\")(x)\n",
+ " x = layers.Dropout(0.2)(x)\n",
+ "\n",
+ " # RNNs\n",
+ " x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.2))(x)\n",
+ " x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)\n",
+ "\n",
+ " # Output layer\n",
+ " x = layers.Dense(len(characters) + 1, activation=\"softmax\", name=\"dense2\")(x)\n",
+ "\n",
+ " # Add CTC layer for calculating CTC loss at each step\n",
+ " output = CTCLayer(name=\"ctc_loss\")(labels, x)\n",
+ "\n",
+ " # Define the model\n",
+ " model = keras.models.Model(\n",
+ " inputs=[input_img, labels], outputs=output, name=\"ocr_model_v1\"\n",
+ " )\n",
+ " # Optimizer\n",
+ " opt = keras.optimizers.Adam()\n",
+ " # Compile the model and return\n",
+ " model.compile(optimizer=opt)\n",
+ " return model\n",
+ "\n",
+ "\n",
+ "# Get the model\n",
+ "model = build_model()\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "epochs = 100\n",
+ "es_patience = 10\n",
+ "# Add early stopping\n",
+ "es = keras.callbacks.EarlyStopping(\n",
+ " monitor=\"val_loss\", patience=es_patience, restore_best_weights=True\n",
+ ")\n",
+ "\n",
+ "# Train the model\n",
+ "history = model.fit(\n",
+ " train_data_generator,\n",
+ " validation_data=valid_data_generator,\n",
+ " epochs=epochs,\n",
+ " callbacks=[es],\n",
+ ")\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "colab_type": "text"
+ },
+ "source": [
+ "## Let's test-drive it"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 0,
+ "metadata": {
+ "colab_type": "code"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+    "# Get the prediction model by extracting layers up to the output layer\n",
+ "prediction_model = keras.models.Model(\n",
+ " model.get_layer(name=\"image\").input, model.get_layer(name=\"dense2\").output\n",
+ ")\n",
+ "prediction_model.summary()\n",
+ "\n",
+ "# A utility function to decode the output of the network\n",
+ "def decode_batch_predictions(pred):\n",
+ " input_len = np.ones(pred.shape[0]) * pred.shape[1]\n",
+ " # Use greedy search. For complex tasks, you can use beam search\n",
+ " results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][\n",
+ " :, :max_length\n",
+ " ]\n",
+ " # Iterate over the results and get back the text\n",
+ " output_text = []\n",
+ " for res in results:\n",
+ " res = tf.strings.reduce_join(num_to_char(res)).numpy().decode(\"utf-8\")\n",
+ " output_text.append(res)\n",
+ " return output_text\n",
+ "\n",
+ "\n",
+ "# Let's check results on some validation samples\n",
+ "for batch in valid_data_generator.take(1):\n",
+ " batch_images = batch[\"image\"]\n",
+ " batch_labels = batch[\"label\"]\n",
+ "\n",
+ " preds = prediction_model.predict(batch_images)\n",
+ " pred_texts = decode_batch_predictions(preds)\n",
+ "\n",
+ " orig_texts = []\n",
+ " for label in batch_labels:\n",
+ " label = tf.strings.reduce_join(num_to_char(label)).numpy().decode(\"utf-8\")\n",
+ " orig_texts.append(label)\n",
+ "\n",
+ " _, ax = plt.subplots(4, 4, figsize=(15, 5))\n",
+ " for i in range(len(pred_texts)):\n",
+ " img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)\n",
+ " img = img.T\n",
+ " title = f\"Prediction: {pred_texts[i]}\"\n",
+ " ax[i // 4, i % 4].imshow(img, cmap=\"gray\")\n",
+ " ax[i // 4, i % 4].set_title(title)\n",
+ " ax[i // 4, i % 4].axis(\"off\")\n",
+ "plt.show()"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "captcha_ocr",
+ "private_outputs": false,
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/examples/vision/md/captcha_ocr.md b/examples/vision/md/captcha_ocr.md
new file mode 100644
index 0000000000..8fbe6545d3
--- /dev/null
+++ b/examples/vision/md/captcha_ocr.md
@@ -0,0 +1,652 @@
+# OCR model for reading captcha
+
+**Author:** [A_K_Nain](https://twitter.com/A_K_Nain)
+**Date created:** 2020/06/14
+**Last modified:** 2020/06/14
+**Description:** How to implement an OCR model using CNNs, RNNs and CTC loss.
+
+
+ [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/vision/ipynb/captcha_ocr.ipynb) • [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/vision/captcha_ocr.py)
+
+
+
+---
+## Introduction
+
+This example demonstrates a simple OCR model built with the Functional API. Apart from
+combining CNNs and RNNs, it also illustrates how you can instantiate a new layer
+and use it as an `Endpoint` layer for implementing CTC loss. For a detailed
+description of layer subclassing, please check out this
+[example](https://keras.io/guides/making_new_layers_and_models_via_subclassing/#the-addmetric-method)
+in the developer guides.
+
+---
+## Setup
+
+
+```python
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+from pathlib import Path
+from collections import Counter
+
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+
+```
+
+---
+## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images)
+Let's download the data.
+
+
+```python
+!curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
+!unzip -qq captcha_images_v2.zip
+```
+
+