In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# Directory containing the CAPTCHA PNG files; labels are derived from the file names below.
data_dir = Path("./captcha_images_v2/")

# Image paths as strings, sorted so the file order is the same on every run.
images = sorted(str(path) for path in data_dir.glob("*.png"))
# Label of each image: its file name up to the first ".png".
labels = [os.path.basename(img).split(".png")[0] for img in images]
# Distinct characters across all labels, sorted so the vocabulary order (and
# therefore each character's integer index) is identical on every run. A bare
# set of strings iterates in a hash-dependent order that changes between
# interpreter sessions, which would silently remap indices after a restart.
characters = sorted(set(char for label in labels for char in label))

In [3]:
# Number of samples grouped into each dataset batch.
batch_size = 16

# Target size (in pixels) that every image is resized to during preprocessing.
img_width, img_height = 200, 50

In [4]:
# NOTE(review): not referenced in the cells shown here; presumably the factor by
# which the model's convolutional blocks shrink the image width - confirm at its point of use.
downsample_factor = 4

In [5]:
# Length of the longest label found in the dataset.
max_length = max(map(len, labels))

Preprocessing

In [6]:
# Forward lookup layer: character -> integer index (no mask token is reserved).
char_to_num = layers.StringLookup(
    vocabulary=list(characters),
    mask_token=None,
)

In [7]:
# Inverse lookup layer: integer index -> original character. It is built from
# char_to_num's own vocabulary so the two mappings stay consistent.
num_to_char = layers.StringLookup(
    invert=True, mask_token=None, vocabulary=char_to_num.get_vocabulary()
)

In [8]:
def split_data(images, labels, train_size=0.9, shuffle=True, seed=None):
    """Split paired images and labels into training and validation subsets.

    Args:
        images: Sequence or array of samples (image paths in this notebook).
        labels: Sequence or array of labels aligned one-to-one with ``images``.
        train_size: Fraction of samples assigned to the training subset; the
            training count is ``int(size * train_size)`` (truncated).
        shuffle: If True, the samples are randomly permuted before splitting.
        seed: Optional seed for a reproducible shuffle. When None, the global
            NumPy RNG is used, so an earlier ``np.random.seed`` call still
            governs the result exactly as before.

    Returns:
        Tuple ``(x_train, x_valid, y_train, y_valid)`` of NumPy arrays.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """
    # Index-array selection below needs ndarrays; plain lists would raise TypeError.
    images = np.asarray(images)
    labels = np.asarray(labels)
    if len(images) != len(labels):
        raise ValueError(
            f"images and labels must have the same length, "
            f"got {len(images)} and {len(labels)}"
        )

    # 1. Get the total size of the dataset
    size = len(images)
    # 2. Make an indices array and shuffle it, if required
    indices = np.arange(size)
    if shuffle:
        if seed is None:
            np.random.shuffle(indices)
        else:
            # Dedicated generator: reproducible without touching global RNG state.
            np.random.default_rng(seed).shuffle(indices)
    # 3. Get the size of training samples
    train_samples = int(size * train_size)
    # 4. Split data into training and validation sets; the same index arrays
    #    are applied to both inputs so image/label pairs stay aligned.
    train_idx, valid_idx = indices[:train_samples], indices[train_samples:]
    x_train, y_train = images[train_idx], labels[train_idx]
    x_valid, y_valid = images[valid_idx], labels[valid_idx]
    return x_train, x_valid, y_train, y_valid


In [9]:
# Shuffled split using split_data's defaults (train_size=0.9, shuffle=True).
x_train, x_valid, y_train, y_valid = split_data(
    images=np.array(images), labels=np.array(labels)
)

In [10]:
def encode_single_sample(img_path, label):
    """Load one image file and encode its label for the input pipeline.

    Args:
        img_path: Path of the PNG file to read.
        label: Label string for that image.

    Returns:
        Dict with two entries:
        ``"image"``: float32 tensor laid out as (img_width, img_height, 1),
        i.e. transposed so the first axis runs along the image width, which
        is the axis treated as the time dimension.
        ``"label"``: the label's characters mapped to integers by ``char_to_num``.
        A dict is returned because, per the original author's note, the model
        expects two inputs.
    """
    raw_bytes = tf.io.read_file(img_path)
    # Decode as a single-channel (grayscale) image.
    pixels = tf.io.decode_png(raw_bytes, channels=1)
    # Convert to float32 in the [0, 1] range.
    pixels = tf.image.convert_image_dtype(pixels, tf.float32)
    pixels = tf.image.resize(pixels, [img_height, img_width])
    # Swap height and width so that width comes first.
    pixels = tf.transpose(pixels, perm=[1, 0, 2])
    # Split the label into individual characters before the integer lookup.
    label_chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    return {"image": pixels, "label": char_to_num(label_chars)}

In [12]:
# Training input pipeline: pair each path with its label, encode samples in
# parallel, group them into batches, and prefetch upcoming batches.
# NOTE(review): plain .batch() presumably relies on every encoded label having
# the same length - confirm against the dataset.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)