# OCR (Optical Character Recognition)

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf 
import os 
from pathlib import Path

## Load Data

In [41]:
# !curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip
# !unzip -qq captcha_images_v2.zip

The dataset contains 1040 captcha files as `png` images. The label for each sample is a string,
the name of the file (minus the file extension).
We will map each character in the string to an integer for training the model. Similarly,
we will need to map the predictions of the model back to strings. For this purpose
we will maintain two dictionaries, mapping characters to integers, and integers to characters,
respectively.

In [42]:
data_dir = Path('./captcha_images_v2')

# Collect every PNG once, ordered by its string form, so that images[i] and
# labels[i] are always derived from the same file.
image_paths = sorted(data_dir.glob('*.png'), key=str)

# Get List of All Images (as plain strings, which is what tf.io.read_file takes)
images = [str(path) for path in image_paths]
print(images[:5])

# Get List of All Labels: the file name without its extension.
# Path.stem removes only the final suffix, unlike str.replace('.png', ''),
# which would also delete a ".png" occurring in the middle of a name.
labels = [path.stem for path in image_paths]
print(labels[:5])

# Get List of All Characters (sorted so the vocabulary order is deterministic)
characters = sorted(set(''.join(labels)))

print("Number of images found: ", len(images))
print("Number of labels found: ", len(labels))
print("Number of unique characters: ", len(characters))
print("Characters present: ", characters)

['captcha_images_v2/226md.png', 'captcha_images_v2/22d5n.png', 'captcha_images_v2/2356g.png', 'captcha_images_v2/23mdg.png', 'captcha_images_v2/23n88.png']
['226md', '22d5n', '2356g', '23mdg', '23n88']
Number of images found:  1040
Number of labels found:  1040
Number of unique characters:  19
Characters present:  ['2', '3', '4', '5', '6', '7', '8', 'b', 'c', 'd', 'e', 'f', 'g', 'm', 'n', 'p', 'w', 'x', 'y']


In [43]:
# Batch size, and the fixed width/height every image is resized to.
batch_size = 16
img_width = 200
img_height = 50

# The plan is to use two convolution blocks, each ending in a pooling layer
# that downsamples the features by a factor of 2, so the total
# downsampling factor is 2 * 2 = 4.
downsample_factor = 4

# Length of the longest label in the dataset.
max_length = max(map(len, labels))
max_length

5

## Pre-Processing

In [44]:
# Mapping Characters to Integers
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=list(characters), mask_token=None
)

# Mapping Integers back to Original Characters
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

In [55]:
def split_data(images, labels, train_size=0.9, shuffle=True, seed=None):
    """Split parallel image/label sequences into training and validation sets.

    Args:
        images: Sequence (list or NumPy array) of samples, e.g. image paths.
        labels: Sequence of labels, same length and order as ``images``.
        train_size: Fraction of the samples assigned to the training set.
        shuffle: If True, samples are shuffled before splitting.
        seed: Optional seed forwarded to ``tf.keras.random.shuffle`` so the
            split can be made reproducible. Ignored when ``shuffle`` is False.

    Returns:
        Tuple ``(X_train, Y_train, x_val, y_val)`` of NumPy arrays.

    Raises:
        ValueError: If ``images`` and ``labels`` differ in length.
    """
    # Selecting with an array of indices only works on NumPy arrays; a plain
    # Python list raises TypeError when indexed that way.
    images = np.asarray(images)
    labels = np.asarray(labels)

    size = len(images)
    if len(labels) != size:
        raise ValueError(
            f"images and labels must have the same length, got {size} and {len(labels)}"
        )

    indices = np.arange(size)
    if shuffle:
        # Convert the shuffled tensor back to NumPy so it can index the arrays.
        indices = np.asarray(tf.keras.random.shuffle(indices, seed=seed))

    train_samples = int(size * train_size)
    train_idx, val_idx = indices[:train_samples], indices[train_samples:]

    X_train, Y_train = images[train_idx], labels[train_idx]
    x_val, y_val = images[val_idx], labels[val_idx]

    return X_train, Y_train, x_val, y_val


In [54]:
# Scratch check of the shuffle-then-slice logic on a toy range of 100 items.
n_items, train_fraction = 100, 0.9

indices = tf.keras.ops.arange(n_items)
print(indices)

indices = tf.keras.random.shuffle(indices)
print(indices)

# Number of items that fall into the training part.
sample = int(n_items * train_fraction)
print(sample)

# Everything after the first `sample` shuffled indices is the remainder.
indices[sample:]

tf.Tensor(
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99], shape=(100,), dtype=int32)
tf.Tensor(
[84 41  8 31 93 26 78 17 18 28 29 72 23 98 15 70 53 73 57 83 71 20  2 61
 74 13  0 43 87 24 56 58 32  1 44 14 54 51 89 50 67 30 90 27 66 80 38 45
 22 25 92 64 99 36 68 12 94 48 81 96 65 69 55 42 33 21 79 39  9 62 19 77
  5 60 91 40 52 76 75  7  3 59 46 47 95 35 10 37  4 97 34 88 82 63 86 49
  6 85 16 11], shape=(100,), dtype=int32)
90


<tf.Tensor: shape=(10,), dtype=int32, numpy=array([34, 88, 82, 63, 86, 49,  6, 85, 16, 11], dtype=int32)>

In [72]:
# Scratch run of the preprocessing steps on the first image.
raw_bytes = tf.io.read_file(images[0])
decoded = tf.image.decode_png(raw_bytes, channels=1)  # single (grayscale) channel
as_float = tf.image.convert_image_dtype(decoded, tf.float32)

img = tf.keras.ops.image.resize(as_float, [img_height, img_width])
print(img.shape)

# Swap the first two axes (height and width).
img = tf.keras.ops.transpose(img, axes=[1, 0, 2])
print(img.shape)

# Encode a sample string character by character.
sample_chars = tf.strings.unicode_split('ax3h9k', input_encoding='UTF-8')
label = char_to_num(sample_chars)

print(label.numpy())


(50, 200, 1)
(200, 50, 1)
[ 0 18  2  0  0  0]


In [73]:
def encode_single_sample(img_path, label):
    """Load one image and encode its label for training.

    Args:
        img_path: Path of the PNG file to read.
        label: The sample's label string; it is split into characters and
            mapped to integer ids with ``char_to_num``.

    Returns:
        dict with keys ``'image'`` (float32 tensor resized to
        ``[img_height, img_width]`` and then transposed so the width axis
        comes first) and ``'label'`` (tensor of integer character ids).
    """
    # Read the file that was passed in, not a fixed entry of the global
    # `images` list, so every sample yields its own image.
    img = tf.io.read_file(img_path)

    # Decode as a single-channel (grayscale) image.
    img = tf.image.decode_png(img, channels=1)

    # Convert to float32.
    img = tf.image.convert_image_dtype(img, tf.float32)

    img = tf.keras.ops.image.resize(img, [img_height, img_width])

    # Swap height and width.
    # NOTE(review): presumably so width acts as the sequence axis; confirm against the model.
    img = tf.keras.ops.transpose(img, axes=[1, 0, 2])

    # Encode the label that belongs to this sample, not a hard-coded string.
    label = char_to_num(tf.strings.unicode_split(label, input_encoding='UTF-8'))

    return {'image': img, 'label': label}

## Create Datasets

In [None]:
#